From e181090480b14079a71bd6670d432c054dad64a5 Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Sat, 9 May 2026 02:09:33 -0400 Subject: [PATCH 1/6] Wire SessionStorage into MCPRemoteProxy for HA support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MCPRemoteProxy runs with multiple replicas behind a load balancer that doesn't preserve client-IP affinity (e.g. AWS ALB across multiple AZs), every non-initialize request fails with `Session not found` because the transparent proxy validates `Mcp-Session-Id` against pod-local in-memory state on every hop. From transparent_proxy.go: // Guard: reject non-initialize requests with unknown session IDs. // When multiple proxyrunner replicas share a Redis session store, // a valid session will always be found. The transport layer already supports a Redis-backed session store via runner.ScalingConfig.SessionRedis — MCPServer and VirtualMCPServer wire it through. MCPRemoteProxy simply never populated it. This change ports the symmetric work from MCPServer (PR #4368) and VirtualMCPServer (PR #4367) to MCPRemoteProxy: - Add MCPRemoteProxySpec.SessionStorage field (same SessionStorageConfig shape used by MCPServer / VirtualMCPServer) - populateScalingConfigForRemoteProxy: write the non-sensitive Redis parameters (address/db/keyPrefix) into runner.ScalingConfig.SessionRedis - buildRedisPasswordEnvVarForRemoteProxy: inject THV_SESSION_REDIS_PASSWORD on the proxy Deployment via SecretKeyRef when sessionStorage.passwordRef is set, so the password never lands in the RunConfig ConfigMap Tests: - TestPopulateScalingConfigForRemoteProxy mirrors TestPopulateScalingConfig from mcpserver_runconfig_test.go (4 cases including a check that the password never leaks into the serialized SessionRedis) - TestBuildRedisPasswordEnvVarForRemoteProxy mirrors TestBuildRedisPasswordEnvVar from virtualmcpserver_deployment_test.go (4 cases covering the matrix of nil/memory/redis-no-pwd/redis-with-pwd) Generated: - zz_generated.deepcopy.go (controller-gen v0.17.3) - toolhive.stacklok.dev_mcpremoteproxies.yaml CRD schema (controller-gen v0.17.3) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/v1beta1/mcpremoteproxy_types.go | 16 +++ .../api/v1beta1/zz_generated.deepcopy.go | 5 + .../controllers/mcpremoteproxy_deployment.go | 30 +++++ .../mcpremoteproxy_deployment_test.go | 64 ++++++++++ .../controllers/mcpremoteproxy_runconfig.go | 28 +++++ .../mcpremoteproxy_runconfig_test.go | 86 +++++++++++++ ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 114 ++++++++++++++++++ 7 files changed, 343 insertions(+) diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 3b38d591a7..d19c60b49f 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -136,6 +136,22 @@ type MCPRemoteProxySpec struct { // +kubebuilder:default=ClientIP // +optional SessionAffinity string `json:"sessionAffinity,omitempty"` + + // SessionStorage configures session storage for stateful horizontal scaling. + // When nil, no session storage is configured and the proxy falls back to + // pod-local in-memory session state — incompatible with multi-replica + // deployments behind load balancers that don't preserve client-IP affinity + // (e.g. AWS ALB across multiple AZs). + // + // The transparent proxy validates `Mcp-Session-Id` against this store on + // every non-initialize request (see pkg/transport/proxy/transparent/ + // transparent_proxy.go) and rewrites client-facing session IDs to backend + // session IDs using session metadata. Both lookups require shared state + // across replicas. + // + // Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + // +optional + SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty"` } // MCPRemoteProxyStatus defines the observed state of MCPRemoteProxy diff --git a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go index b27e168797..92f69061a9 100644 --- a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go +++ b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go @@ -1284,6 +1284,11 @@ func (in *MCPRemoteProxySpec) DeepCopyInto(out *MCPRemoteProxySpec) { *out = new(MCPGroupRef) **out = **in } + if in.SessionStorage != nil { + in, out := &in.SessionStorage, &out.SessionStorage + *out = new(SessionStorageConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MCPRemoteProxySpec. diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go index bff48386a3..b2682ea8fc 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go @@ -18,6 +18,7 @@ import ( ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" "github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum" "github.com/stacklok/toolhive/pkg/container/kubernetes" + "github.com/stacklok/toolhive/pkg/transport/session" ) // deploymentForMCPRemoteProxy returns a MCPRemoteProxy Deployment object @@ -232,9 +233,38 @@ func (r *MCPRemoteProxyReconciler) buildEnvVarsForProxy( } } + // Add THV_SESSION_REDIS_PASSWORD when sessionStorage uses a passwordRef. + // The non-sensitive parts (address/db/keyPrefix) are populated into the + // runconfig by populateScalingConfigForRemoteProxy; the password is + // injected separately so it never lands in the ConfigMap. + env = append(env, buildRedisPasswordEnvVarForRemoteProxy(proxy)...) + return ctrlutil.EnsureRequiredEnvVars(ctx, env) } +// buildRedisPasswordEnvVarForRemoteProxy returns the THV_SESSION_REDIS_PASSWORD +// env var sourced from spec.sessionStorage.passwordRef when sessionStorage uses +// the redis provider; returns nil otherwise. Mirrors VirtualMCPServer's +// buildRedisPasswordEnvVar in virtualmcpserver_deployment.go. +func buildRedisPasswordEnvVarForRemoteProxy(proxy *mcpv1beta1.MCPRemoteProxy) []corev1.EnvVar { + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis || + proxy.Spec.SessionStorage.PasswordRef == nil { + return nil + } + return []corev1.EnvVar{{ + Name: session.RedisPasswordEnvVar, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: proxy.Spec.SessionStorage.PasswordRef.Name, + }, + Key: proxy.Spec.SessionStorage.PasswordRef.Key, + }, + }, + }} +} + // buildOIDCClientSecretEnvVars returns OIDC client secret env vars when the proxy // references an MCPOIDCConfig with an inline client secret. Returns nil otherwise. func (r *MCPRemoteProxyReconciler) buildOIDCClientSecretEnvVars( diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go index 91d2e8e462..d65b18628c 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go @@ -30,6 +30,7 @@ import ( mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + "github.com/stacklok/toolhive/pkg/transport/session" ) // TestDeploymentForMCPRemoteProxy tests deployment generation @@ -1305,3 +1306,66 @@ func TestMCPRemoteProxyServiceNeedsUpdate(t *testing.T) { }) } } + +// TestBuildRedisPasswordEnvVarForRemoteProxy mirrors VirtualMCPServer's +// TestBuildRedisPasswordEnvVar — the env var must be injected only when +// sessionStorage uses the redis provider AND a passwordRef is set, and it +// must always be a SecretKeyRef (never a plaintext value). +func TestBuildRedisPasswordEnvVarForRemoteProxy(t *testing.T) { + t.Parallel() + + passwordRef := &mcpv1beta1.SecretKeyRef{Name: "redis-secret", Key: "password"} + + tests := []struct { + name string + storage *mcpv1beta1.SessionStorageConfig + expectEnVar bool + }{ + { + name: "nil sessionStorage produces no env var", + storage: nil, + expectEnVar: false, + }, + { + name: "memory provider produces no env var", + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + expectEnVar: false, + }, + { + name: "redis without passwordRef produces no env var", + storage: &mcpv1beta1.SessionStorageConfig{Provider: mcpv1beta1.SessionStorageProviderRedis, Address: "redis:6379"}, + expectEnVar: false, + }, + { + name: "redis with passwordRef produces THV_SESSION_REDIS_PASSWORD", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + PasswordRef: passwordRef, + }, + expectEnVar: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + proxy := &mcpv1beta1.MCPRemoteProxy{ + ObjectMeta: metav1.ObjectMeta{Name: "test-proxy", Namespace: "default"}, + Spec: mcpv1beta1.MCPRemoteProxySpec{SessionStorage: tc.storage}, + } + env := buildRedisPasswordEnvVarForRemoteProxy(proxy) + if tc.expectEnVar { + require.Len(t, env, 1) + assert.Equal(t, session.RedisPasswordEnvVar, env[0].Name) + assert.Empty(t, env[0].Value, "must not use plaintext Value") + require.NotNil(t, env[0].ValueFrom) + require.NotNil(t, env[0].ValueFrom.SecretKeyRef) + assert.Equal(t, passwordRef.Name, env[0].ValueFrom.SecretKeyRef.Name) + assert.Equal(t, passwordRef.Key, env[0].ValueFrom.SecretKeyRef.Key) + } else { + assert.Empty(t, env) + } + }) + } +} diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go index 65cf8cb81f..f026071f4f 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go @@ -173,9 +173,37 @@ func (r *MCPRemoteProxyReconciler) createRunConfigFromMCPRemoteProxy( return nil, fmt.Errorf("failed to populate middleware configs: %w", err) } + // Populate ScalingConfig.SessionRedis from spec.sessionStorage so the + // proxy runner has the address/db/keyPrefix needed to construct a + // shared Redis-backed session store. The Redis password is intentionally + // excluded here — it is injected as the THV_SESSION_REDIS_PASSWORD env + // var by buildRedisPasswordEnvVar in mcpremoteproxy_deployment.go. + populateScalingConfigForRemoteProxy(runConfig, proxy) + return runConfig, nil } +// populateScalingConfigForRemoteProxy mirrors populateScalingConfig from +// mcpserver_runconfig.go but for MCPRemoteProxy (which has no +// BackendReplicas concept). When MCPRemoteProxy.spec.sessionStorage uses +// the redis provider, this populates runner.ScalingConfig.SessionRedis with +// the non-sensitive connection parameters. +func populateScalingConfigForRemoteProxy(runConfig *runner.RunConfig, proxy *mcpv1beta1.MCPRemoteProxy) { + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis { + return + } + + if runConfig.ScalingConfig == nil { + runConfig.ScalingConfig = &runner.ScalingConfig{} + } + runConfig.ScalingConfig.SessionRedis = &runner.SessionRedisConfig{ + Address: proxy.Spec.SessionStorage.Address, + DB: proxy.Spec.SessionStorage.DB, + KeyPrefix: proxy.Spec.SessionStorage.KeyPrefix, + } +} + // resolveAndAddOIDCConfig resolves OIDC configuration from the shared MCPOIDCConfigRef, // adds the appropriate runner options, and returns the resolved config. func (r *MCPRemoteProxyReconciler) resolveAndAddOIDCConfig( diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go index 40ee973f4b..73e7fe874a 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go @@ -856,3 +856,89 @@ func TestLabelsForRunConfigRemoteProxy(t *testing.T) { result := labelsForRunConfigRemoteProxy("test-proxy") assert.Equal(t, expected, result) } + +// TestPopulateScalingConfigForRemoteProxy tests SessionRedis injection into RunConfig +// from MCPRemoteProxy.spec.sessionStorage. Mirrors TestPopulateScalingConfig in +// mcpserver_runconfig_test.go. MCPRemoteProxy has no BackendReplicas concept (no +// backend StatefulSet), so we only cover the SessionRedis path. +func TestPopulateScalingConfigForRemoteProxy(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + storage *mcpv1beta1.SessionStorageConfig + expected func(t *testing.T, sc *runner.ScalingConfig) + }{ + { + name: "nil sessionStorage — ScalingConfig stays nil", + storage: nil, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + assert.Nil(t, sc) + }, + }, + { + name: "memory provider — ScalingConfig stays nil", + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + assert.Nil(t, sc) + }, + }, + { + name: "redis — address/db/keyPrefix written to SessionRedis", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis.default.svc:6379", + DB: 2, + KeyPrefix: "thv:", + }, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + require.NotNil(t, sc) + require.NotNil(t, sc.SessionRedis) + assert.Equal(t, "redis.default.svc:6379", sc.SessionRedis.Address) + assert.Equal(t, int32(2), sc.SessionRedis.DB) + assert.Equal(t, "thv:", sc.SessionRedis.KeyPrefix) + }, + }, + { + name: "redis with passwordRef — password NOT in SessionRedis", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + PasswordRef: &mcpv1beta1.SecretKeyRef{ + Name: "redis-secret", + Key: "password", + }, + }, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + require.NotNil(t, sc) + require.NotNil(t, sc.SessionRedis) + assert.Equal(t, "redis:6379", sc.SessionRedis.Address) + // Password must NOT leak into the RunConfig — it's injected + // separately as the THV_SESSION_REDIS_PASSWORD pod env var + // by buildRedisPasswordEnvVarForRemoteProxy. + data, err := json.Marshal(sc) + require.NoError(t, err) + assert.NotContains(t, string(data), "redis-secret") + assert.NotContains(t, string(data), "password") + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + runConfig := &runner.RunConfig{} + proxy := &mcpv1beta1.MCPRemoteProxy{ + Spec: mcpv1beta1.MCPRemoteProxySpec{ + SessionStorage: tt.storage, + }, + } + populateScalingConfigForRemoteProxy(runConfig, proxy) + tt.expected(t, runConfig.ScalingConfig) + }) + } +} diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 21e6ee87a7..aaed10da23 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -435,6 +435,63 @@ spec: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. @@ -1018,6 +1075,63 @@ spec: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. From c52a373bbd29081d4bc640acf4ce1e6229949130 Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Sat, 9 May 2026 02:24:23 -0400 Subject: [PATCH 2/6] chore: manifests --- ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 114 ++++++++++++++++++ docs/operator/crd-api.md | 2 + 2 files changed, 116 insertions(+) diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index ac0ea398db..644a521b9d 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -438,6 +438,63 @@ spec: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. @@ -1021,6 +1078,63 @@ spec: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index 44e36106de..1acc962df7 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -2032,6 +2032,7 @@ _Appears in:_ | `resourceOverrides` _[api.v1beta1.ResourceOverrides](#apiv1beta1resourceoverrides)_ | ResourceOverrides allows overriding annotations and labels for resources created by the operator | | Optional: \{\}
| | `groupRef` _[api.v1beta1.MCPGroupRef](#apiv1beta1mcpgroupref)_ | GroupRef references the MCPGroup this proxy belongs to.
The referenced MCPGroup must be in the same namespace. | | Optional: \{\}
| | `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity. | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| +| `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.
When nil, no session storage is configured and the proxy falls back to
pod-local in-memory session state — incompatible with multi-replica
deployments behind load balancers that don't preserve client-IP affinity
(e.g. AWS ALB across multiple AZs).
The transparent proxy validates `Mcp-Session-Id` against this store on
every non-initialize request (see pkg/transport/proxy/transparent/
transparent_proxy.go) and rewrites client-facing session IDs to backend
session IDs using session metadata. Both lookups require shared state
across replicas.
Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. | | Optional: \{\}
| #### api.v1beta1.MCPRemoteProxyStatus @@ -3093,6 +3094,7 @@ into the vMCP ConfigMap so the vMCP process receives connection parameters at st _Appears in:_ +- [api.v1beta1.MCPRemoteProxySpec](#apiv1beta1mcpremoteproxyspec) - [api.v1beta1.MCPServerSpec](#apiv1beta1mcpserverspec) - [api.v1beta1.VirtualMCPServerSpec](#apiv1beta1virtualmcpserverspec) From 4c065e7d3acac51d88d8d8bf2641f419c53dcfce Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Fri, 12 Jun 2026 07:34:32 +0100 Subject: [PATCH 3/6] Populate scaling config before middleware configs Address review feedback on #5237: PopulateMiddlewareConfigs -> addRateLimitMiddleware reads ScalingConfig.SessionRedis and errors when it is nil while a rate-limit config is present. MCPRemoteProxy has no rate limiting yet, but the reversed ordering diverged from the documented invariant in mcpserver_runconfig.go and would break as soon as rate limiting is wired in. Mirror the MCPServer ordering. Co-Authored-By: Claude Fable 5 --- .../controllers/mcpremoteproxy_runconfig.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go index 8e26dd352d..997ec22923 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go @@ -168,19 +168,20 @@ func (r *MCPRemoteProxyReconciler) createRunConfigFromMCPRemoteProxy( return nil, err } - // Populate middleware configs from the configuration fields - // This ensures that middleware_configs is properly set for serialization - if err := runner.PopulateMiddlewareConfigs(runConfig); err != nil { - return nil, fmt.Errorf("failed to populate middleware configs: %w", err) - } - // Populate ScalingConfig.SessionRedis from spec.sessionStorage so the // proxy runner has the address/db/keyPrefix needed to construct a // shared Redis-backed session store. The Redis password is intentionally // excluded here — it is injected as the THV_SESSION_REDIS_PASSWORD env // var by buildRedisPasswordEnvVar in mcpremoteproxy_deployment.go. + // Must run before PopulateMiddlewareConfigs because rate limiting reads SessionRedis. populateScalingConfigForRemoteProxy(runConfig, proxy) + // Populate middleware configs from the configuration fields + // This ensures that middleware_configs is properly set for serialization + if err := runner.PopulateMiddlewareConfigs(runConfig); err != nil { + return nil, fmt.Errorf("failed to populate middleware configs: %w", err) + } + return runConfig, nil } From 9364db044f587c7437a810eda2be1589c0bf3800 Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Fri, 12 Jun 2026 07:34:41 +0100 Subject: [PATCH 4/6] Document sessionAffinity/sessionStorage interplay Address review feedback on #5237: the two fields coexisted with no cross-reference. Redis-backed sessionStorage only delivers HA when sessionAffinity is None, and sessionAffinity None without Redis is the exact Session-not-found failure this feature fixes. Spell out both directions in the field comments so the guidance surfaces in the CRD schema and kubectl explain. Generated with controller-gen v0.17.3 and crd-ref-docs. Co-Authored-By: Claude Fable 5 --- .../api/v1beta1/mcpremoteproxy_types.go | 11 ++++++++++ ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 22 +++++++++++++++++++ ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 22 +++++++++++++++++++ docs/operator/crd-api.md | 4 ++-- 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index c2d6b5e5ea..a89d6b0943 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -138,6 +138,13 @@ type MCPRemoteProxySpec struct { // SessionAffinity controls whether the Service routes repeated client connections to the same pod. // MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. // Set to "None" for stateless servers or when using an external load balancer with its own affinity. + // + // Interaction with sessionStorage: when running multiple replicas with + // sessionStorage.provider "redis", set this to "None" so requests are + // distributed across replicas and sessions resolve via the shared store. + // Conversely, "None" without Redis-backed sessionStorage breaks session + // continuity — any request landing on a different pod fails with + // "Session not found". // +kubebuilder:validation:Enum=ClientIP;None // +kubebuilder:default=ClientIP // +optional @@ -155,6 +162,10 @@ type MCPRemoteProxySpec struct { // session IDs using session metadata. Both lookups require shared state // across replicas. // + // When using the Redis provider, also set sessionAffinity to "None" so the + // Service routes requests round-robin and all replicas rely on the shared + // session store rather than pod-local state. + // // Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. // +optional SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty"` diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 6f75ce25c3..6a5eb13876 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -475,6 +475,13 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None @@ -493,6 +500,10 @@ spec: session IDs using session metadata. Both lookups require shared state across replicas. + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. properties: address: @@ -1157,6 +1168,13 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None @@ -1175,6 +1193,10 @@ spec: session IDs using session metadata. Both lookups require shared state across replicas. + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. properties: address: diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index 6cf2b1bc00..59d787f44b 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -478,6 +478,13 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None @@ -496,6 +503,10 @@ spec: session IDs using session metadata. Both lookups require shared state across replicas. + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. properties: address: @@ -1160,6 +1171,13 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None @@ -1178,6 +1196,10 @@ spec: session IDs using session metadata. Both lookups require shared state across replicas. + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. properties: address: diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index 9812d3f508..9631bcec00 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -2170,8 +2170,8 @@ _Appears in:_ | `endpointPrefix` _string_ | EndpointPrefix is the path prefix to prepend to SSE endpoint URLs.
This is used to handle path-based ingress routing scenarios where the ingress
strips a path prefix before forwarding to the backend. | | Optional: \{\}
| | `resourceOverrides` _[api.v1beta1.ResourceOverrides](#apiv1beta1resourceoverrides)_ | ResourceOverrides allows overriding annotations and labels for resources created by the operator | | Optional: \{\}
| | `groupRef` _[api.v1beta1.MCPGroupRef](#apiv1beta1mcpgroupref)_ | GroupRef references the MCPGroup this proxy belongs to.
The referenced MCPGroup must be in the same namespace. | | Optional: \{\}
| -| `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity. | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| -| `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.
When nil, no session storage is configured and the proxy falls back to
pod-local in-memory session state — incompatible with multi-replica
deployments behind load balancers that don't preserve client-IP affinity
(e.g. AWS ALB across multiple AZs).
The transparent proxy validates `Mcp-Session-Id` against this store on
every non-initialize request (see pkg/transport/proxy/transparent/
transparent_proxy.go) and rewrites client-facing session IDs to backend
session IDs using session metadata. Both lookups require shared state
across replicas.
Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. | | Optional: \{\}
| +| `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity.
Interaction with sessionStorage: when running multiple replicas with
sessionStorage.provider "redis", set this to "None" so requests are
distributed across replicas and sessions resolve via the shared store.
Conversely, "None" without Redis-backed sessionStorage breaks session
continuity — any request landing on a different pod fails with
"Session not found". | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| +| `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.
When nil, no session storage is configured and the proxy falls back to
pod-local in-memory session state — incompatible with multi-replica
deployments behind load balancers that don't preserve client-IP affinity
(e.g. AWS ALB across multiple AZs).
The transparent proxy validates `Mcp-Session-Id` against this store on
every non-initialize request (see pkg/transport/proxy/transparent/
transparent_proxy.go) and rewrites client-facing session IDs to backend
session IDs using session metadata. Both lookups require shared state
across replicas.
When using the Redis provider, also set sessionAffinity to "None" so the
Service routes requests round-robin and all replicas rely on the shared
session store rather than pod-local state.
Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. | | Optional: \{\}
| #### api.v1beta1.MCPRemoteProxyStatus From d6264234d197751a37cb033e7e7786e8752e4ba2 Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Fri, 12 Jun 2026 19:49:52 +0100 Subject: [PATCH 5/6] Add spec.replicas to MCPRemoteProxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MCPRemoteProxy was the only scalable type without a replicas field: the operator hardcoded 1 on create, forcing operators to pin counts with a min=max HPA workaround. Mirror the MCPServer/VirtualMCPServer semantics so the HA story matches sessionStorage: - Replicas *int32 on the spec; nil leaves Deployment.Spec.Replicas unset (apiserver default 1, HPA-compatible), non-nil is operator authoritative — set on create, drift-checked in deploymentNeedsUpdate, synced on update while nil preserves the live count. - Surface the shared SessionStorageWarning condition when replicas > 1 without Redis session storage (status condition parity with MCPServer and VirtualMCPServer). - Tests cover create, drift detection, the ensureDeployment preserve-vs-sync paths against a fake cluster, and the condition matrix. CRDs and docs regenerated (controller-gen v0.17.3). Co-Authored-By: Claude Fable 5 --- .../api/v1beta1/mcpremoteproxy_types.go | 12 + .../api/v1beta1/zz_generated.deepcopy.go | 5 + .../controllers/mcpremoteproxy_controller.go | 53 +++- .../controllers/mcpremoteproxy_deployment.go | 6 +- .../mcpremoteproxy_deployment_test.go | 2 +- .../mcpremoteproxy_replicas_test.go | 255 ++++++++++++++++++ ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 26 ++ ...oolhive.stacklok.dev_mcpremoteproxies.yaml | 26 ++ docs/operator/crd-api.md | 1 + 9 files changed, 382 insertions(+), 4 deletions(-) create mode 100644 cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 76a7f53c38..2a3aa6e6e9 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -170,6 +170,18 @@ type MCPRemoteProxySpec struct { // +optional SessionAffinity string `json:"sessionAffinity,omitempty"` + // Replicas is the desired number of proxy pod replicas. + // MCPRemoteProxy creates a single Deployment for the proxy process, so there + // is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + // When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + // management to an HPA or other external controller. + // When set above 1, also configure sessionStorage with the redis provider and + // sessionAffinity: "None" so sessions resolve across replicas; otherwise a + // SessionStorageWarning condition is surfaced on the resource status. + // +kubebuilder:validation:Minimum=0 + // +optional + Replicas *int32 `json:"replicas,omitempty"` + // SessionStorage configures session storage for stateful horizontal scaling. // When nil, no session storage is configured and the proxy falls back to // pod-local in-memory session state — incompatible with multi-replica diff --git a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go index 771cf2ff52..4c82672d70 100644 --- a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go +++ b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go @@ -1457,6 +1457,11 @@ func (in *MCPRemoteProxySpec) DeepCopyInto(out *MCPRemoteProxySpec) { *out = new(MCPGroupRef) **out = **in } + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } if in.SessionStorage != nil { in, out := &in.SessionStorage, &out.SessionStorage *out = new(SessionStorageConfig) diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_controller.go b/cmd/thv-operator/controllers/mcpremoteproxy_controller.go index ce18e84019..fa0336f0d0 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_controller.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_controller.go @@ -127,6 +127,9 @@ func (r *MCPRemoteProxyReconciler) validateAndHandleConfigs(ctx context.Context, // Surface advisory condition when primaryUpstreamProvider is set but ignored r.validateAuthzPrimaryUpstreamProviderIgnored(proxy) + // Surface advisory condition when replicas > 1 without Redis session storage + r.validateSessionStorageForReplicas(proxy) + // Handle MCPToolConfig if err := r.handleToolConfig(ctx, proxy); err != nil { ctxLogger.Error(err, "Failed to handle MCPToolConfig") @@ -301,10 +304,15 @@ func (r *MCPRemoteProxyReconciler) ensureDeployment( if newDeployment == nil { return ctrl.Result{}, fmt.Errorf("failed to create updated Deployment object") } - // Update the deployment spec but preserve replica count for HPA compatibility + // Update template and metadata. Also sync Spec.Replicas when spec.replicas + // is non-nil (operator authoritative); preserve it when nil so an HPA or + // other external controller can manage scaling. deployment.Spec.Template = newDeployment.Spec.Template deployment.Labels = newDeployment.Labels deployment.Annotations = ctrlutil.MergeAnnotations(newDeployment.Annotations, deployment.Annotations) + if newDeployment.Spec.Replicas != nil { + deployment.Spec.Replicas = newDeployment.Spec.Replicas + } ctxLogger.Info("Updating Deployment", "Deployment.Namespace", deployment.Namespace, "Deployment.Name", deployment.Name) if err := r.Update(ctx, deployment); err != nil { @@ -1107,6 +1115,40 @@ func (*MCPRemoteProxyReconciler) validateAuthzPrimaryUpstreamProviderIgnored(pro }) } +// validateSessionStorageForReplicas surfaces a SessionStorageWarning condition +// when replicas > 1 but session storage is not configured with the Redis +// backend. Reconciliation continues regardless; this is advisory only. +// Mirrors the MCPServer and VirtualMCPServer validators so the condition is +// consistent across all types that share the replicas + sessionStorage pair. +// The caller is responsible for persisting status. +func (*MCPRemoteProxyReconciler) validateSessionStorageForReplicas(proxy *mcpv1beta1.MCPRemoteProxy) { + condition := func() metav1.Condition { + if proxy.Spec.Replicas == nil || *proxy.Spec.Replicas <= 1 { + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + Message: "session storage warning is not active", + } + } + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis { + return metav1.Condition{ + Status: metav1.ConditionTrue, + Reason: mcpv1beta1.ConditionReasonSessionStorageMissing, + Message: "replicas > 1 but sessionStorage.provider is not redis; sessions are not shared across replicas", + } + } + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: mcpv1beta1.ConditionReasonSessionStorageConfigured, + Message: "Redis session storage is configured", + } + }() + condition.Type = mcpv1beta1.ConditionSessionStorageWarning + condition.ObservedGeneration = proxy.Generation + meta.SetStatusCondition(&proxy.Status.Conditions, condition) +} + // ensureRBACResources ensures that the RBAC resources are in place for the remote proxy. // Uses the RBAC client (pkg/kubernetes/rbac) which creates or updates RBAC resources // automatically during operator upgrades. @@ -1300,6 +1342,15 @@ func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate( return true } + // Check if spec.replicas has changed. Only compare when spec.replicas is + // non-nil; nil means hands-off mode (HPA or another external controller + // manages replicas) and the live count is authoritative. + if proxy.Spec.Replicas != nil { + if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != *proxy.Spec.Replicas { + return true + } + } + return false } diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go index e3df1b0489..a6237ca28c 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go @@ -27,7 +27,6 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy( ctx context.Context, proxy *mcpv1beta1.MCPRemoteProxy, runConfigChecksum string, ) *appsv1.Deployment { ls := labelsForMCPRemoteProxy(proxy.Name) - replicas := int32(1) // Build deployment components using helper functions args := r.buildContainerArgs() @@ -63,7 +62,10 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy( Annotations: deploymentAnnotations, }, Spec: appsv1.DeploymentSpec{ - Replicas: &replicas, + // nil leaves the replica count to the apiserver default (1) on create + // and to an HPA or other external controller thereafter; non-nil is + // operator-owned and reconciled by deploymentNeedsUpdate. + Replicas: proxy.Spec.Replicas, Selector: &metav1.LabelSelector{ MatchLabels: ls, }, diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go index 52b88fb6f6..5ad27823df 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go @@ -58,7 +58,7 @@ func TestDeploymentForMCPRemoteProxy(t *testing.T) { t.Helper() assert.Equal(t, "basic-proxy", dep.Name) assert.Equal(t, "default", dep.Namespace) - assert.Equal(t, int32(1), *dep.Spec.Replicas) + assert.Nil(t, dep.Spec.Replicas, "nil spec.replicas leaves the count to the apiserver default") // Verify labels assert.Equal(t, labelsForMCPRemoteProxy("basic-proxy"), dep.Spec.Selector.MatchLabels) diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go new file mode 100644 index 0000000000..6ff30a5207 --- /dev/null +++ b/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go @@ -0,0 +1,255 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" + "github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum" +) + +// replicasTestScheme registers the types the replica tests seed into the fake +// client: the CRDs plus core and apps (Deployments). +func replicasTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, appsv1.AddToScheme(scheme)) + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + return scheme +} + +func replicasTestProxy(replicas *int32) *mcpv1beta1.MCPRemoteProxy { + return &mcpv1beta1.MCPRemoteProxy{ + ObjectMeta: metav1.ObjectMeta{Name: "replicas-proxy", Namespace: "default"}, + Spec: mcpv1beta1.MCPRemoteProxySpec{ + RemoteURL: "https://mcp.example.com", + ProxyPort: 8080, + Replicas: replicas, + }, + } +} + +// TestDeploymentForMCPRemoteProxyReplicas verifies spec.replicas flows into +// the generated Deployment: nil stays nil (apiserver defaults to 1, and an +// HPA can manage the count thereafter), non-nil is set verbatim. +func TestDeploymentForMCPRemoteProxyReplicas(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + replicas *int32 + }{ + {name: "nil replicas leaves Deployment.Spec.Replicas unset", replicas: nil}, + {name: "replicas 3 is set on the Deployment", replicas: int32Ptr(3)}, + {name: "replicas 0 is set on the Deployment", replicas: int32Ptr(0)}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + r := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + + dep := r.deploymentForMCPRemoteProxy(context.Background(), replicasTestProxy(tt.replicas), "test-checksum") + require.NotNil(t, dep) + + if tt.replicas == nil { + assert.Nil(t, dep.Spec.Replicas) + } else { + require.NotNil(t, dep.Spec.Replicas) + assert.Equal(t, *tt.replicas, *dep.Spec.Replicas) + } + }) + } +} + +// TestMCPRemoteProxyDeploymentNeedsUpdateReplicas pins the drift semantics: +// non-nil spec.replicas is operator-owned and any divergence triggers an +// update; nil spec.replicas is hands-off so external scaling (HPA) is never +// fought by the reconciler. +func TestMCPRemoteProxyDeploymentNeedsUpdateReplicas(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + specReplicas *int32 + liveReplicas *int32 + wantUpdate bool + }{ + {name: "nil spec ignores externally scaled count", specReplicas: nil, liveReplicas: int32Ptr(5), wantUpdate: false}, + {name: "set spec reconciles drifted count", specReplicas: int32Ptr(3), liveReplicas: int32Ptr(1), wantUpdate: true}, + {name: "set spec matches live count", specReplicas: int32Ptr(3), liveReplicas: int32Ptr(3), wantUpdate: false}, + {name: "set spec with nil live count", specReplicas: int32Ptr(2), liveReplicas: nil, wantUpdate: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + r := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + + proxy := replicasTestProxy(tt.specReplicas) + // Generate the desired deployment so all non-replica comparisons + // in deploymentNeedsUpdate see identical state. + deployment := r.deploymentForMCPRemoteProxy(context.Background(), proxy, "test-checksum") + require.NotNil(t, deployment) + deployment.Spec.Replicas = tt.liveReplicas + + assert.Equal(t, tt.wantUpdate, + r.deploymentNeedsUpdate(context.Background(), deployment, proxy, "test-checksum")) + }) + } +} + +// TestMCPRemoteProxyEnsureDeploymentReplicaSync drives ensureDeployment +// against a fake cluster to verify the update path end to end: a live count +// scaled by an external controller survives when spec.replicas is nil, and is +// overwritten when spec.replicas is set. +func TestMCPRemoteProxyEnsureDeploymentReplicaSync(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + specReplicas *int32 + liveReplicas int32 + wantReplicas int32 + }{ + {name: "nil spec preserves HPA-scaled count", specReplicas: nil, liveReplicas: 5, wantReplicas: 5}, + {name: "set spec overrides drifted count", specReplicas: int32Ptr(3), liveReplicas: 5, wantReplicas: 3}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + proxy := replicasTestProxy(tt.specReplicas) + + // Seed the RunConfig ConfigMap so getRunConfigChecksum resolves the + // same checksum the test passes to the deployment generator. + runConfigCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: proxy.Name + "-runconfig", + Namespace: proxy.Namespace, + Annotations: map[string]string{checksum.ContentChecksumAnnotation: "test-checksum"}, + }, + } + + seedReconciler := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + liveDeployment := seedReconciler.deploymentForMCPRemoteProxy(context.Background(), proxy, "test-checksum") + require.NotNil(t, liveDeployment) + liveDeployment.Spec.Replicas = int32Ptr(tt.liveReplicas) + // Strip the generated owner reference: the fake client rejects + // owner refs to objects it has not assigned a UID. + liveDeployment.OwnerReferences = nil + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(proxy, runConfigCM, liveDeployment). + Build() + r := &MCPRemoteProxyReconciler{Client: fakeClient, Scheme: scheme} + + _, err := r.ensureDeployment(context.Background(), proxy) + require.NoError(t, err) + + got := &appsv1.Deployment{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: proxy.Name, Namespace: proxy.Namespace}, got)) + require.NotNil(t, got.Spec.Replicas) + assert.Equal(t, tt.wantReplicas, *got.Spec.Replicas) + }) + } +} + +// TestValidateSessionStorageForReplicasRemoteProxy mirrors the MCPServer and +// VirtualMCPServer validators: the SessionStorageWarning condition is True +// only when replicas > 1 without Redis-backed session storage. +func TestValidateSessionStorageForReplicasRemoteProxy(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + replicas *int32 + storage *mcpv1beta1.SessionStorageConfig + wantStatus metav1.ConditionStatus + wantReason string + }{ + { + name: "nil replicas is not applicable", + replicas: nil, + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + }, + { + name: "single replica is not applicable", + replicas: int32Ptr(1), + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + }, + { + name: "multiple replicas without session storage warns", + replicas: int32Ptr(3), + wantStatus: metav1.ConditionTrue, + wantReason: mcpv1beta1.ConditionReasonSessionStorageMissing, + }, + { + name: "multiple replicas with memory provider warns", + replicas: int32Ptr(3), + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + wantStatus: metav1.ConditionTrue, + wantReason: mcpv1beta1.ConditionReasonSessionStorageMissing, + }, + { + name: "multiple replicas with redis provider is configured", + replicas: int32Ptr(3), + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + }, + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageConfigured, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + proxy := replicasTestProxy(tt.replicas) + proxy.Spec.SessionStorage = tt.storage + + r := &MCPRemoteProxyReconciler{} + r.validateSessionStorageForReplicas(proxy) + + cond := meta.FindStatusCondition(proxy.Status.Conditions, mcpv1beta1.ConditionSessionStorageWarning) + require.NotNil(t, cond, "SessionStorageWarning condition must always be set") + assert.Equal(t, tt.wantStatus, cond.Status) + assert.Equal(t, tt.wantReason, cond.Reason) + }) + } +} diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 2c7b22f306..777c310691 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -361,6 +361,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -1081,6 +1094,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index bd79a673da..f54568e66e 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -364,6 +364,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -1084,6 +1097,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index 10e613a7fe..9d2c604524 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -2285,6 +2285,7 @@ _Appears in:_ | `resourceOverrides` _[api.v1beta1.ResourceOverrides](#apiv1beta1resourceoverrides)_ | ResourceOverrides allows overriding annotations and labels for resources created by the operator | | Optional: \{\}
| | `groupRef` _[api.v1beta1.MCPGroupRef](#apiv1beta1mcpgroupref)_ | GroupRef references the MCPGroup this proxy belongs to.
The referenced MCPGroup must be in the same namespace. | | Optional: \{\}
| | `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity.
Interaction with sessionStorage: when running multiple replicas with
sessionStorage.provider "redis", set this to "None" so requests are
distributed across replicas and sessions resolve via the shared store.
Conversely, "None" without Redis-backed sessionStorage breaks session
continuity — any request landing on a different pod fails with
"Session not found". | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| +| `replicas` _integer_ | Replicas is the desired number of proxy pod replicas.
MCPRemoteProxy creates a single Deployment for the proxy process, so there
is only one replicas field (mirrors VirtualMCPServer.spec.replicas).
When nil, the operator does not set Deployment.Spec.Replicas, leaving replica
management to an HPA or other external controller.
When set above 1, also configure sessionStorage with the redis provider and
sessionAffinity: "None" so sessions resolve across replicas; otherwise a
SessionStorageWarning condition is surfaced on the resource status. | | Minimum: 0
Optional: \{\}
| | `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.
When nil, no session storage is configured and the proxy falls back to
pod-local in-memory session state — incompatible with multi-replica
deployments behind load balancers that don't preserve client-IP affinity
(e.g. AWS ALB across multiple AZs).
The transparent proxy validates `Mcp-Session-Id` against this store on
every non-initialize request (see pkg/transport/proxy/transparent/
transparent_proxy.go) and rewrites client-facing session IDs to backend
session IDs using session metadata. Both lookups require shared state
across replicas.
When using the Redis provider, also set sessionAffinity to "None" so the
Service routes requests round-robin and all replicas rely on the shared
session store rather than pod-local state.
Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. | | Optional: \{\}
| From 20cfd13da1295a410fa63ef5bf68920e932819b6 Mon Sep 17 00:00:00 2001 From: Aron Gates Date: Fri, 12 Jun 2026 22:03:00 +0100 Subject: [PATCH 6/6] Retrigger CI after known api-workloads flake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The delete-workload spec hit its 60s Eventually timeout — the same flaky spec previously tracked upstream. This PR only touches operator code and generated CRDs, none of which is compiled into the thv binary the api-workloads e2e suite exercises. Co-Authored-By: Claude Fable 5