Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,49 @@ type MCPRemoteProxySpec struct {
// SessionAffinity controls whether the Service routes repeated client connections to the same pod.
// MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
// Set to "None" for stateless servers or when using an external load balancer with its own affinity.
//
// Interaction with sessionStorage: when running multiple replicas with
// sessionStorage.provider "redis", set this to "None" so requests are
// distributed across replicas and sessions resolve via the shared store.
// Conversely, "None" without Redis-backed sessionStorage breaks session
// continuity — any request landing on a different pod fails with
// "Session not found".
// +kubebuilder:validation:Enum=ClientIP;None
// +kubebuilder:default=ClientIP
// +optional
SessionAffinity string `json:"sessionAffinity,omitempty"`

// Replicas is the desired number of proxy pod replicas.
// MCPRemoteProxy creates a single Deployment for the proxy process, so there
// is only one replicas field (mirrors VirtualMCPServer.spec.replicas).
// When nil, the operator does not set Deployment.Spec.Replicas, leaving replica
// management to an HPA or other external controller.
// When set above 1, also configure sessionStorage with the redis provider and
// sessionAffinity: "None" so sessions resolve across replicas; otherwise a
// SessionStorageWarning condition is surfaced on the resource status.
// +kubebuilder:validation:Minimum=0
// +optional
Replicas *int32 `json:"replicas,omitempty"`

// SessionStorage configures session storage for stateful horizontal scaling.
// When nil, no session storage is configured and the proxy falls back to
// pod-local in-memory session state — incompatible with multi-replica
// deployments behind load balancers that don't preserve client-IP affinity
// (e.g. AWS ALB across multiple AZs).
//
// The transparent proxy validates `Mcp-Session-Id` against this store on
// every non-initialize request (see pkg/transport/proxy/transparent/
// transparent_proxy.go) and rewrites client-facing session IDs to backend
// session IDs using session metadata. Both lookups require shared state
// across replicas.
//
// When using the Redis provider, also set sessionAffinity to "None" so the
// Service routes requests round-robin and all replicas rely on the shared
// session store rather than pod-local state.
//
// Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage.
// +optional

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider documenting the sessionAffinity + sessionStorage interaction.

These two fields now coexist without any cross-reference, which creates a footgun. When sessionStorage.provider == redis the correct HA setup also requires sessionAffinity: None — otherwise the k8s Service's ClientIP stickiness makes the Redis store redundant. Conversely, sessionAffinity: None without Redis is the broken configuration this PR was written to fix.

At minimum, the SessionStorage field comment should say: "When using the Redis provider, also set sessionAffinity: None so the Service routes requests round-robin and all replicas rely on the shared session store rather than pod-local state."

A stronger option is a kubebuilder XValidation that requires sessionAffinity == "None" when sessionStorage.provider == "redis", catching the misconfiguration at admission time.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documented in 9364db0 — both field comments now cross-reference each other (regenerated into the CRD schema and crd-api.md). Went with docs-only rather than the CEL rule: ClientIP + redis is still a valid combination (the shared store preserves sessions when a pod dies and affinity re-pins the client), so rejecting it at admission felt too strict.

SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty"`
}

// MCPRemoteProxyStatus defines the observed state of MCPRemoteProxy
Expand Down
10 changes: 10 additions & 0 deletions cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 52 additions & 1 deletion cmd/thv-operator/controllers/mcpremoteproxy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ func (r *MCPRemoteProxyReconciler) validateAndHandleConfigs(ctx context.Context,
// Surface advisory condition when primaryUpstreamProvider is set but ignored
r.validateAuthzPrimaryUpstreamProviderIgnored(proxy)

// Surface advisory condition when replicas > 1 without Redis session storage
r.validateSessionStorageForReplicas(proxy)

// Handle MCPToolConfig
if err := r.handleToolConfig(ctx, proxy); err != nil {
ctxLogger.Error(err, "Failed to handle MCPToolConfig")
Expand Down Expand Up @@ -301,10 +304,15 @@ func (r *MCPRemoteProxyReconciler) ensureDeployment(
if newDeployment == nil {
return ctrl.Result{}, fmt.Errorf("failed to create updated Deployment object")
}
// Update the deployment spec but preserve replica count for HPA compatibility
// Update template and metadata. Also sync Spec.Replicas when spec.replicas
// is non-nil (operator authoritative); preserve it when nil so an HPA or
// other external controller can manage scaling.
deployment.Spec.Template = newDeployment.Spec.Template
deployment.Labels = newDeployment.Labels
deployment.Annotations = ctrlutil.MergeAnnotations(newDeployment.Annotations, deployment.Annotations)
if newDeployment.Spec.Replicas != nil {
deployment.Spec.Replicas = newDeployment.Spec.Replicas
}

ctxLogger.Info("Updating Deployment", "Deployment.Namespace", deployment.Namespace, "Deployment.Name", deployment.Name)
if err := r.Update(ctx, deployment); err != nil {
Expand Down Expand Up @@ -1107,6 +1115,40 @@ func (*MCPRemoteProxyReconciler) validateAuthzPrimaryUpstreamProviderIgnored(pro
})
}

// validateSessionStorageForReplicas surfaces a SessionStorageWarning condition
// when replicas > 1 but session storage is not configured with the Redis
// backend. Reconciliation continues regardless; this is advisory only.
// Mirrors the MCPServer and VirtualMCPServer validators so the condition is
// consistent across all types that share the replicas + sessionStorage pair.
// The caller is responsible for persisting status.
func (*MCPRemoteProxyReconciler) validateSessionStorageForReplicas(proxy *mcpv1beta1.MCPRemoteProxy) {
condition := func() metav1.Condition {
if proxy.Spec.Replicas == nil || *proxy.Spec.Replicas <= 1 {
return metav1.Condition{
Status: metav1.ConditionFalse,
Reason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable,
Message: "session storage warning is not active",
}
}
if proxy.Spec.SessionStorage == nil ||
proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis {
return metav1.Condition{
Status: metav1.ConditionTrue,
Reason: mcpv1beta1.ConditionReasonSessionStorageMissing,
Message: "replicas > 1 but sessionStorage.provider is not redis; sessions are not shared across replicas",
}
}
return metav1.Condition{
Status: metav1.ConditionFalse,
Reason: mcpv1beta1.ConditionReasonSessionStorageConfigured,
Message: "Redis session storage is configured",
}
}()
condition.Type = mcpv1beta1.ConditionSessionStorageWarning
condition.ObservedGeneration = proxy.Generation
meta.SetStatusCondition(&proxy.Status.Conditions, condition)
}

// ensureRBACResources ensures that the RBAC resources are in place for the remote proxy.
// Uses the RBAC client (pkg/kubernetes/rbac) which creates or updates RBAC resources
// automatically during operator upgrades.
Expand Down Expand Up @@ -1300,6 +1342,15 @@ func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate(
return true
}

// Check if spec.replicas has changed. Only compare when spec.replicas is
// non-nil; nil means hands-off mode (HPA or another external controller
// manages replicas) and the live count is authoritative.
if proxy.Spec.Replicas != nil {
if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != *proxy.Spec.Replicas {
return true
}
}

return false
}

Expand Down
36 changes: 34 additions & 2 deletions cmd/thv-operator/controllers/mcpremoteproxy_deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil"
"github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum"
"github.com/stacklok/toolhive/pkg/container/kubernetes"
"github.com/stacklok/toolhive/pkg/transport/session"
"github.com/stacklok/toolhive/pkg/vmcp/headerforward/wirefmt"
)

Expand All @@ -26,7 +27,6 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy(
ctx context.Context, proxy *mcpv1beta1.MCPRemoteProxy, runConfigChecksum string,
) *appsv1.Deployment {
ls := labelsForMCPRemoteProxy(proxy.Name)
replicas := int32(1)

// Build deployment components using helper functions
args := r.buildContainerArgs()
Expand Down Expand Up @@ -62,7 +62,10 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy(
Annotations: deploymentAnnotations,
},
Spec: appsv1.DeploymentSpec{
Replicas: &replicas,
// nil leaves the replica count to the apiserver default (1) on create
// and to an HPA or other external controller thereafter; non-nil is
// operator-owned and reconciled by deploymentNeedsUpdate.
Replicas: proxy.Spec.Replicas,
Selector: &metav1.LabelSelector{
MatchLabels: ls,
},
Expand Down Expand Up @@ -234,9 +237,38 @@ func (r *MCPRemoteProxyReconciler) buildEnvVarsForProxy(
}
}

// Add THV_SESSION_REDIS_PASSWORD when sessionStorage uses a passwordRef.
// The non-sensitive parts (address/db/keyPrefix) are populated into the
// runconfig by populateScalingConfigForRemoteProxy; the password is
// injected separately so it never lands in the ConfigMap.
env = append(env, buildRedisPasswordEnvVarForRemoteProxy(proxy)...)

return ctrlutil.EnsureRequiredEnvVars(ctx, env)
}

// buildRedisPasswordEnvVarForRemoteProxy returns the THV_SESSION_REDIS_PASSWORD
// env var sourced from spec.sessionStorage.passwordRef when sessionStorage uses
// the redis provider; returns nil otherwise. Mirrors VirtualMCPServer's
// buildRedisPasswordEnvVar in virtualmcpserver_deployment.go.
func buildRedisPasswordEnvVarForRemoteProxy(proxy *mcpv1beta1.MCPRemoteProxy) []corev1.EnvVar {
if proxy.Spec.SessionStorage == nil ||
proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis ||
proxy.Spec.SessionStorage.PasswordRef == nil {
return nil
}
return []corev1.EnvVar{{
Name: session.RedisPasswordEnvVar,
ValueFrom: &corev1.EnvVarSource{
SecretKeyRef: &corev1.SecretKeySelector{
LocalObjectReference: corev1.LocalObjectReference{
Name: proxy.Spec.SessionStorage.PasswordRef.Name,
},
Key: proxy.Spec.SessionStorage.PasswordRef.Key,
},
},
}}
}

// buildOIDCClientSecretEnvVars returns OIDC client secret env vars when the proxy
// references an MCPOIDCConfig with an inline client secret. Returns nil otherwise.
func (r *MCPRemoteProxyReconciler) buildOIDCClientSecretEnvVars(
Expand Down
66 changes: 65 additions & 1 deletion cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (

mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil"
"github.com/stacklok/toolhive/pkg/transport/session"
)

// TestDeploymentForMCPRemoteProxy tests deployment generation
Expand Down Expand Up @@ -57,7 +58,7 @@ func TestDeploymentForMCPRemoteProxy(t *testing.T) {
t.Helper()
assert.Equal(t, "basic-proxy", dep.Name)
assert.Equal(t, "default", dep.Namespace)
assert.Equal(t, int32(1), *dep.Spec.Replicas)
assert.Nil(t, dep.Spec.Replicas, "nil spec.replicas leaves the count to the apiserver default")

// Verify labels
assert.Equal(t, labelsForMCPRemoteProxy("basic-proxy"), dep.Spec.Selector.MatchLabels)
Expand Down Expand Up @@ -1306,3 +1307,66 @@ func TestMCPRemoteProxyServiceNeedsUpdate(t *testing.T) {
})
}
}

// TestBuildRedisPasswordEnvVarForRemoteProxy mirrors VirtualMCPServer's
// TestBuildRedisPasswordEnvVar — the env var must be injected only when
// sessionStorage uses the redis provider AND a passwordRef is set, and it
// must always be a SecretKeyRef (never a plaintext value).
func TestBuildRedisPasswordEnvVarForRemoteProxy(t *testing.T) {
t.Parallel()

passwordRef := &mcpv1beta1.SecretKeyRef{Name: "redis-secret", Key: "password"}

tests := []struct {
name string
storage *mcpv1beta1.SessionStorageConfig
expectEnVar bool
}{
{
name: "nil sessionStorage produces no env var",
storage: nil,
expectEnVar: false,
},
{
name: "memory provider produces no env var",
storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"},
expectEnVar: false,
},
{
name: "redis without passwordRef produces no env var",
storage: &mcpv1beta1.SessionStorageConfig{Provider: mcpv1beta1.SessionStorageProviderRedis, Address: "redis:6379"},
expectEnVar: false,
},
{
name: "redis with passwordRef produces THV_SESSION_REDIS_PASSWORD",
storage: &mcpv1beta1.SessionStorageConfig{
Provider: mcpv1beta1.SessionStorageProviderRedis,
Address: "redis:6379",
PasswordRef: passwordRef,
},
expectEnVar: true,
},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
proxy := &mcpv1beta1.MCPRemoteProxy{
ObjectMeta: metav1.ObjectMeta{Name: "test-proxy", Namespace: "default"},
Spec: mcpv1beta1.MCPRemoteProxySpec{SessionStorage: tc.storage},
}
env := buildRedisPasswordEnvVarForRemoteProxy(proxy)
if tc.expectEnVar {
require.Len(t, env, 1)
assert.Equal(t, session.RedisPasswordEnvVar, env[0].Name)
assert.Empty(t, env[0].Value, "must not use plaintext Value")
require.NotNil(t, env[0].ValueFrom)
require.NotNil(t, env[0].ValueFrom.SecretKeyRef)
assert.Equal(t, passwordRef.Name, env[0].ValueFrom.SecretKeyRef.Name)
assert.Equal(t, passwordRef.Key, env[0].ValueFrom.SecretKeyRef.Key)
} else {
assert.Empty(t, env)
}
})
}
}
Loading
Loading