From 258f60dfc46da1b061ac14170ce524c3dadacfe3 Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Thu, 4 Jun 2026 00:14:46 +0530
Subject: [PATCH] fix(k8s): cap buildLogCache size to bound memory on failure
 bursts (bug-bash #2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

buildLogCache snapshots kaniko build logs of FAILED builds for the
failure autopsy. It was TTL-bounded (30m) and swept stale entries on each
new failure — but had NO size cap. A burst of failing builds inside one
TTL window (a broken base image, a wedged registry, a fork-bomb of bad
deploys) accumulates one ≤200-line snapshot per failure with no ceiling:
unbounded memory growth on a long-lived api pod.

Add buildLogCacheMaxEntries=256 and capBuildLogCacheSize(), called
evict-after-store from snapshotBuildLogs. When a store pushes the live
count past the cap, evict oldest-first (a recent failure's autopsy is far
likelier to be read than one from hundreds of failures ago). sync.Map has
no length, so we snapshot keys+times in one Range pass, sort by
capturedAt, and delete the excess oldest.

Tests: TestCapBuildLogCacheSize_EvictsOldestOverCap (over-cap → newest
survive, oldest evicted), TestCapBuildLogCacheSize_NoOpUnderCap. Both new
funcs 100% covered; verified locally.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../compute/k8s/build_log_cache_test.go       | 59 +++++++++++++++++++
 internal/providers/compute/k8s/client.go      | 43 ++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/internal/providers/compute/k8s/build_log_cache_test.go b/internal/providers/compute/k8s/build_log_cache_test.go
index 7209a36a..c94f3637 100644
--- a/internal/providers/compute/k8s/build_log_cache_test.go
+++ b/internal/providers/compute/k8s/build_log_cache_test.go
@@ -14,6 +14,7 @@ package k8s
 
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
 
@@ -106,3 +107,61 @@ func TestEvictStaleBuildLogs_KeepsFreshDropsStale(t *testing.T) {
 		t.Error("stale entry was not evicted")
 	}
 }
+
+// TestCapBuildLogCacheSize_EvictsOldestOverCap verifies the size cap: a burst
+// of fresh (non-stale) snapshots beyond buildLogCacheMaxEntries is bounded by
+// evicting the OLDEST entries first, while the most-recent cap-many survive.
+// This is the memory-leak guard the TTL sweep alone does not provide — many
+// failures inside one TTL window would otherwise grow the map without bound.
+func TestCapBuildLogCacheSize_EvictsOldestOverCap(t *testing.T) {
+	p := &K8sProvider{clientset: fake.NewSimpleClientset()}
+
+	base := time.Now()
+	overflow := 10
+	total := buildLogCacheMaxEntries + overflow
+	// All fresh (well within TTL) so eviction is driven purely by the size cap,
+	// not by staleness. Older capturedAt for lower indexes → those evict first.
+	for i := 0; i < total; i++ {
+		key := fmt.Sprintf("app-%04d", i)
+		p.buildLogCache.Store(key, &buildLogCacheEntry{
+			lines:      []string{key},
+			capturedAt: base.Add(time.Duration(i) * time.Second),
+		})
+	}
+
+	p.capBuildLogCacheSize()
+
+	var count int
+	p.buildLogCache.Range(func(_, _ any) bool { count++; return true })
+	if count != buildLogCacheMaxEntries {
+		t.Fatalf("cache not capped: got %d entries, want %d", count, buildLogCacheMaxEntries)
+	}
+
+	// The oldest `overflow` entries (lowest indexes) must be gone; the newest
+	// must remain.
+	for i := 0; i < overflow; i++ {
+		if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", i)); ok {
+			t.Errorf("oldest entry app-%04d should have been evicted", i)
+		}
+	}
+	if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", total-1)); !ok {
+		t.Errorf("newest entry app-%04d was wrongly evicted", total-1)
+	}
+}
+
+// TestCapBuildLogCacheSize_NoOpUnderCap verifies the cap is a no-op when the
+// cache holds fewer than buildLogCacheMaxEntries entries (the common case).
+func TestCapBuildLogCacheSize_NoOpUnderCap(t *testing.T) {
+	p := &K8sProvider{clientset: fake.NewSimpleClientset()}
+
+	p.buildLogCache.Store("only", &buildLogCacheEntry{
+		lines:      []string{"x"},
+		capturedAt: time.Now(),
+	})
+
+	p.capBuildLogCacheSize()
+
+	if _, ok := p.buildLogCache.Load("only"); !ok {
+		t.Error("under-cap entry was wrongly evicted")
+	}
+}
diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go
index a0576671..72f936b5 100644
--- a/internal/providers/compute/k8s/client.go
+++ b/internal/providers/compute/k8s/client.go
@@ -15,6 +15,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"sort"
 	"strings"
 	"sync"
 	"time"
@@ -417,6 +418,15 @@ type buildLogCacheEntry struct {
 // failed build.
 const buildLogCacheTTL = 30 * time.Minute
 
+// buildLogCacheMaxEntries caps the number of build-log snapshots held at once.
+// The TTL alone is not a memory bound: a burst of failing builds inside one
+// 30-minute window (a broken base image, a wedged registry) would otherwise
+// accumulate one ≤buildLogMaxLines-line snapshot per failure with no ceiling.
+// When a store pushes the cache past this cap we evict the oldest entries —
+// the autopsy for a recent failure is far likelier to be read than one from
+// hundreds of failures ago. 256 entries × 200 lines is a comfortable bound.
+const buildLogCacheMaxEntries = 256
+
 // New creates a K8sProvider targeting the given namespace.
 // buildCtx is optional — when unset, builds fall back to the 1 MiB Secret path.
 // Returns an error if the k8s clientset cannot be initialized; the caller
@@ -1288,10 +1298,43 @@ func (p *K8sProvider) snapshotBuildLogs(ctx context.Context, ns, appID, jobName
 		lines:      lines,
 		capturedAt: time.Now(),
 	})
+	// Evict-after-store: the TTL sweep above only drops *expired* entries, so a
+	// burst of failures inside one TTL window still grows the map unbounded.
+	// Cap the live entry count, evicting oldest-first.
+	p.capBuildLogCacheSize()
 	slog.Info("k8s.snapshotBuildLogs: captured failed build logs for autopsy",
 		"app_id", appID, "lines", len(lines))
 }
 
+// capBuildLogCacheSize bounds buildLogCache to buildLogCacheMaxEntries, evicting
+// the oldest snapshots first. sync.Map has no length, so we snapshot keys +
+// capture times in one Range pass; if over the cap we sort by capturedAt and
+// delete the excess oldest. A concurrent Store between the Range and the
+// Deletes can only leave us at most a few entries over the cap until the next
+// failure re-runs this — an acceptable slop for a best-effort autopsy cache.
+func (p *K8sProvider) capBuildLogCacheSize() {
+	type keyedEntry struct {
+		key        any
+		capturedAt time.Time
+	}
+	var entries []keyedEntry
+	p.buildLogCache.Range(func(k, v any) bool {
+		if entry, ok := v.(*buildLogCacheEntry); ok {
+			entries = append(entries, keyedEntry{key: k, capturedAt: entry.capturedAt})
+		}
+		return true
+	})
+	if len(entries) <= buildLogCacheMaxEntries {
+		return
+	}
+	sort.Slice(entries, func(i, j int) bool {
+		return entries[i].capturedAt.Before(entries[j].capturedAt)
+	})
+	for _, e := range entries[:len(entries)-buildLogCacheMaxEntries] {
+		p.buildLogCache.Delete(e.key)
+	}
+}
+
 // evictStaleBuildLogs drops buildLogCache entries older than buildLogCacheTTL.
 func (p *K8sProvider) evictStaleBuildLogs() {
 	now := time.Now()