From 258f60dfc46da1b061ac14170ce524c3dadacfe3 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 4 Jun 2026 00:14:46 +0530 Subject: [PATCH] fix(k8s): cap buildLogCache size to bound memory on failure bursts (bug-bash #2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit buildLogCache snapshots kaniko build logs of FAILED builds for the failure autopsy. It was TTL-bounded (30m) and swept stale entries on each new failure — but had NO size cap. A burst of failing builds inside one TTL window (a broken base image, a wedged registry, a fork-bomb of bad deploys) accumulates one ≤200-line snapshot per failure with no ceiling: unbounded memory growth on a long-lived api pod. Add buildLogCacheMaxEntries=256 and capBuildLogCacheSize(), called evict-after-store from snapshotBuildLogs. When a store pushes the live count past the cap, evict oldest-first (a recent failure's autopsy is far likelier to be read than one from hundreds of failures ago). sync.Map has no length, so we snapshot keys+times in one Range pass, sort by capturedAt, and delete the excess oldest. Tests: TestCapBuildLogCacheSize_EvictsOldestOverCap (over-cap → newest survive, oldest evicted), TestCapBuildLogCacheSize_NoOpUnderCap. Both new funcs 100% covered; verified locally. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../compute/k8s/build_log_cache_test.go | 59 +++++++++++++++++++ internal/providers/compute/k8s/client.go | 43 ++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/internal/providers/compute/k8s/build_log_cache_test.go b/internal/providers/compute/k8s/build_log_cache_test.go index 7209a36a..c94f3637 100644 --- a/internal/providers/compute/k8s/build_log_cache_test.go +++ b/internal/providers/compute/k8s/build_log_cache_test.go @@ -14,6 +14,7 @@ package k8s import ( "context" + "fmt" "testing" "time" @@ -106,3 +107,61 @@ func TestEvictStaleBuildLogs_KeepsFreshDropsStale(t *testing.T) { t.Error("stale entry was not evicted") } } + +// TestCapBuildLogCacheSize_EvictsOldestOverCap verifies the size cap: a burst +// of fresh (non-stale) snapshots beyond buildLogCacheMaxEntries is bounded by +// evicting the OLDEST entries first, while the most-recent cap-many survive. +// This is the memory-leak guard the TTL sweep alone does not provide — many +// failures inside one TTL window would otherwise grow the map without bound. +func TestCapBuildLogCacheSize_EvictsOldestOverCap(t *testing.T) { + p := &K8sProvider{clientset: fake.NewSimpleClientset()} + + base := time.Now() + overflow := 10 + total := buildLogCacheMaxEntries + overflow + // All fresh (well within TTL) so eviction is driven purely by the size cap, + // not by staleness. Older capturedAt for lower indexes → those evict first. + for i := 0; i < total; i++ { + key := fmt.Sprintf("app-%04d", i) + p.buildLogCache.Store(key, &buildLogCacheEntry{ + lines: []string{key}, + capturedAt: base.Add(time.Duration(i) * time.Second), + }) + } + + p.capBuildLogCacheSize() + + var count int + p.buildLogCache.Range(func(_, _ any) bool { count++; return true }) + if count != buildLogCacheMaxEntries { + t.Fatalf("cache not capped: got %d entries, want %d", count, buildLogCacheMaxEntries) + } + + // The oldest `overflow` entries (lowest indexes) must be gone; the newest + // must remain. + for i := 0; i < overflow; i++ { + if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", i)); ok { + t.Errorf("oldest entry app-%04d should have been evicted", i) + } + } + if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", total-1)); !ok { + t.Errorf("newest entry app-%04d was wrongly evicted", total-1) + } +} + +// TestCapBuildLogCacheSize_NoOpUnderCap verifies the cap is a no-op when the +// cache holds fewer than buildLogCacheMaxEntries entries (the common case). +func TestCapBuildLogCacheSize_NoOpUnderCap(t *testing.T) { + p := &K8sProvider{clientset: fake.NewSimpleClientset()} + + p.buildLogCache.Store("only", &buildLogCacheEntry{ + lines: []string{"x"}, + capturedAt: time.Now(), + }) + + p.capBuildLogCacheSize() + + if _, ok := p.buildLogCache.Load("only"); !ok { + t.Error("under-cap entry was wrongly evicted") + } +} diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go index a0576671..72f936b5 100644 --- a/internal/providers/compute/k8s/client.go +++ b/internal/providers/compute/k8s/client.go @@ -15,6 +15,7 @@ import ( "log/slog" "os" "path/filepath" + "sort" "strings" "sync" "time" @@ -417,6 +418,15 @@ type buildLogCacheEntry struct { // failed build. const buildLogCacheTTL = 30 * time.Minute +// buildLogCacheMaxEntries caps the number of build-log snapshots held at once. +// The TTL alone is not a memory bound: a burst of failing builds inside one +// 30-minute window (a broken base image, a wedged registry) would otherwise +// accumulate one ≤buildLogMaxLines-line snapshot per failure with no ceiling. +// When a store pushes the cache past this cap we evict the oldest entries — +// the autopsy for a recent failure is far likelier to be read than one from +// hundreds of failures ago. 256 entries × 200 lines is a comfortable bound. +const buildLogCacheMaxEntries = 256 + // New creates a K8sProvider targeting the given namespace. // buildCtx is optional — when unset, builds fall back to the 1 MiB Secret path. // Returns an error if the k8s clientset cannot be initialized; the caller @@ -1288,10 +1298,43 @@ func (p *K8sProvider) snapshotBuildLogs(ctx context.Context, ns, appID, jobName lines: lines, capturedAt: time.Now(), }) + // Evict-after-store: the TTL sweep above only drops *expired* entries, so a + // burst of failures inside one TTL window still grows the map unbounded. + // Cap the live entry count, evicting oldest-first. + p.capBuildLogCacheSize() slog.Info("k8s.snapshotBuildLogs: captured failed build logs for autopsy", "app_id", appID, "lines", len(lines)) } +// capBuildLogCacheSize bounds buildLogCache to buildLogCacheMaxEntries, evicting +// the oldest snapshots first. sync.Map has no length, so we snapshot keys + +// capture times in one Range pass; if over the cap we sort by capturedAt and +// delete the excess oldest. A concurrent Store between the Range and the +// Deletes can only leave us at most a few entries over the cap until the next +// failure re-runs this — an acceptable slop for a best-effort autopsy cache. +func (p *K8sProvider) capBuildLogCacheSize() { + type keyedEntry struct { + key any + capturedAt time.Time + } + var entries []keyedEntry + p.buildLogCache.Range(func(k, v any) bool { + if entry, ok := v.(*buildLogCacheEntry); ok { + entries = append(entries, keyedEntry{key: k, capturedAt: entry.capturedAt}) + } + return true + }) + if len(entries) <= buildLogCacheMaxEntries { + return + } + sort.Slice(entries, func(i, j int) bool { + return entries[i].capturedAt.Before(entries[j].capturedAt) + }) + for _, e := range entries[:len(entries)-buildLogCacheMaxEntries] { + p.buildLogCache.Delete(e.key) + } +} + // evictStaleBuildLogs drops buildLogCache entries older than buildLogCacheTTL. func (p *K8sProvider) evictStaleBuildLogs() { now := time.Now()