Skip to content
Merged
59 changes: 59 additions & 0 deletions internal/providers/compute/k8s/build_log_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ package k8s

import (
"context"
"fmt"
"testing"
"time"

Expand Down Expand Up @@ -106,3 +107,61 @@ func TestEvictStaleBuildLogs_KeepsFreshDropsStale(t *testing.T) {
t.Error("stale entry was not evicted")
}
}

// TestCapBuildLogCacheSize_EvictsOldestOverCap verifies the size cap: a burst
// of fresh (non-stale) snapshots beyond buildLogCacheMaxEntries is bounded by
// evicting the OLDEST entries first, while the most-recent cap-many survive.
// This is the memory-leak guard the TTL sweep alone does not provide — many
// failures inside one TTL window would otherwise grow the map without bound.
func TestCapBuildLogCacheSize_EvictsOldestOverCap(t *testing.T) {
p := &K8sProvider{clientset: fake.NewSimpleClientset()}

base := time.Now()
overflow := 10
total := buildLogCacheMaxEntries + overflow
// All fresh (well within TTL) so eviction is driven purely by the size cap,
// not by staleness. Older capturedAt for lower indexes → those evict first.
for i := 0; i < total; i++ {
key := fmt.Sprintf("app-%04d", i)
p.buildLogCache.Store(key, &buildLogCacheEntry{
lines: []string{key},
capturedAt: base.Add(time.Duration(i) * time.Second),
})
}

p.capBuildLogCacheSize()

var count int
p.buildLogCache.Range(func(_, _ any) bool { count++; return true })
if count != buildLogCacheMaxEntries {
t.Fatalf("cache not capped: got %d entries, want %d", count, buildLogCacheMaxEntries)
}

// The oldest `overflow` entries (lowest indexes) must be gone; the newest
// must remain.
for i := 0; i < overflow; i++ {
if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", i)); ok {
t.Errorf("oldest entry app-%04d should have been evicted", i)
}
}
if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", total-1)); !ok {
t.Errorf("newest entry app-%04d was wrongly evicted", total-1)
}
}

// TestCapBuildLogCacheSize_NoOpUnderCap verifies the cap is a no-op when the
// cache holds fewer than buildLogCacheMaxEntries entries (the common case).
func TestCapBuildLogCacheSize_NoOpUnderCap(t *testing.T) {
p := &K8sProvider{clientset: fake.NewSimpleClientset()}

p.buildLogCache.Store("only", &buildLogCacheEntry{
lines: []string{"x"},
capturedAt: time.Now(),
})

p.capBuildLogCacheSize()

if _, ok := p.buildLogCache.Load("only"); !ok {
t.Error("under-cap entry was wrongly evicted")
}
}
43 changes: 43 additions & 0 deletions internal/providers/compute/k8s/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"log/slog"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -417,6 +418,15 @@ type buildLogCacheEntry struct {
// failed build.
const buildLogCacheTTL = 30 * time.Minute

// buildLogCacheMaxEntries caps the number of build-log snapshots held at once.
// The TTL alone is not a memory bound: a burst of failing builds inside one
// 30-minute window (a broken base image, a wedged registry) would otherwise
// accumulate one ≤buildLogMaxLines-line snapshot per failure with no ceiling.
// When a store pushes the cache past this cap we evict the oldest entries —
// the autopsy for a recent failure is far likelier to be read than one from
// hundreds of failures ago. 256 entries × 200 lines is a comfortable bound.
const buildLogCacheMaxEntries = 256

// New creates a K8sProvider targeting the given namespace.
// buildCtx is optional — when unset, builds fall back to the 1 MiB Secret path.
// Returns an error if the k8s clientset cannot be initialized; the caller
Expand Down Expand Up @@ -1288,10 +1298,43 @@ func (p *K8sProvider) snapshotBuildLogs(ctx context.Context, ns, appID, jobName
lines: lines,
capturedAt: time.Now(),
})
// Evict-after-store: the TTL sweep above only drops *expired* entries, so a
// burst of failures inside one TTL window still grows the map unbounded.
// Cap the live entry count, evicting oldest-first.
p.capBuildLogCacheSize()
slog.Info("k8s.snapshotBuildLogs: captured failed build logs for autopsy",
"app_id", appID, "lines", len(lines))
}

// capBuildLogCacheSize bounds buildLogCache to buildLogCacheMaxEntries, evicting
// the oldest snapshots first. sync.Map has no length, so we snapshot keys +
// capture times in one Range pass; if over the cap we sort by capturedAt and
// delete the excess oldest. A concurrent Store between the Range and the
// Deletes can only leave us at most a few entries over the cap until the next
// failure re-runs this — an acceptable slop for a best-effort autopsy cache.
func (p *K8sProvider) capBuildLogCacheSize() {
type keyedEntry struct {
key any
capturedAt time.Time
}
var entries []keyedEntry
p.buildLogCache.Range(func(k, v any) bool {
if entry, ok := v.(*buildLogCacheEntry); ok {
entries = append(entries, keyedEntry{key: k, capturedAt: entry.capturedAt})
}
return true
})
if len(entries) <= buildLogCacheMaxEntries {
return
}
sort.Slice(entries, func(i, j int) bool {
return entries[i].capturedAt.Before(entries[j].capturedAt)
})
for _, e := range entries[:len(entries)-buildLogCacheMaxEntries] {
p.buildLogCache.Delete(e.key)
}
}

// evictStaleBuildLogs drops buildLogCache entries older than buildLogCacheTTL.
func (p *K8sProvider) evictStaleBuildLogs() {
now := time.Now()
Expand Down
Loading