Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,34 @@ State changes are detected via **fsnotify** on the VM index file (sub-second lat

This ensures blobs referenced by running VMs or saved snapshots are never deleted.

### Log Output

Every collected item is logged at INFO level with a structured `key=value` payload under `gc.<module>`, and a summary line ends the cycle. Sample:

```
INFO gc.snapshot collected id=XEOU... name=ubuntu-hot-testing:v1 bytes=3221225472 last_accessed=2026-04-12T10:30:00Z reason=lru-age
INFO gc.snapshot collected id=2GQVEA... name= bytes=0 last_accessed=never reason=orphan
INFO gc.cloudhypervisor collected id=ABC123 runDir=/var/lib/cocoon/run/cloudhypervisor/ABC123 logDir=/var/log/cocoon/cloudhypervisor/ABC123 reason=orphan-runDir
INFO gc.oci collected blob=b40150c1c2717d... reason=unreferenced
INFO gc.cni collected id=JKLMN netns=cocoon-JKLMN nics=2 reason=orphan
INFO gc.bridge collected id=MNOPQ iface=btMNOPQ-0 reason=orphan-tap
INFO gc.Run completed: cloudhypervisor=1 cni=1 oci=4 snapshot=3 (failures: 0, duration: 230ms)
```

Filter with `awk` / `grep`:

```bash
journalctl -u cocoon-gc.service --since today | grep "gc.snapshot.*reason=lru-"
journalctl -u cocoon-gc.service --since today | awk '/gc.Run completed/'
```

Reasons:
- **snapshot**: `orphan` (dataDir without DB record), `stale-pending` (Create crashed >24h ago), `lru-all` / `lru-age` / `lru-keep` / `lru-size` (multi-criterion uses `+` joiner)
- **cloudhypervisor / firecracker**: `orphan-runDir`, `orphan-logDir`, `stale-creating`
- **images (oci, cloudimg)**: `unreferenced`
- **cni**: `orphan` (netns without active VM)
- **bridge**: `orphan-tap`

### Snapshot LRU Eviction

Bare `cocoon gc` only reclaims **orphans** (on-disk data with no DB record) and **stale pending** records (crashed mid-Create, older than 24h). To also evict healthy snapshots by access recency, pass `--snapshot`:
Expand Down
7 changes: 1 addition & 6 deletions cmd/others/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"fmt"

"github.com/docker/go-units"
"github.com/projecteru2/core/log"
"github.com/spf13/cobra"

cmdcore "github.com/cocoonstack/cocoon/cmd/core"
Expand Down Expand Up @@ -56,11 +55,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error {
netProvider.RegisterGC(o)
gc.Register(o, bridge.GCModule(conf.RootDir))
snapBackend.RegisterGC(o)
if err := o.Run(ctx); err != nil {
return err
}
log.WithFunc("cmd.gc").Info(ctx, "GC completed")
return nil
return o.Run(ctx)
}

func (h Handler) Version(_ *cobra.Command, _ []string) error {
Expand Down
10 changes: 7 additions & 3 deletions gc/module.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ type Module[S any] struct {
Resolve func(ctx context.Context, snap S, others map[string]any) []string

// Collect removes the given IDs (called while the lock is held).
Collect func(ctx context.Context, ids []string) error
Collect func(ctx context.Context, ids []string, snap S) error
}

// Module[S] implements runner — internal to the gc package.
Expand All @@ -37,6 +37,10 @@ func (m Module[S]) resolveTargets(ctx context.Context, snap any, others map[stri
return m.Resolve(ctx, typed, others)
}

func (m Module[S]) collect(ctx context.Context, ids []string) error {
return m.Collect(ctx, ids)
func (m Module[S]) collect(ctx context.Context, ids []string, snap any) error {
typed, ok := snap.(S)
if !ok {
return nil
}
return m.Collect(ctx, ids, typed)
}
28 changes: 27 additions & 1 deletion gc/orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ import (
"context"
"errors"
"fmt"
"maps"
"slices"
"strings"
"time"

"github.com/projecteru2/core/log"
)
Expand All @@ -25,6 +28,7 @@ func Register[S any](o *Orchestrator, m Module[S]) {
// Run executes one GC cycle: lock all modules, snapshot, resolve, collect.
// Fail-closed: any busy lock aborts the cycle so cross-module decisions stay consistent.
func (o *Orchestrator) Run(ctx context.Context) error {
start := time.Now()
logger := log.WithFunc("gc.Run")

// Acquire all locks up front; hold until GC finishes.
Expand Down Expand Up @@ -75,14 +79,36 @@ func (o *Orchestrator) Run(ctx context.Context) error {

// Phase 3: collect (skip modules with no targets).
var errs []error
summary := make(map[string]int, len(locked))
failures := 0
for _, m := range locked {
ids := targets[m.getName()]
if len(ids) == 0 {
continue
}
if err := m.collect(ctx, ids); err != nil {
if err := m.collect(ctx, ids, snapshots[m.getName()]); err != nil {
failures++
errs = append(errs, fmt.Errorf("gc %s: %w", m.getName(), err))
}
summary[m.getName()] = len(ids)
}
logger.Infof(ctx, "completed: %s (failures: %d, duration: %s)",
formatSummary(summary), failures, time.Since(start).Truncate(time.Millisecond))
return errors.Join(errs...)
}

// formatSummary renders the per-module collection counts as `m1=N m2=M`, sorted by module name.
func formatSummary(s map[string]int) string {
if len(s) == 0 {
return "nothing to collect"
}
keys := slices.Sorted(maps.Keys(s))
var sb strings.Builder
for i, k := range keys {
if i > 0 {
sb.WriteByte(' ')
}
fmt.Fprintf(&sb, "%s=%d", k, s[k])
}
return sb.String()
}
22 changes: 22 additions & 0 deletions gc/orchestrator_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package gc

import "testing"

func TestFormatSummary(t *testing.T) {
cases := []struct {
name string
in map[string]int
want string
}{
{"empty", map[string]int{}, "nothing to collect"},
{"single", map[string]int{"snapshot": 3}, "snapshot=3"},
{"sorted", map[string]int{"snapshot": 3, "cloudhypervisor": 1, "oci": 12}, "cloudhypervisor=1 oci=12 snapshot=3"},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
if got := formatSummary(tt.in); got != tt.want {
t.Errorf("formatSummary(%v) = %q, want %q", tt.in, got, tt.want)
}
})
}
}
2 changes: 1 addition & 1 deletion gc/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ type runner interface {
getLocker() lock.Locker
readSnapshot(ctx context.Context) (any, error)
resolveTargets(ctx context.Context, snap any, others map[string]any) []string
collect(ctx context.Context, ids []string) error
collect(ctx context.Context, ids []string, snap any) error
}
36 changes: 29 additions & 7 deletions hypervisor/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ package hypervisor
import (
"context"
"errors"
"fmt"
"maps"
"slices"
"time"

"github.com/projecteru2/core/log"

"github.com/cocoonstack/cocoon/gc"
"github.com/cocoonstack/cocoon/types"
"github.com/cocoonstack/cocoon/utils"
Expand All @@ -19,6 +22,7 @@ type VMGCSnapshot struct {
staleCreate []string
runDirs []string
logDirs []string
reasons map[string]string
}

func (s VMGCSnapshot) UsedBlobIDs() map[string]struct{} { return s.blobIDs }
Expand All @@ -31,7 +35,7 @@ func (b *Backend) BuildGCModule() gc.Module[VMGCSnapshot] {
Name: b.Typ,
Locker: b.Locker,
ReadDB: func(_ context.Context) (VMGCSnapshot, error) {
var snap VMGCSnapshot
snap := VMGCSnapshot{reasons: make(map[string]string)}
cutoff := time.Now().Add(-CreatingStateGCGrace)
if err := b.DB.ReadRaw(func(idx *VMIndex) error {
snap.blobIDs = make(map[string]struct{}, len(idx.VMs))
Expand Down Expand Up @@ -64,11 +68,26 @@ func (b *Backend) BuildGCModule() gc.Module[VMGCSnapshot] {
reserved := map[string]struct{}{"db": {}}
runOrphans := utils.FilterUnreferenced(snap.runDirs, snap.vmIDs, reserved)
logOrphans := utils.FilterUnreferenced(snap.logDirs, snap.vmIDs, reserved)
for _, id := range snap.staleCreate {
snap.reasons[id] = "stale-creating"
}
for _, id := range runOrphans {
if _, ok := snap.reasons[id]; !ok {
snap.reasons[id] = "orphan-runDir"
}
}
for _, id := range logOrphans {
if _, ok := snap.reasons[id]; !ok {
snap.reasons[id] = "orphan-logDir"
}
}
candidates := slices.Concat(runOrphans, logOrphans, snap.staleCreate)
slices.Sort(candidates)
return slices.Compact(candidates)
},
Collect: b.GCCollect,
Collect: func(ctx context.Context, ids []string, snap VMGCSnapshot) error {
return b.gcCollect(ctx, ids, snap)
},
}
}

Expand All @@ -81,9 +100,9 @@ func (b *Backend) WatchPath() string {
return b.Conf.IndexFile()
}

// GCCollect kills leftover hypervisor processes and removes orphan dirs/records.
// Runs under the GC orchestrator's flock — uses lock-free DB access.
func (b *Backend) GCCollect(ctx context.Context, ids []string) error {
// gcCollect kills leftover hypervisor processes and removes orphan dirs/records under the orchestrator's flock.
func (b *Backend) gcCollect(ctx context.Context, ids []string, snap VMGCSnapshot) error {
logger := log.WithFunc("gc." + b.Typ)
var errs []error
for _, id := range ids {
runDir, logDir := b.Conf.VMRunDir(id), b.Conf.VMLogDir(id)
Expand All @@ -95,11 +114,14 @@ func (b *Backend) GCCollect(ctx context.Context, ids []string) error {
})
b.killOrphanProcess(ctx, runDir)
if err := RemoveVMDirs(runDir, logDir); err != nil {
errs = append(errs, err)
errs = append(errs, fmt.Errorf("remove vm %s: %w", id, err))
continue
}
logger.Infof(ctx, "collected id=%s runDir=%s logDir=%s reason=%s",
id, runDir, logDir, snap.reasons[id])
}
if err := b.CleanStalePlaceholders(ctx, ids); err != nil {
errs = append(errs, err)
errs = append(errs, fmt.Errorf("clean stale placeholders: %w", err))
}
return errors.Join(errs...)
}
Expand Down
4 changes: 2 additions & 2 deletions images/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ func BuildGCModule[I any](cfg GCModuleConfig[I]) gc.Module[ImageGCSnapshot] {
slices.Sort(candidates)
return slices.Compact(candidates)
},
Collect: func(ctx context.Context, ids []string) error {
return GCCollectBlobs(ctx, cfg.TempDir, cfg.DirOnly, ids, cfg.Removers...)
Collect: func(ctx context.Context, ids []string, _ ImageGCSnapshot) error {
return GCCollectBlobs(ctx, cfg.Name, cfg.TempDir, cfg.DirOnly, ids, cfg.Removers...)
},
}
}
14 changes: 11 additions & 3 deletions images/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package images
import (
"context"
"errors"
"fmt"
"io/fs"
"os"
"strings"
Expand Down Expand Up @@ -104,17 +105,24 @@ func GCStaleTemp(ctx context.Context, dir string, dirOnly bool) []error {
})
}

// GCCollectBlobs removes temp files and blob artifacts by hex ID.
// GCCollectBlobs removes temp files and blob artifacts by hex ID; module names the gc subsystem for log routing.
// removers are called for each hex; fs.ErrNotExist errors are ignored.
func GCCollectBlobs(ctx context.Context, tempDir string, dirOnly bool, ids []string, removers ...func(string) error) error {
func GCCollectBlobs(ctx context.Context, module, tempDir string, dirOnly bool, ids []string, removers ...func(string) error) error {
logger := log.WithFunc("gc." + module)
var errs []error
errs = append(errs, GCStaleTemp(ctx, tempDir, dirOnly)...)
for _, hex := range ids {
var blobErr error
for _, rm := range removers {
if err := rm(hex); err != nil && !errors.Is(err, fs.ErrNotExist) {
errs = append(errs, err)
blobErr = errors.Join(blobErr, err)
}
}
if blobErr != nil {
errs = append(errs, fmt.Errorf("remove blob %s: %w", hex, blobErr))
continue
}
logger.Infof(ctx, "collected blob=%s reason=unreferenced", hex)
}
return errors.Join(errs...)
}
Expand Down
6 changes: 3 additions & 3 deletions network/bridge/gc_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] {
slices.Sort(orphans)
return orphans
},
Collect: func(ctx context.Context, prefixes []string) error {
logger := log.WithFunc("bridge.gc.Collect")
Collect: func(ctx context.Context, prefixes []string, _ bridgeSnapshot) error {
logger := log.WithFunc("gc.bridge")

orphanSet := make(map[string]struct{}, len(prefixes))
for _, p := range prefixes {
Expand All @@ -89,7 +89,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] {
if err := netlink.LinkDel(l); err != nil {
logger.Warnf(ctx, "delete orphan TAP %s: %v", name, err)
} else {
logger.Infof(ctx, "collected orphan TAP %s", name)
logger.Infof(ctx, "collected id=%s iface=%s reason=orphan-tap", prefix, name)
}
}
return nil
Expand Down
2 changes: 1 addition & 1 deletion network/bridge/gc_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func GCModule(rootDir string) gc.Module[bridgeSnapshot] {
Resolve: func(_ context.Context, _ bridgeSnapshot, _ map[string]any) []string {
return nil
},
Collect: func(_ context.Context, _ []string) error {
Collect: func(_ context.Context, _ []string, _ bridgeSnapshot) error {
return nil
},
}
Expand Down
9 changes: 8 additions & 1 deletion network/cni/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"slices"
"strings"

"github.com/projecteru2/core/log"

"github.com/cocoonstack/cocoon/gc"
)

Expand Down Expand Up @@ -62,7 +64,8 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] {
slices.Sort(orphans)
return orphans
},
Collect: func(ctx context.Context, ids []string) error {
Collect: func(ctx context.Context, ids []string, _ cniSnapshot) error {
logger := log.WithFunc("gc.cni")
var errs []error
for _, vmID := range ids {
// 1. Read CNI records for this VM (lockless — orchestrator holds flock).
Expand All @@ -82,6 +85,7 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] {
nsName := netnsName(vmID)
if err := deleteNetns(ctx, nsName); err != nil && !errors.Is(err, fs.ErrNotExist) {
errs = append(errs, fmt.Errorf("remove netns %s: %w", nsName, err))
continue
}

// 4. Clean DB records (lockless write).
Expand All @@ -95,8 +99,11 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] {
return nil
}); err != nil {
errs = append(errs, fmt.Errorf("clean DB for %s: %w", vmID, err))
continue
}
}
logger.Infof(ctx, "collected id=%s netns=%s nics=%d reason=orphan",
vmID, nsName, len(records))
}
return errors.Join(errs...)
},
Expand Down
Loading
Loading