From af9f131467f72a155cd1ab3a0576bc6bee30de27 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 01:58:36 +0800 Subject: [PATCH 01/13] feat(metering): add lifecycle event recorder package New subpkg `metering` for append-only VM/snapshot lifecycle endpoints (vm.compute.{start,stop}, vm.storage.{start,stop}, snap.storage.{start,stop}). Exposes raw signals only; tenant attribution and pricing remain in upper layers. - Entry / Shape types, Kind + Reason constants - Recorder interface (Emit-only, fire-and-forget) - FileRecorder appends JSON lines under sync.Mutex, falls back to NopRecorder on open failure so callers never see nil - NopRecorder for tests and unconfigured nodes - CaptureRecorder for in-process assertions No hypervisor integration yet; subsequent commits will wire the 6 hook points in hypervisor/state.go, hypervisor/create.go, snapshot/, etc. --- metering/capture.go | 28 ++++++++++++ metering/capture_test.go | 47 ++++++++++++++++++++ metering/file.go | 45 +++++++++++++++++++ metering/file_test.go | 92 +++++++++++++++++++++++++++++++++++++++ metering/metering.go | 64 +++++++++++++++++++++++++++ metering/metering_test.go | 55 +++++++++++++++++++++++ 6 files changed, 331 insertions(+) create mode 100644 metering/capture.go create mode 100644 metering/capture_test.go create mode 100644 metering/file.go create mode 100644 metering/file_test.go create mode 100644 metering/metering.go create mode 100644 metering/metering_test.go diff --git a/metering/capture.go b/metering/capture.go new file mode 100644 index 00000000..3e877a60 --- /dev/null +++ b/metering/capture.go @@ -0,0 +1,28 @@ +package metering + +import ( + "context" + "sync" +) + +// CaptureRecorder accumulates entries in memory; intended for tests that assert emit sequences. +type CaptureRecorder struct { + mu sync.Mutex + entries []Entry +} + +// Emit appends e to the buffer. +func (r *CaptureRecorder) Emit(_ context.Context, e Entry) { + r.mu.Lock() + defer r.mu.Unlock() + r.entries = append(r.entries, e) +} + +// Entries returns a snapshot copy of accumulated entries. +func (r *CaptureRecorder) Entries() []Entry { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]Entry, len(r.entries)) + copy(out, r.entries) + return out +} diff --git a/metering/capture_test.go b/metering/capture_test.go new file mode 100644 index 00000000..489f1af1 --- /dev/null +++ b/metering/capture_test.go @@ -0,0 +1,47 @@ +package metering + +import ( + "sync" + "testing" +) + +func TestCaptureRecorderBasic(t *testing.T) { + var r CaptureRecorder + ctx := t.Context() + r.Emit(ctx, Entry{Kind: KindVMComputeStart, VMID: "a"}) + r.Emit(ctx, Entry{Kind: KindVMComputeStop, VMID: "a"}) + got := r.Entries() + if len(got) != 2 { + t.Fatalf("got %d entries, want 2", len(got)) + } + if got[0].Kind != KindVMComputeStart || got[1].Kind != KindVMComputeStop { + t.Errorf("got kinds %v %v", got[0].Kind, got[1].Kind) + } +} + +func TestCaptureRecorderEntriesIsCopy(t *testing.T) { + // Mutating the returned slice must not affect subsequent reads. + var r CaptureRecorder + r.Emit(t.Context(), Entry{VMID: "a"}) + got := r.Entries() + got[0].VMID = "tampered" + if again := r.Entries(); again[0].VMID != "a" { + t.Errorf("Entries() must return a copy; got %q after mutation", again[0].VMID) + } +} + +func TestCaptureRecorderConcurrent(t *testing.T) { + var r CaptureRecorder + ctx := t.Context() + const N = 200 + var wg sync.WaitGroup + for range N { + wg.Go(func() { + r.Emit(ctx, Entry{Kind: KindVMComputeStart, VMID: "vm"}) + }) + } + wg.Wait() + if got := len(r.Entries()); got != N { + t.Errorf("got %d entries, want %d", got, N) + } +} diff --git a/metering/file.go b/metering/file.go new file mode 100644 index 00000000..4c534d45 --- /dev/null +++ b/metering/file.go @@ -0,0 +1,45 @@ +package metering + +import ( + "context" + "encoding/json" + "os" + "sync" + + "github.com/projecteru2/core/log" +) + +// FileRecorder appends JSON-encoded entries (one per line) to a file under sync.Mutex. +type FileRecorder struct { + mu sync.Mutex + f *os.File +} + +// NewFileRecorder opens path append-only; on open failure logs a warning and returns NopRecorder so callers never see nil. +func NewFileRecorder(ctx context.Context, path string) Recorder { + f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600) //nolint:gosec // internal runtime path + if err != nil { + log.WithFunc("metering.NewFileRecorder").Warnf(ctx, "open %s: %v; metering disabled", path, err) + return NopRecorder{} + } + return &FileRecorder{f: f} +} + +// Emit marshals e and appends one line; write errors are logged and swallowed so the caller's state machine is never blocked. +func (r *FileRecorder) Emit(ctx context.Context, e Entry) { + data, err := json.Marshal(e) + if err != nil { + log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "marshal entry: %v", err) + return + } + r.mu.Lock() + defer r.mu.Unlock() + // Two writes are safe under mu.Lock; without the mutex the newline could interleave with another emit. + if _, err := r.f.Write(data); err != nil { + log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "write entry: %v", err) + return + } + if _, err := r.f.WriteString("\n"); err != nil { + log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "write newline: %v", err) + } +} diff --git a/metering/file_test.go b/metering/file_test.go new file mode 100644 index 00000000..74ef2b28 --- /dev/null +++ b/metering/file_test.go @@ -0,0 +1,92 @@ +package metering + +import ( + "bufio" + "encoding/json" + "os" + "path/filepath" + "sync" + "testing" + "time" +) + +func TestFileRecorderRoundTrip(t *testing.T) { + ctx := t.Context() + path := filepath.Join(t.TempDir(), "ledger.jsonl") + r := NewFileRecorder(ctx, path) + now := time.Now().UTC().Truncate(time.Microsecond) + r.Emit(ctx, Entry{ + EmittedAt: now, + Kind: KindVMComputeStart, + VMID: "vm1", + Reason: ReasonBoot, + Shape: Shape{CPU: 4, MemBytes: 1 << 30}, + }) + r.Emit(ctx, Entry{ + EmittedAt: now.Add(time.Second), + Kind: KindVMComputeStop, + VMID: "vm1", + Reason: ReasonStopUser, + }) + + got := readEntries(t, path) + if len(got) != 2 { + t.Fatalf("got %d entries, want 2", len(got)) + } + if got[0].Kind != KindVMComputeStart || got[1].Kind != KindVMComputeStop { + t.Errorf("got kinds %v %v", got[0].Kind, got[1].Kind) + } + if got[0].Shape.CPU != 4 { + t.Errorf("got CPU=%d, want 4", got[0].Shape.CPU) + } +} + +func TestFileRecorderConcurrent(t *testing.T) { + ctx := t.Context() + path := filepath.Join(t.TempDir(), "ledger.jsonl") + r := NewFileRecorder(ctx, path) + const N = 200 + var wg sync.WaitGroup + for i := range N { + wg.Go(func() { + r.Emit(ctx, Entry{Kind: KindVMComputeStart, VMID: "vm", Shape: Shape{CPU: i}}) + }) + } + wg.Wait() + + got := readEntries(t, path) + if len(got) != N { + t.Errorf("got %d lines, want %d", len(got), N) + } +} + +func TestNewFileRecorderFallback(t *testing.T) { + // Parent dir doesn't exist → OpenFile fails → fallback to NopRecorder. + r := NewFileRecorder(t.Context(), filepath.Join(t.TempDir(), "missing-subdir", "ledger.jsonl")) + if _, ok := r.(NopRecorder); !ok { + t.Errorf("got %T, want NopRecorder", r) + } +} + +func readEntries(t *testing.T, path string) []Entry { + t.Helper() + f, err := os.Open(path) //nolint:gosec // test-controlled path + if err != nil { + t.Fatalf("open: %v", err) + } + defer f.Close() //nolint:errcheck + + var out []Entry + sc := bufio.NewScanner(f) + for sc.Scan() { + var e Entry + if err := json.Unmarshal(sc.Bytes(), &e); err != nil { + t.Fatalf("unmarshal %q: %v", sc.Text(), err) + } + out = append(out, e) + } + if err := sc.Err(); err != nil { + t.Fatalf("scan: %v", err) + } + return out +} diff --git a/metering/metering.go b/metering/metering.go new file mode 100644 index 00000000..7fb989cd --- /dev/null +++ b/metering/metering.go @@ -0,0 +1,64 @@ +// Package metering emits append-only VM/snapshot lifecycle endpoints +// for downstream usage/billing pipelines. cocoon emits raw signals only; +// tenant attribution and pricing are layered above. +package metering + +import ( + "context" + "time" +) + +// Kind identifies a lifecycle endpoint; downstream pairs *.start with *.stop by id. +type Kind string + +// Reason annotates why an endpoint was emitted. +type Reason string + +const ( + KindVMComputeStart Kind = "vm.compute.start" + KindVMComputeStop Kind = "vm.compute.stop" + KindVMStorageStart Kind = "vm.storage.start" + KindVMStorageStop Kind = "vm.storage.stop" + KindSnapStorageStart Kind = "snap.storage.start" + KindSnapStorageStop Kind = "snap.storage.stop" + + ReasonBoot Reason = "boot" + ReasonRestart Reason = "restart" + ReasonClone Reason = "clone" + ReasonRestore Reason = "restore" + ReasonHibernateWake Reason = "hibernate-wake" + ReasonStopUser Reason = "stop-user" + ReasonStopCrash Reason = "stop-crash" + ReasonVMRemove Reason = "vm-rm" + ReasonSnapRemove Reason = "snap-rm" +) + +// Shape is the resource snapshot at the moment an Entry is emitted. +type Shape struct { + CPU int `json:"cpu,omitempty"` + MemBytes int64 `json:"mem_bytes,omitempty"` + StorageBytes int64 `json:"storage_bytes,omitempty"` +} + +// Entry is one append-only lifecycle event. +type Entry struct { + Kind Kind `json:"kind"` + VMID string `json:"vm_id,omitempty"` + SnapshotID string `json:"snapshot_id,omitempty"` + SourceSnapshotID string `json:"source_snapshot_id,omitempty"` + Reason Reason `json:"reason,omitempty"` + Hypervisor string `json:"hypervisor,omitempty"` + Shape Shape `json:"shape"` + EmittedAt time.Time `json:"emitted_at"` +} + +// Recorder accepts lifecycle entries; implementations must be safe for concurrent use. +type Recorder interface { + Emit(context.Context, Entry) +} + +// NopRecorder discards every entry; zero value is usable. +type NopRecorder struct{} + +// Emit is a no-op. +func (NopRecorder) Emit(context.Context, Entry) {} diff --git a/metering/metering_test.go b/metering/metering_test.go new file mode 100644 index 00000000..be34c77b --- /dev/null +++ b/metering/metering_test.go @@ -0,0 +1,55 @@ +package metering + +import ( + "encoding/json" + "testing" + "time" +) + +func TestEntryJSONRoundTrip(t *testing.T) { + in := Entry{ + EmittedAt: time.Date(2026, 5, 20, 1, 2, 3, 0, time.UTC), + Kind: KindVMComputeStart, + VMID: "vm1", + Reason: ReasonBoot, + Hypervisor: "ch", + Shape: Shape{CPU: 4, MemBytes: 1 << 30, StorageBytes: 10 << 30}, + } + data, err := json.Marshal(in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + var out Entry + if err := json.Unmarshal(data, &out); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if out != in { + t.Errorf("round-trip diverged:\n got: %#v\nwant: %#v", out, in) + } +} + +func TestKindWireFormat(t *testing.T) { + // Wire-format strings are consumed by external BQ schema; renaming any + // of these is a breaking change for downstream consumers. + cases := []struct { + got, want string + }{ + {string(KindVMComputeStart), "vm.compute.start"}, + {string(KindVMComputeStop), "vm.compute.stop"}, + {string(KindVMStorageStart), "vm.storage.start"}, + {string(KindVMStorageStop), "vm.storage.stop"}, + {string(KindSnapStorageStart), "snap.storage.start"}, + {string(KindSnapStorageStop), "snap.storage.stop"}, + } + for _, c := range cases { + if c.got != c.want { + t.Errorf("got %q, want %q", c.got, c.want) + } + } +} + +func TestNopRecorder(t *testing.T) { + var r NopRecorder + r.Emit(t.Context(), Entry{Kind: KindVMComputeStart, VMID: "x"}) + // no panic, no state — only assertion is "does not crash" +} From 334b083e5658a7e51c24aef19bb928155defbdab Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 02:30:45 +0800 Subject: [PATCH 02/13] feat(metering): wire 6 lifecycle hook points into hypervisor + snapshot backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend / LocalFile gain a Metering field; nil-safe via metering.OrNop so tests that build raw structs still work. cmd/core lazy-inits one process-wide FileRecorder at $RootDir/metering/ledger.jsonl and injects it into every backend ctor. Hook points emitting metering.Entry: - vm.compute.start hypervisor/state.go BatchMarkStarted (reason: boot/restart by FirstBooted) - vm.compute.stop hypervisor/state.go UpdateStates (only Running→Stopped, reason: stop-user) - vm.storage.start hypervisor/create.go FinalizeCreate (reason: boot) - vm.storage.stop hypervisor/stop.go DeleteAll (reason: vm-rm) - snap.storage.start snapshot/localfile/localfile.go Create - snap.storage.stop snapshot/localfile/localfile.go Delete (reason: snap-rm) DB.Update closures only collect entries; emit runs outside the lock so the ledger write never extends DB lock duration. Tests: hypervisor/state_test.go covers all 3 state-machine emit cases + reason flip + nil-recorder safety; localfile_test.go adds Create/Delete emit round-trip. make lint / vet / test (race+cover) pass on linux+darwin. --- cmd/core/helpers.go | 28 +- cmd/core/metering.go | 37 ++ cmd/others/handler.go | 4 +- cmd/snapshot/handler.go | 12 +- cmd/vm/lifecycle.go | 6 +- cmd/vm/run.go | 20 +- cmd/vm/status.go | 4 +- hypervisor/backend.go | 40 +- hypervisor/clone.go | 41 +- hypervisor/cloudhypervisor/clone.go | 4 +- hypervisor/cloudhypervisor/cloudhypervisor.go | 14 +- hypervisor/cloudhypervisor/direct.go | 11 +- hypervisor/cloudhypervisor/restore.go | 11 +- hypervisor/cloudhypervisor/start.go | 12 +- hypervisor/create.go | 15 +- hypervisor/firecracker/clone.go | 4 +- hypervisor/firecracker/direct.go | 11 +- hypervisor/firecracker/firecracker.go | 14 +- hypervisor/firecracker/restore.go | 11 +- hypervisor/firecracker/start.go | 14 +- hypervisor/hypervisor.go | 4 +- hypervisor/metering.go | 20 + hypervisor/restore.go | 43 +- hypervisor/start.go | 29 +- hypervisor/state.go | 66 ++- hypervisor/state_test.go | 386 ++++++++++++++++++ hypervisor/stop.go | 45 +- metering/file.go | 13 +- metering/metering.go | 8 + snapshot/localfile/gc.go | 17 +- snapshot/localfile/gc_test.go | 101 ++++- snapshot/localfile/import.go | 8 + snapshot/localfile/localfile.go | 33 +- snapshot/localfile/localfile_test.go | 83 +++- 34 files changed, 1029 insertions(+), 140 deletions(-) create mode 100644 cmd/core/metering.go create mode 100644 hypervisor/metering.go create mode 100644 hypervisor/state_test.go diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index a8214804..a3baee15 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -33,14 +33,18 @@ import ( ) var hypervisorFactories = []hypervisorFactory{ - {config.HypervisorCH, func(c *config.Config) (hypervisor.Hypervisor, error) { return cloudhypervisor.New(c) }}, - {config.HypervisorFirecracker, func(c *config.Config) (hypervisor.Hypervisor, error) { return firecracker.New(c) }}, + {config.HypervisorCH, func(ctx context.Context, c *config.Config) (hypervisor.Hypervisor, error) { + return cloudhypervisor.New(c, MeteringRecorder(ctx, c)) + }}, + {config.HypervisorFirecracker, func(ctx context.Context, c *config.Config) (hypervisor.Hypervisor, error) { + return firecracker.New(c, MeteringRecorder(ctx, c)) + }}, } // hypervisorFactory keeps backend lookup and iteration order together. type hypervisorFactory struct { typ config.HypervisorType - ctor func(*config.Config) (hypervisor.Hypervisor, error) + ctor func(context.Context, *config.Config) (hypervisor.Hypervisor, error) } // BaseHandler provides shared config access for all command handlers. @@ -84,7 +88,7 @@ func InitBackends(ctx context.Context, conf *config.Config) ([]imagebackend.Imag if err != nil { return nil, nil, err } - hyper, err := InitHypervisor(conf) + hyper, err := InitHypervisor(ctx, conf) if err != nil { return nil, nil, err } @@ -111,22 +115,22 @@ func InitImageBackendsForPull(ctx context.Context, conf *config.Config) (*oci.OC return ociStore, cloudimgStore, nil } -func InitHypervisor(conf *config.Config) (hypervisor.Hypervisor, error) { +func InitHypervisor(ctx context.Context, conf *config.Config) (hypervisor.Hypervisor, error) { ctor := findHypervisorFactory(conf.Hypervisor()) if ctor == nil { return nil, fmt.Errorf("unknown hypervisor type: %s", conf.Hypervisor()) } - h, err := ctor(conf) + h, err := ctor(ctx, conf) if err != nil { return nil, fmt.Errorf("init hypervisor: %w", err) } return h, nil } -func InitAllHypervisors(conf *config.Config) ([]hypervisor.Hypervisor, error) { +func InitAllHypervisors(ctx context.Context, conf *config.Config) ([]hypervisor.Hypervisor, error) { result := make([]hypervisor.Hypervisor, 0, len(hypervisorFactories)) for _, f := range hypervisorFactories { - h, err := f.ctor(conf) + h, err := f.ctor(ctx, conf) if err != nil { return nil, fmt.Errorf("init %s for GC: %w", f.typ, err) } @@ -136,7 +140,7 @@ func InitAllHypervisors(conf *config.Config) ([]hypervisor.Hypervisor, error) { } func FindHypervisor(ctx context.Context, conf *config.Config, ref string) (hypervisor.Hypervisor, error) { - hypers, err := InitAllHypervisors(conf) + hypers, err := InitAllHypervisors(ctx, conf) if err != nil { return nil, err } @@ -185,8 +189,8 @@ func InitBridgeNetwork(conf *config.Config, bridgeDev string) (network.Network, return p, nil } -func InitSnapshot(conf *config.Config, opts ...localfile.Option) (snapshot.Snapshot, error) { - s, err := localfile.New(conf, opts...) +func InitSnapshot(ctx context.Context, conf *config.Config, opts ...localfile.Option) (snapshot.Snapshot, error) { + s, err := localfile.New(conf, MeteringRecorder(ctx, conf), opts...) if err != nil { return nil, fmt.Errorf("init snapshot backend: %w", err) } @@ -509,7 +513,7 @@ func digestPullRef(image, digest, imageType string) string { return ref.Context().String() + "@" + digest } -func findHypervisorFactory(typ config.HypervisorType) func(*config.Config) (hypervisor.Hypervisor, error) { +func findHypervisorFactory(typ config.HypervisorType) func(context.Context, *config.Config) (hypervisor.Hypervisor, error) { for _, f := range hypervisorFactories { if f.typ == typ { return f.ctor diff --git a/cmd/core/metering.go b/cmd/core/metering.go new file mode 100644 index 00000000..fbe108c6 --- /dev/null +++ b/cmd/core/metering.go @@ -0,0 +1,37 @@ +package core + +import ( + "context" + "os" + "path/filepath" + "sync" + + "github.com/projecteru2/core/log" + + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/metering" +) + +const ( + meteringSubdir = "metering" + meteringFile = "ledger.jsonl" +) + +var ( + meteringOnce sync.Once + meteringRec metering.Recorder +) + +// MeteringRecorder returns a process-wide lifecycle recorder; lazy-init shares one ledger fd across all backends, falls back to NopRecorder on fs error. +func MeteringRecorder(ctx context.Context, conf *config.Config) metering.Recorder { + meteringOnce.Do(func() { + dir := filepath.Join(conf.RootDir, meteringSubdir) + if err := os.MkdirAll(dir, 0o750); err != nil { + log.WithFunc("core.MeteringRecorder").Warnf(ctx, "mkdir %s: %v; metering disabled", dir, err) + meteringRec = metering.NopRecorder{} + return + } + meteringRec = metering.NewFileRecorder(ctx, filepath.Join(dir, meteringFile)) + }) + return meteringRec +} diff --git a/cmd/others/handler.go b/cmd/others/handler.go index 619945aa..8d17260c 100644 --- a/cmd/others/handler.go +++ b/cmd/others/handler.go @@ -35,7 +35,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { if err != nil { return err } - snapBackend, err := cmdcore.InitSnapshot(conf, localfile.WithGCPolicy(policy)) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf, localfile.WithGCPolicy(policy)) if err != nil { return err } @@ -45,7 +45,7 @@ func (h Handler) GC(cmd *cobra.Command, _ []string) error { b.RegisterGC(o) } // Register ALL hypervisor backends so GC protects blobs from both CH and FC VMs. - hypers, hyperErr := cmdcore.InitAllHypervisors(conf) + hypers, hyperErr := cmdcore.InitAllHypervisors(ctx, conf) if hyperErr != nil { return hyperErr } diff --git a/cmd/snapshot/handler.go b/cmd/snapshot/handler.go index 3711ea6e..56537513 100644 --- a/cmd/snapshot/handler.go +++ b/cmd/snapshot/handler.go @@ -35,7 +35,7 @@ func (h Handler) Save(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("find VM %s: %w", vmRef, err) } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -85,7 +85,7 @@ func (h Handler) List(cmd *cobra.Command, _ []string) error { if err != nil { return err } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -148,7 +148,7 @@ func (h Handler) Inspect(cmd *cobra.Command, args []string) error { if err != nil { return err } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -166,7 +166,7 @@ func (h Handler) Export(cmd *cobra.Command, args []string) (err error) { return err } logger := log.WithFunc("cmd.snapshot.export") - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -258,7 +258,7 @@ func (h Handler) Import(cmd *cobra.Command, args []string) error { return err } logger := log.WithFunc("cmd.snapshot.import") - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -295,7 +295,7 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return err } logger := log.WithFunc("cmd.snapshot.rm") - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 86715b79..2eddc7b2 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -52,7 +52,7 @@ func (h Handler) Start(cmd *cobra.Command, args []string) error { return err } - hypers, err := cmdcore.InitAllHypervisors(conf) + hypers, err := cmdcore.InitAllHypervisors(ctx, conf) if err != nil { return err } @@ -86,7 +86,7 @@ func (h Handler) Stop(cmd *cobra.Command, args []string) error { conf.StopTimeoutSeconds = timeout } - hypers, err := cmdcore.InitAllHypervisors(conf) + hypers, err := cmdcore.InitAllHypervisors(ctx, conf) if err != nil { return err } @@ -208,7 +208,7 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { force, _ := cmd.Flags().GetBool("force") - hypers, err := cmdcore.InitAllHypervisors(conf) + hypers, err := cmdcore.InitAllHypervisors(ctx, conf) if err != nil { return err } diff --git a/cmd/vm/run.go b/cmd/vm/run.go index 9ebf84c4..295956c9 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -84,7 +84,7 @@ func (h Handler) Clone(cmd *cobra.Command, args []string) error { return h.cloneFromDir(ctx, cmd, conf, fromDir, logger) } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -98,7 +98,7 @@ func (h Handler) Clone(cmd *cobra.Command, args []string) error { conf.UseFirecracker = snapInfo.Hypervisor == string(config.HypervisorFirecracker) } - hyper, err := cmdcore.InitHypervisor(conf) + hyper, err := cmdcore.InitHypervisor(ctx, conf) if err != nil { return err } @@ -161,7 +161,7 @@ func (h Handler) Restore(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("find VM %s: %w", vmRef, err) } - snapBackend, err := cmdcore.InitSnapshot(conf) + snapBackend, err := cmdcore.InitSnapshot(ctx, conf) if err != nil { return err } @@ -201,7 +201,7 @@ func (h Handler) Restore(cmd *cobra.Command, args []string) error { logger.Infof(ctx, "restoring VM %s from snapshot %s ...", vmRef, snapRef) - result, err := hyper.Restore(ctx, vmRef, vmCfg, stream) + result, err := hyper.Restore(ctx, vmRef, vmCfg, stream, snapInfo.ID) if err != nil { return fmt.Errorf("restore: %w", err) } @@ -242,7 +242,7 @@ func (h Handler) restoreFromDir(ctx context.Context, cmd *cobra.Command, conf *c if err != nil { return err } - return h.runDirectRestore(ctx, cmd, dcr, vmRef, vmCfg, dir, + return h.runDirectRestore(ctx, cmd, dcr, vmRef, vmCfg, dir, cfg.ID, fmt.Sprintf("dir %s", dir), logger) } @@ -267,7 +267,7 @@ func (h Handler) cloneFromDir(ctx context.Context, cmd *cobra.Command, conf *con if cfg.Hypervisor != "" { localConf.UseFirecracker = cfg.Hypervisor == string(config.HypervisorFirecracker) } - hyper, err := cmdcore.InitHypervisor(&localConf) + hyper, err := cmdcore.InitHypervisor(ctx, &localConf) if err != nil { return err } @@ -366,21 +366,21 @@ func (h Handler) restoreDirect(ctx context.Context, cmd *cobra.Command, snapRef, if !ok { return false, nil } - dataDir, _, err := da.DataDir(ctx, snapRef) + dataDir, snapCfg, err := da.DataDir(ctx, snapRef) if err != nil { return true, fmt.Errorf("open snapshot: %w", err) } - return true, h.runDirectRestore(ctx, cmd, dcr, vmRef, vmCfg, dataDir, + return true, h.runDirectRestore(ctx, cmd, dcr, vmRef, vmCfg, dataDir, snapCfg.ID, fmt.Sprintf("snapshot %s", snapRef), logger) } // runDirectRestore is the shared tail for the snapshot-DB and --from-dir restore paths: log, DirectRestore, output. -func (h Handler) runDirectRestore(ctx context.Context, cmd *cobra.Command, dcr hypervisor.Direct, vmRef string, vmCfg *types.VMConfig, srcDir, sourceLabel string, logger *log.Fields) error { +func (h Handler) runDirectRestore(ctx context.Context, cmd *cobra.Command, dcr hypervisor.Direct, vmRef string, vmCfg *types.VMConfig, srcDir, sourceSnapshotID, sourceLabel string, logger *log.Fields) error { wantJSON := cmdcore.WantJSON(cmd) if !wantJSON { logger.Infof(ctx, "restoring VM %s from %s (direct) ...", vmRef, sourceLabel) } - result, err := dcr.DirectRestore(ctx, vmRef, vmCfg, srcDir) + result, err := dcr.DirectRestore(ctx, vmRef, vmCfg, srcDir, sourceSnapshotID) if err != nil { return fmt.Errorf("restore: %w", err) } diff --git a/cmd/vm/status.go b/cmd/vm/status.go index 074418ea..fba47329 100644 --- a/cmd/vm/status.go +++ b/cmd/vm/status.go @@ -47,7 +47,7 @@ func (h Handler) List(cmd *cobra.Command, _ []string) error { return err } - hypers, err := cmdcore.InitAllHypervisors(conf) + hypers, err := cmdcore.InitAllHypervisors(ctx, conf) if err != nil { return err } @@ -94,7 +94,7 @@ func (h Handler) Status(cmd *cobra.Command, args []string) error { } format, _ := cmd.Flags().GetString("format") - hypers, hyperErr := cmdcore.InitAllHypervisors(conf) + hypers, hyperErr := cmdcore.InitAllHypervisors(ctx, conf) if hyperErr != nil { return hyperErr } diff --git a/hypervisor/backend.go b/hypervisor/backend.go index 4bb46dd2..320ff989 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -8,6 +8,7 @@ import ( "time" "github.com/cocoonstack/cocoon/lock" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/storage" "github.com/cocoonstack/cocoon/types" ) @@ -57,10 +58,11 @@ type BackendConfig interface { // Backend provides shared store operations for hypervisor backends. type Backend struct { - Typ string - Conf BackendConfig - DB storage.Store[VMIndex] - Locker lock.Locker + Typ string + Conf BackendConfig + DB storage.Store[VMIndex] + Locker lock.Locker + Metering metering.Recorder } // LaunchSpec is the per-call input to Backend.LaunchVMProcess. Shared @@ -75,24 +77,26 @@ type LaunchSpec struct { // RestoreSpec carries the backend-specific hooks for Backend.RestoreSequence. type RestoreSpec struct { - VMCfg *types.VMConfig - Snapshot io.Reader - Preflight func(stagingDir string, rec *VMRecord) error - Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract - BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW - AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) + VMCfg *types.VMConfig + Snapshot io.Reader + SourceSnapshotID string // for metering lineage; emitted on the restore close+open events + Preflight func(stagingDir string, rec *VMRecord) error + Kill func(ctx context.Context, vmID string, rec *VMRecord) error + Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract + BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW + AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } // DirectRestoreSpec is RestoreSpec for a local srcDir rather than a tar; Populate replaces staging+merge. type DirectRestoreSpec struct { - VMCfg *types.VMConfig - SrcDir string - Preflight func(srcDir string, rec *VMRecord) error - Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error - Populate func(rec *VMRecord, srcDir string) error - AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) + VMCfg *types.VMConfig + SrcDir string + SourceSnapshotID string + Preflight func(srcDir string, rec *VMRecord) error + Kill func(ctx context.Context, vmID string, rec *VMRecord) error + Wrap func(rec *VMRecord, fn func() error) error + Populate func(rec *VMRecord, srcDir string) error + AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } // CreateSpec carries CreateSequence inputs; Prepare returns final storage configs (COW + data disks). diff --git a/hypervisor/clone.go b/hypervisor/clone.go index 354209f5..6d7b9640 100644 --- a/hypervisor/clone.go +++ b/hypervisor/clone.go @@ -6,6 +6,7 @@ import ( "io" "time" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) @@ -34,12 +35,15 @@ func (b *Backend) CloneSetup(ctx context.Context, vmID string, vmCfg *types.VMCo return runDir, logDir, now, cleanup, nil } +// AfterExtractFn finalizes a cloned VM after snapshot files are in place; sourceSnapshotID flows through to the metering Entry so downstream can trace lineage. +type AfterExtractFn func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) + // DirectCloneBase clones from a local snapshot directory. Used when the snapshot lives on the same host (no tar streaming needed). func (b *Backend) DirectCloneBase( ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, snapshotConfig *types.SnapshotConfig, srcDir string, cloneFiles func(dstDir, srcDir string) error, - afterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time) (*types.VM, error), + afterExtract AfterExtractFn, ) (_ *types.VM, err error) { runDir, logDir, now, cleanup, err := b.CloneSetup(ctx, vmID, vmCfg, snapshotConfig) if err != nil { @@ -55,14 +59,14 @@ func (b *Backend) DirectCloneBase( return nil, fmt.Errorf("clone snapshot files: %w", err) } - return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now) + return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now, snapshotConfig.ID) } // CloneFromStream clones from a tar stream into a fresh runDir. Used when the snapshot arrives over the network (cross-node clone). func (b *Backend) CloneFromStream( ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, snapshotConfig *types.SnapshotConfig, snapshot io.Reader, - afterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time) (*types.VM, error), + afterExtract AfterExtractFn, ) (_ *types.VM, err error) { runDir, logDir, now, cleanup, err := b.CloneSetup(ctx, vmID, vmCfg, snapshotConfig) if err != nil { @@ -78,12 +82,14 @@ func (b *Backend) CloneFromStream( return nil, fmt.Errorf("extract snapshot: %w", err) } - return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now) + return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now, snapshotConfig.ID) } // FinalizeClone updates the cloned VM's record in place after restore-and-resume. -func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}) error { - return b.DB.Update(ctx, func(idx *VMIndex) error { +// Emits metering vm.storage.start + vm.compute.start with reason clone so the +// new VM has an opening interval even though it skipped Create/BatchMarkStarted. +func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}, sourceSnapshotID string) error { + if err := b.DB.Update(ctx, func(idx *VMIndex) error { r, err := idx.GetRecord(vmID) if err != nil { return err @@ -95,5 +101,28 @@ func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM r.ImageBlobIDs = blobIDs } return nil + }); err != nil { + return err + } + now := time.Now() + shape := shapeFromConfig(info.Config) + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStart, + VMID: vmID, + SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonClone, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, + }) + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMComputeStart, + VMID: vmID, + SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonClone, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, }) + return nil } diff --git a/hypervisor/cloudhypervisor/clone.go b/hypervisor/cloudhypervisor/clone.go index 4e6c8e4a..1f8a544f 100644 --- a/hypervisor/cloudhypervisor/clone.go +++ b/hypervisor/cloudhypervisor/clone.go @@ -30,7 +30,7 @@ func (ch *CloudHypervisor) Clone(ctx context.Context, vmID string, vmCfg *types. return ch.CloneFromStream(ctx, vmID, vmCfg, net, snapshotConfig, snapshot, ch.cloneAfterExtract) } -func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time) (*types.VM, error) { +func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) { networkConfigs := net.NetworkConfigs logger := log.WithFunc("cloudhypervisor.Clone") @@ -132,7 +132,7 @@ func (ch *CloudHypervisor) cloneAfterExtract(ctx context.Context, vmID string, v NetSetup: net, CreatedAt: now, UpdatedAt: now, StartedAt: &now, } - if err := ch.FinalizeClone(ctx, vmID, info, bootCfg, nil); err != nil { + if err := ch.FinalizeClone(ctx, vmID, info, bootCfg, nil, sourceSnapshotID); err != nil { ch.AbortLaunch(ctx, pid, sockPath, runDir, runtimeFiles) return nil, fmt.Errorf("finalize VM record: %w", err) } diff --git a/hypervisor/cloudhypervisor/cloudhypervisor.go b/hypervisor/cloudhypervisor/cloudhypervisor.go index 99fb442a..04943582 100644 --- a/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -7,6 +7,7 @@ import ( "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/hypervisor" "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/metering" storejson "github.com/cocoonstack/cocoon/storage/json" ) @@ -25,8 +26,8 @@ type CloudHypervisor struct { conf *Config } -// New creates a CloudHypervisor backend. -func New(conf *config.Config) (*CloudHypervisor, error) { +// New creates a CloudHypervisor backend. rec may be nil; the backend falls back to NopRecorder for emit calls. +func New(conf *config.Config, rec metering.Recorder) (*CloudHypervisor, error) { if conf == nil { return nil, fmt.Errorf("config is nil") } @@ -38,10 +39,11 @@ func New(conf *config.Config) (*CloudHypervisor, error) { store := storejson.New[hypervisor.VMIndex](cfg.IndexFile(), locker) return &CloudHypervisor{ Backend: &hypervisor.Backend{ - Typ: typ, - Conf: cfg, - DB: store, - Locker: locker, + Typ: typ, + Conf: cfg, + DB: store, + Locker: locker, + Metering: rec, }, conf: cfg, }, nil diff --git a/hypervisor/cloudhypervisor/direct.go b/hypervisor/cloudhypervisor/direct.go index 0b8b9934..7312feca 100644 --- a/hypervisor/cloudhypervisor/direct.go +++ b/hypervisor/cloudhypervisor/direct.go @@ -15,12 +15,13 @@ func (ch *CloudHypervisor) DirectClone(ctx context.Context, vmID string, vmCfg * return ch.DirectCloneBase(ctx, vmID, vmCfg, net, snapshotConfig, srcDir, cloneSnapshotFiles, ch.cloneAfterExtract) } -func (ch *CloudHypervisor) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir string) (*types.VM, error) { +func (ch *CloudHypervisor) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir, sourceSnapshotID string) (*types.VM, error) { return ch.DirectRestoreSequence(ctx, vmRef, hypervisor.DirectRestoreSpec{ - VMCfg: vmCfg, - SrcDir: srcDir, - Preflight: ch.preflightRestore, - Kill: ch.killForRestore, + VMCfg: vmCfg, + SrcDir: srcDir, + SourceSnapshotID: sourceSnapshotID, + Preflight: ch.preflightRestore, + Kill: ch.killForRestore, Populate: func(rec *hypervisor.VMRecord, srcDir string) error { return hypervisor.PopulateFromSrc(rec.RunDir, srcDir, cleanSnapshotFiles, cloneSnapshotFiles) }, diff --git a/hypervisor/cloudhypervisor/restore.go b/hypervisor/cloudhypervisor/restore.go index 24631b7c..79aa5413 100644 --- a/hypervisor/cloudhypervisor/restore.go +++ b/hypervisor/cloudhypervisor/restore.go @@ -13,12 +13,13 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -func (ch *CloudHypervisor) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader) (*types.VM, error) { +func (ch *CloudHypervisor) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader, sourceSnapshotID string) (*types.VM, error) { return ch.RestoreSequence(ctx, vmRef, hypervisor.RestoreSpec{ - VMCfg: vmCfg, - Snapshot: snapshot, - Preflight: ch.preflightRestore, - Kill: ch.killForRestore, + VMCfg: vmCfg, + Snapshot: snapshot, + SourceSnapshotID: sourceSnapshotID, + Preflight: ch.preflightRestore, + Kill: ch.killForRestore, AfterExtract: func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *hypervisor.VMRecord) (*types.VM, error) { directBoot := isDirectBoot(rec.BootConfig) return ch.restoreAfterExtract(ctx, vmID, vmCfg, rec, directBoot) diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index b9d025b8..a951a879 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -17,17 +17,17 @@ func (ch *CloudHypervisor) Start(ctx context.Context, refs []string) ([]string, return ch.StartAll(ctx, refs, ch.startOne) } -func (ch *CloudHypervisor) startOne(ctx context.Context, id string) error { +func (ch *CloudHypervisor) startOne(ctx context.Context, id string) (bool, error) { rec, err := ch.PrepareStart(ctx, id, runtimeFiles) if err != nil { - return err + return false, err } if rec == nil { - return nil + return false, nil // already running — no-op } if vErr := types.ValidateStorageConfigs(rec.StorageConfigs); vErr != nil { ch.MarkError(ctx, id) - return fmt.Errorf("storage invariants violated: %w", vErr) + return false, fmt.Errorf("storage invariants violated: %w", vErr) } sockPath := hypervisor.SocketPath(rec.RunDir) @@ -39,9 +39,9 @@ func (ch *CloudHypervisor) startOne(ctx context.Context, id string) error { if _, err = ch.launchProcess(ctx, rec, sockPath, args, rec.ResolvedNetnsPath()); err != nil { ch.MarkError(ctx, id) - return fmt.Errorf("launch VM: %w", err) + return false, fmt.Errorf("launch VM: %w", err) } - return nil + return true, nil } func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VMRecord, socketPath string, args []string, netnsPath string) (int, error) { diff --git a/hypervisor/create.go b/hypervisor/create.go index 01595d1e..b5c16e08 100644 --- a/hypervisor/create.go +++ b/hypervisor/create.go @@ -7,6 +7,7 @@ import ( "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) @@ -49,8 +50,9 @@ func (b *Backend) RollbackCreate(ctx context.Context, id, name string) { } // FinalizeCreate writes a populated VM record to DB, replacing the placeholder. +// Emits metering vm.storage.start once the record is persisted. func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}) error { - return b.DB.Update(ctx, func(idx *VMIndex) error { + if err := b.DB.Update(ctx, func(idx *VMIndex) error { existing, err := idx.GetRecord(id) if err != nil { return err @@ -63,7 +65,18 @@ func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, LogDir: existing.LogDir, } return nil + }); err != nil { + return err + } + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStart, + VMID: id, + Reason: metering.ReasonBoot, + Hypervisor: b.Typ, + Shape: shapeFromConfig(info.Config), + EmittedAt: time.Now(), }) + return nil } // CreateSequence is the shared placeholder→finalize create skeleton; a mid-flight crash rolls back DB + rundir so GC has nothing to reconcile. diff --git a/hypervisor/firecracker/clone.go b/hypervisor/firecracker/clone.go index 9203ad21..9ceb4aa9 100644 --- a/hypervisor/firecracker/clone.go +++ b/hypervisor/firecracker/clone.go @@ -27,7 +27,7 @@ func (fc *Firecracker) Clone(ctx context.Context, vmID string, vmCfg *types.VMCo return fc.CloneFromStream(ctx, vmID, vmCfg, net, snapshotConfig, snapshot, fc.cloneAfterExtract) } -func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time) (*types.VM, error) { +func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) { networkConfigs := net.NetworkConfigs logger := log.WithFunc("firecracker.Clone") @@ -102,7 +102,7 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg NetSetup: net, CreatedAt: now, UpdatedAt: now, StartedAt: &now, } - if err := fc.FinalizeClone(ctx, vmID, info, bootCfg, blobIDs); err != nil { + if err := fc.FinalizeClone(ctx, vmID, info, bootCfg, blobIDs, sourceSnapshotID); err != nil { fc.AbortLaunch(ctx, pid, sockPath, runDir, runtimeFiles) return nil, fmt.Errorf("finalize VM record: %w", err) } diff --git a/hypervisor/firecracker/direct.go b/hypervisor/firecracker/direct.go index 439afdf8..0300b0f6 100644 --- a/hypervisor/firecracker/direct.go +++ b/hypervisor/firecracker/direct.go @@ -13,12 +13,13 @@ func (fc *Firecracker) DirectClone(ctx context.Context, vmID string, vmCfg *type return fc.DirectCloneBase(ctx, vmID, vmCfg, net, snapshotConfig, srcDir, cloneSnapshotFiles, fc.cloneAfterExtract) } -func (fc *Firecracker) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir string) (*types.VM, error) { +func (fc *Firecracker) DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir, sourceSnapshotID string) (*types.VM, error) { return fc.DirectRestoreSequence(ctx, vmRef, hypervisor.DirectRestoreSpec{ - VMCfg: vmCfg, - SrcDir: srcDir, - Preflight: fc.preflightRestore, - Kill: fc.killForRestore, + VMCfg: vmCfg, + SrcDir: srcDir, + SourceSnapshotID: sourceSnapshotID, + Preflight: fc.preflightRestore, + Kill: fc.killForRestore, // Lock writable disks so recoverStaleBackup heals stale data-*.raw.cocoon-clone-backup // before restore overwrites them; otherwise a future clone renames backup over restored data. Wrap: func(rec *hypervisor.VMRecord, inner func() error) error { diff --git a/hypervisor/firecracker/firecracker.go b/hypervisor/firecracker/firecracker.go index 685d9f37..67449d03 100644 --- a/hypervisor/firecracker/firecracker.go +++ b/hypervisor/firecracker/firecracker.go @@ -7,6 +7,7 @@ import ( "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/hypervisor" "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/metering" storejson "github.com/cocoonstack/cocoon/storage/json" ) @@ -26,8 +27,8 @@ type Firecracker struct { conf *Config } -// New creates a Firecracker backend. -func New(conf *config.Config) (*Firecracker, error) { +// New creates a Firecracker backend. rec may be nil; the backend falls back to NopRecorder for emit calls. +func New(conf *config.Config, rec metering.Recorder) (*Firecracker, error) { if conf == nil { return nil, fmt.Errorf("config is nil") } @@ -39,10 +40,11 @@ func New(conf *config.Config) (*Firecracker, error) { store := storejson.New[hypervisor.VMIndex](cfg.IndexFile(), locker) return &Firecracker{ Backend: &hypervisor.Backend{ - Typ: typ, - Conf: cfg, - DB: store, - Locker: locker, + Typ: typ, + Conf: cfg, + DB: store, + Locker: locker, + Metering: rec, }, conf: cfg, }, nil diff --git a/hypervisor/firecracker/restore.go b/hypervisor/firecracker/restore.go index ac774283..fc7e0a95 100644 --- a/hypervisor/firecracker/restore.go +++ b/hypervisor/firecracker/restore.go @@ -14,12 +14,13 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -func (fc *Firecracker) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader) (*types.VM, error) { +func (fc *Firecracker) Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader, sourceSnapshotID string) (*types.VM, error) { return fc.RestoreSequence(ctx, vmRef, hypervisor.RestoreSpec{ - VMCfg: vmCfg, - Snapshot: snapshot, - Preflight: fc.preflightRestore, - Kill: fc.killForRestore, + VMCfg: vmCfg, + Snapshot: snapshot, + SourceSnapshotID: sourceSnapshotID, + Preflight: fc.preflightRestore, + Kill: fc.killForRestore, // Lock writable disks so recoverStaleBackup heals stale data-*.raw.cocoon-clone-backup // before restore overwrites them; otherwise a future clone renames backup over restored data. Wrap: func(rec *hypervisor.VMRecord, inner func() error) error { diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 9c40eb6f..426f76e9 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -23,17 +23,17 @@ func (fc *Firecracker) Start(ctx context.Context, refs []string) ([]string, erro return fc.StartAll(ctx, refs, fc.startOne) } -func (fc *Firecracker) startOne(ctx context.Context, id string) error { +func (fc *Firecracker) startOne(ctx context.Context, id string) (bool, error) { rec, err := fc.PrepareStart(ctx, id, runtimeFiles) if err != nil { - return err + return false, err } if rec == nil { - return nil + return false, nil // already running — no-op } if vErr := types.ValidateStorageConfigs(rec.StorageConfigs); vErr != nil { fc.MarkError(ctx, id) - return fmt.Errorf("storage invariants violated: %w", vErr) + return false, fmt.Errorf("storage invariants violated: %w", vErr) } sockPath := hypervisor.SocketPath(rec.RunDir) @@ -41,15 +41,15 @@ func (fc *Firecracker) startOne(ctx context.Context, id string) error { pid, err := fc.launchProcess(ctx, rec, sockPath, rec.ResolvedNetnsPath()) if err != nil { fc.MarkError(ctx, id) - return fmt.Errorf("launch VM: %w", err) + return false, fmt.Errorf("launch VM: %w", err) } if err := fc.configureVM(ctx, utils.NewSocketHTTPClient(sockPath), rec); err != nil { fc.AbortLaunch(ctx, pid, sockPath, rec.RunDir, runtimeFiles) fc.MarkError(ctx, id) - return fmt.Errorf("configure VM: %w", err) + return false, fmt.Errorf("configure VM: %w", err) } - return nil + return true, nil } // configureVM sends pre-boot config via REST then InstanceStart. diff --git a/hypervisor/hypervisor.go b/hypervisor/hypervisor.go index 96bd149e..6445e82c 100644 --- a/hypervisor/hypervisor.go +++ b/hypervisor/hypervisor.go @@ -29,7 +29,7 @@ type Hypervisor interface { LogPath(ctx context.Context, ref string) (string, error) Snapshot(ctx context.Context, ref string) (*types.SnapshotConfig, io.ReadCloser, error) Clone(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, snapshotConfig *types.SnapshotConfig, snapshot io.Reader) (*types.VM, error) - Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader) (*types.VM, error) + Restore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, snapshot io.Reader, sourceSnapshotID string) (*types.VM, error) RegisterGC(*gc.Orchestrator) } @@ -42,5 +42,5 @@ type Watchable interface { // Direct is an optional interface for hypervisors that support clone/restore from a local snapshot directory. type Direct interface { DirectClone(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, snapshotConfig *types.SnapshotConfig, srcDir string) (*types.VM, error) - DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir string) (*types.VM, error) + DirectRestore(ctx context.Context, vmRef string, vmCfg *types.VMConfig, srcDir, sourceSnapshotID string) (*types.VM, error) } diff --git a/hypervisor/metering.go b/hypervisor/metering.go new file mode 100644 index 00000000..0361cf00 --- /dev/null +++ b/hypervisor/metering.go @@ -0,0 +1,20 @@ +package hypervisor + +import ( + "github.com/cocoonstack/cocoon/metering" + "github.com/cocoonstack/cocoon/types" +) + +// meter returns Backend.Metering or NopRecorder so emit sites can call .Emit without nil-checking. +func (b *Backend) meter() metering.Recorder { + return metering.OrNop(b.Metering) +} + +// shapeFromConfig builds a metering.Shape from a VMConfig's billable fields. +func shapeFromConfig(c types.VMConfig) metering.Shape { + return metering.Shape{ + CPU: c.CPU, + MemBytes: c.Memory, + StorageBytes: c.Storage, + } +} diff --git a/hypervisor/restore.go b/hypervisor/restore.go index a2269c1e..5a163a16 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -8,6 +8,7 @@ import ( "os" "time" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) @@ -39,7 +40,9 @@ func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, return vmID, &rec, nil } -// FinalizeRestore updates DB and assembles the returned VM after restore. +// FinalizeRestore updates DB and assembles the returned VM after restore. Metering +// emit lives in RestoreSequence/DirectRestoreSequence so the prior compute interval +// is closed at the kill boundary, not only on full-restore success. func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord, pid int) (*types.VM, error) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { @@ -66,6 +69,34 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types return &info, nil } +// emitRestoreComputeStop closes only the prior compute interval at the kill boundary. +// Storage is NOT closed here because the on-disk files survive: a downstream restore +// failure leaves the VM in Error state with its old storage intact, and vm rm will +// later close the storage interval with reason vm-rm. +func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldShape metering.Shape, sourceSnapshotID string) { + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: time.Now(), + }) +} + +// emitRestoreSuccess emits the storage shape transition and reopens the compute +// interval; called only after the restore sequence has fully succeeded. +func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { + now := time.Now() + newShape := shapeFromConfig(vm.Config) + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, + }) + for _, kind := range []metering.Kind{metering.KindVMStorageStart, metering.KindVMComputeStart} { + b.meter().Emit(ctx, metering.Entry{ + Kind: kind, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: newShape, EmittedAt: now, + }) + } +} + // RestoreSequence is the shared restore skeleton. Staging happens before the kill so a preflight failure leaves the original VM running. func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec RestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { @@ -85,9 +116,15 @@ func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec Restor if preflightErr := spec.Preflight(stagingDir, rec); preflightErr != nil { return nil, fmt.Errorf("snapshot preflight: %w", preflightErr) } + oldShape := shapeFromConfig(rec.Config) if killErr := spec.Kill(ctx, vmID, rec); killErr != nil { return nil, killErr } + // kill succeeded → the prior compute interval is over regardless of restore + // outcome; emit compute.stop immediately. Storage stays open: if restore + // fails the on-disk files are still the old shape and vm rm later closes + // the storage interval with reason vm-rm. + b.emitRestoreComputeStop(ctx, vmID, oldShape, spec.SourceSnapshotID) var result *types.VM inner := func() error { @@ -111,6 +148,7 @@ func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec Restor } else if err := inner(); err != nil { return nil, err } + b.emitRestoreSuccess(ctx, result, oldShape, spec.SourceSnapshotID) return result, nil } @@ -127,9 +165,11 @@ func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec if preflightErr := spec.Preflight(spec.SrcDir, rec); preflightErr != nil { return nil, fmt.Errorf("snapshot preflight: %w", preflightErr) } + oldShape := shapeFromConfig(rec.Config) if killErr := spec.Kill(ctx, vmID, rec); killErr != nil { return nil, killErr } + b.emitRestoreComputeStop(ctx, vmID, oldShape, spec.SourceSnapshotID) var result *types.VM inner := func() error { @@ -148,6 +188,7 @@ func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec } else if innerErr := inner(); innerErr != nil { return nil, innerErr } + b.emitRestoreSuccess(ctx, result, oldShape, spec.SourceSnapshotID) return result, nil } diff --git a/hypervisor/start.go b/hypervisor/start.go index 51fd8f9d..ced97b48 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -5,20 +5,41 @@ import ( "errors" "fmt" "os" + "sync" "github.com/projecteru2/core/log" "github.com/cocoonstack/cocoon/utils" ) -// StartAll runs startOne for each ref and batch-flips the succeeded set to Running so a partial batch doesn't leave half-Running state. -func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) error) ([]string, error) { +// StartAll runs startOne for each ref and batch-flips the actually-launched set to Running. +// startOne returns (launched, err): launched=true iff a fresh process was started; +// false means PrepareStart was a no-op (truly already running). Only launched ids +// reach BatchMarkStarted, so an already-running VM doesn't open a duplicate interval +// while a stale-running record that was actually relaunched still does. +func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) (bool, error)) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { return nil, err } - succeeded, forEachErr := b.ForEachVM(ctx, ids, "Start", startOne) - if batchErr := b.BatchMarkStarted(ctx, succeeded); batchErr != nil { + var ( + mu sync.Mutex + launched []string + ) + wrapped := func(ctx context.Context, id string) error { + wasLaunched, sErr := startOne(ctx, id) + if sErr != nil { + return sErr + } + if wasLaunched { + mu.Lock() + launched = append(launched, id) + mu.Unlock() + } + return nil + } + succeeded, forEachErr := b.ForEachVM(ctx, ids, "Start", wrapped) + if batchErr := b.BatchMarkStarted(ctx, launched); batchErr != nil { log.WithFunc(b.Typ+".Start").Warnf(ctx, "batch state update: %v", batchErr) } return succeeded, forEachErr diff --git a/hypervisor/state.go b/hypervisor/state.go index d3ef4de0..28899cb7 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -11,6 +11,7 @@ import ( "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) @@ -88,17 +89,20 @@ func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume } // UpdateStates batch-updates the State field for ids; sets StartedAt/StoppedAt as appropriate. +// Emits metering vm.compute.stop on Running→Stopped transitions; other state transitions do not emit. func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VMState) error { if len(ids) == 0 { return nil } now := time.Now() - return b.DB.Update(ctx, func(idx *VMIndex) error { + var stopped []metering.Entry + if err := b.DB.Update(ctx, func(idx *VMIndex) error { for _, id := range ids { r := idx.VMs[id] if r == nil { continue } + oldState := r.State r.State = state r.UpdatedAt = now switch state { @@ -106,10 +110,28 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM r.StartedAt = &now case types.VMStateStopped: r.StoppedAt = &now + // Only emit when we actually closed a Running interval; idempotent + // stops (already-Stopped, Error→Stopped) would create phantom intervals. + if oldState == types.VMStateRunning { + stopped = append(stopped, metering.Entry{ + Kind: metering.KindVMComputeStop, + VMID: id, + Reason: metering.ReasonStopUser, + Hypervisor: b.Typ, + Shape: shapeFromConfig(r.Config), + EmittedAt: now, + }) + } } } return nil - }) + }); err != nil { + return err + } + for _, e := range stopped { + b.meter().Emit(ctx, e) + } + return nil } // MarkError flips a single VM's state to VMStateError, logging on persist failure. @@ -120,24 +142,60 @@ func (b *Backend) MarkError(ctx context.Context, id string) { } // BatchMarkStarted flips ids to VMStateRunning and stamps FirstBooted=true in one DB write. +// Caller MUST pass only ids that were actually launched in this call (no-op +// already-Running VMs must be filtered out by the caller). If the DB still +// shows Running when an id arrives here, it's a stale-running record whose +// process had crashed; close that orphan interval with reason stop-crash +// before opening the fresh interval (no precise crash time available — use +// the relaunch boundary). func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { if len(ids) == 0 { return nil } now := time.Now() - return b.DB.Update(ctx, func(idx *VMIndex) error { + var emits []metering.Entry + if err := b.DB.Update(ctx, func(idx *VMIndex) error { for _, id := range ids { r := idx.VMs[id] if r == nil { continue } + shape := shapeFromConfig(r.Config) + if r.State == types.VMStateRunning { + emits = append(emits, metering.Entry{ + Kind: metering.KindVMComputeStop, + VMID: id, + Reason: metering.ReasonStopCrash, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, + }) + } + reason := metering.ReasonBoot + if r.FirstBooted { + reason = metering.ReasonRestart + } + emits = append(emits, metering.Entry{ + Kind: metering.KindVMComputeStart, + VMID: id, + Reason: reason, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, + }) r.State = types.VMStateRunning r.StartedAt = &now r.UpdatedAt = now r.FirstBooted = true } return nil - }) + }); err != nil { + return err + } + for _, e := range emits { + b.meter().Emit(ctx, e) + } + return nil } // CleanStalePlaceholders removes "creating" records past GC grace period. diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go new file mode 100644 index 00000000..f481af22 --- /dev/null +++ b/hypervisor/state_test.go @@ -0,0 +1,386 @@ +package hypervisor + +import ( + "context" + "fmt" + "path/filepath" + "testing" + "time" + + "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/metering" + storejson "github.com/cocoonstack/cocoon/storage/json" + "github.com/cocoonstack/cocoon/types" +) + +// stubBackendConfig satisfies BackendConfig for tests that only exercise the +// metering wiring; unused methods panic so accidental dependence shows up loud. +type stubBackendConfig struct{} + +func (stubBackendConfig) BinaryName() string { panic("BinaryName: not implemented in stub") } +func (stubBackendConfig) PIDFileName() string { panic("PIDFileName: not implemented in stub") } +func (stubBackendConfig) TerminateGracePeriod() time.Duration { + panic("TerminateGracePeriod: not implemented in stub") +} + +func (stubBackendConfig) SocketWaitTimeout() time.Duration { + panic("SocketWaitTimeout: not implemented in stub") +} +func (stubBackendConfig) EffectivePoolSize() int { return 1 } +func (stubBackendConfig) IndexFile() string { panic("IndexFile: not implemented in stub") } +func (stubBackendConfig) RunDir() string { panic("RunDir: not implemented in stub") } +func (stubBackendConfig) LogDir() string { panic("LogDir: not implemented in stub") } +func (stubBackendConfig) VMRunDir(string) string { panic("VMRunDir: not implemented in stub") } +func (stubBackendConfig) VMLogDir(string) string { panic("VMLogDir: not implemented in stub") } + +func newMeteringTestBackend(t *testing.T) (*Backend, *metering.CaptureRecorder) { + t.Helper() + dir := t.TempDir() + locker := flock.New(filepath.Join(dir, "index.lock")) + store := storejson.New[VMIndex](filepath.Join(dir, "index.json"), locker) + cap := &metering.CaptureRecorder{} + return &Backend{ + Typ: "test-hv", + Conf: stubBackendConfig{}, + DB: store, + Locker: locker, + Metering: cap, + }, cap +} + +func seedVMRecord(t *testing.T, b *Backend, id string, cpu int, mem, storage int64, firstBooted bool) { + t.Helper() + if err := b.DB.Update(t.Context(), func(idx *VMIndex) error { + idx.VMs[id] = &VMRecord{ + VM: types.VM{ + ID: id, + Hypervisor: b.Typ, + Config: types.VMConfig{Config: types.Config{CPU: cpu, Memory: mem, Storage: storage}}, + FirstBooted: firstBooted, + }, + } + return nil + }); err != nil { + t.Fatalf("seed: %v", err) + } +} + +func TestBatchMarkStartedEmitsComputeStart(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 2, 4<<30, 10<<30, false) + + if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { + t.Fatalf("BatchMarkStarted: %v", err) + } + entries := cap.Entries() + if len(entries) != 1 { + t.Fatalf("got %d entries, want 1", len(entries)) + } + e := entries[0] + if e.Kind != metering.KindVMComputeStart { + t.Errorf("got kind %q, want %q", e.Kind, metering.KindVMComputeStart) + } + if e.Reason != metering.ReasonBoot { + t.Errorf("got reason %q, want boot for first-time start", e.Reason) + } + if e.VMID != "vm1" || e.Hypervisor != "test-hv" { + t.Errorf("identity wrong: %+v", e) + } + if e.Shape.CPU != 2 || e.Shape.MemBytes != 4<<30 || e.Shape.StorageBytes != 10<<30 { + t.Errorf("shape wrong: %+v", e.Shape) + } +} + +func TestBatchMarkStartedReasonRestartWhenAlreadyBooted(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) + + if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { + t.Fatalf("BatchMarkStarted: %v", err) + } + entries := cap.Entries() + if len(entries) != 1 || entries[0].Reason != metering.ReasonRestart { + t.Errorf("got %+v, want one entry with reason restart", entries) + } +} + +func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) + + // Created→Stopped: no Running interval to close, must not emit. + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { + t.Fatalf("UpdateStates(stopped from created): %v", err) + } + if got := cap.Entries(); len(got) != 0 { + t.Errorf("Created→Stopped emitted %d; want 0 (no Running interval to close)", len(got)) + } + + // Stopped→Running: not a stop, must not emit. + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { + t.Fatalf("UpdateStates(running): %v", err) + } + if got := cap.Entries(); len(got) != 0 { + t.Errorf("Stopped→Running emitted %d; want 0", len(got)) + } + + // Running→Stopped: this is the only path that closes a Running interval. + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { + t.Fatalf("UpdateStates(stopped): %v", err) + } + entries := cap.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop { + t.Fatalf("got %+v, want one compute.stop", entries) + } + + // Stopped→Stopped: idempotent, must not duplicate the event. + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { + t.Fatalf("UpdateStates(stopped idempotent): %v", err) + } + if got := cap.Entries(); len(got) != 1 { + t.Errorf("Stopped→Stopped should not re-emit; got %d entries total", len(got)) + } + + // Set Running again, then go through Error (not Stopped). Error must not emit. + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { + t.Fatalf("UpdateStates(running again): %v", err) + } + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { + t.Fatalf("UpdateStates(error): %v", err) + } + if got := cap.Entries(); len(got) != 1 { + t.Errorf("Error state must not emit; got %d entries total", len(got)) + } +} + +func TestFinalizeCloneEmitsCloneEntries(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) + + info := &types.VM{ + ID: "vm1", + Hypervisor: b.Typ, + State: types.VMStateRunning, + Config: types.VMConfig{Config: types.Config{CPU: 2, Memory: 2 << 30, Storage: 20 << 30}}, + } + if err := b.FinalizeClone(ctx, "vm1", info, nil, nil, "snap-source"); err != nil { + t.Fatalf("FinalizeClone: %v", err) + } + entries := cap.Entries() + if len(entries) != 2 { + t.Fatalf("got %d entries, want 2 (storage.start + compute.start)", len(entries)) + } + for _, e := range entries { + if e.Reason != metering.ReasonClone { + t.Errorf("kind %s reason %q, want clone", e.Kind, e.Reason) + } + if e.SourceSnapshotID != "snap-source" { + t.Errorf("kind %s source_snapshot_id %q, want snap-source", e.Kind, e.SourceSnapshotID) + } + } + if entries[0].Kind != metering.KindVMStorageStart || entries[1].Kind != metering.KindVMComputeStart { + t.Errorf("ordering wrong: %s then %s", entries[0].Kind, entries[1].Kind) + } +} + +func seedRunningVM(t *testing.T, b *Backend, id string, cpu int, mem, storage int64) { + t.Helper() + seedVMRecord(t, b, id, cpu, mem, storage, true) + if err := b.DB.Update(t.Context(), func(idx *VMIndex) error { + idx.VMs[id].State = types.VMStateRunning + return nil + }); err != nil { + t.Fatalf("set running: %v", err) + } +} + +func TestDirectRestoreSequenceEmitsComputeStopThenTransition(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + + newCfg := &types.VMConfig{Config: types.Config{CPU: 4, Memory: 4 << 30, Storage: 30 << 30}} + spec := DirectRestoreSpec{ + VMCfg: newCfg, + SrcDir: t.TempDir(), + SourceSnapshotID: "snap-src", + Preflight: func(string, *VMRecord) error { return nil }, + Kill: func(context.Context, string, *VMRecord) error { return nil }, + Populate: func(*VMRecord, string) error { return nil }, + AfterExtract: func(_ context.Context, vmID string, vmCfg *types.VMConfig, _ *VMRecord) (*types.VM, error) { + return &types.VM{ID: vmID, Hypervisor: b.Typ, State: types.VMStateRunning, Config: *vmCfg}, nil + }, + } + if _, err := b.DirectRestoreSequence(ctx, "vm1", spec); err != nil { + t.Fatalf("DirectRestoreSequence: %v", err) + } + + entries := cap.Entries() + // compute.stop on kill; storage.stop + storage.start + compute.start on success. + if len(entries) != 4 { + t.Fatalf("got %d entries, want 4", len(entries)) + } + wantOrder := []metering.Kind{ + metering.KindVMComputeStop, + metering.KindVMStorageStop, metering.KindVMStorageStart, metering.KindVMComputeStart, + } + for i, want := range wantOrder { + if entries[i].Kind != want { + t.Errorf("entries[%d].Kind = %s, want %s", i, entries[i].Kind, want) + } + if entries[i].Reason != metering.ReasonRestore { + t.Errorf("entries[%d].Reason = %q, want restore", i, entries[i].Reason) + } + if entries[i].SourceSnapshotID != "snap-src" { + t.Errorf("entries[%d].SourceSnapshotID = %q, want snap-src", i, entries[i].SourceSnapshotID) + } + } + // compute.stop and storage.stop carry the old shape; the open pair carries the new shape. + for i := range 2 { + if entries[i].Shape.CPU != 2 { + t.Errorf("close entry %d cpu=%d, want 2 (old shape)", i, entries[i].Shape.CPU) + } + } + for i := 2; i < 4; i++ { + if entries[i].Shape.CPU != 4 { + t.Errorf("open entry %d cpu=%d, want 4 (new shape)", i, entries[i].Shape.CPU) + } + } +} + +func TestDirectRestoreSequenceEmitsOnlyComputeStopOnPopulateFailure(t *testing.T) { + // Storage must stay open when restore fails after kill — the on-disk files + // are still the old shape and vm rm will close it later with reason vm-rm. + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + + spec := DirectRestoreSpec{ + VMCfg: &types.VMConfig{Config: types.Config{CPU: 4, Memory: 4 << 30, Storage: 30 << 30}}, + SrcDir: t.TempDir(), + SourceSnapshotID: "snap-src", + Preflight: func(string, *VMRecord) error { return nil }, + Kill: func(context.Context, string, *VMRecord) error { return nil }, + Populate: func(*VMRecord, string) error { return fmt.Errorf("populate boom") }, + AfterExtract: func(_ context.Context, _ string, _ *types.VMConfig, _ *VMRecord) (*types.VM, error) { + t.Fatal("AfterExtract should not run when Populate fails") + return nil, nil + }, + } + if _, err := b.DirectRestoreSequence(ctx, "vm1", spec); err == nil { + t.Fatal("expected error from populate failure") + } + entries := cap.Entries() + if len(entries) != 1 { + t.Fatalf("got %d entries, want 1 (compute.stop only; storage stays open)", len(entries)) + } + if entries[0].Kind != metering.KindVMComputeStop { + t.Errorf("entries[0].Kind = %s, want compute.stop", entries[0].Kind) + } +} + +func TestStartAllOnlyEmitsForActuallyLaunched(t *testing.T) { + // Three records distinguish the three cases that must end up correctly in + // the ledger: + // - vm-stopped: DB Stopped, process dead → launched=true → emit + // - vm-running: DB Running, process alive → launched=false → no emit + // - vm-stale: DB Running, process dead, relaunched → launched=true → emit + // The bug being locked down: an earlier impl had BatchMarkStarted skip + // anything with r.State==Running, which silently dropped vm-stale. + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm-stopped", 1, 1<<30, 10<<30, false) + seedRunningVM(t, b, "vm-running", 1, 1<<30, 10<<30) + seedRunningVM(t, b, "vm-stale", 2, 2<<30, 20<<30) + + startOne := func(_ context.Context, id string) (bool, error) { + switch id { + case "vm-stopped", "vm-stale": + return true, nil + case "vm-running": + return false, nil + } + return false, fmt.Errorf("unexpected id: %s", id) + } + + succeeded, err := b.StartAll(ctx, []string{"vm-stopped", "vm-running", "vm-stale"}, startOne) + if err != nil { + t.Fatalf("StartAll: %v", err) + } + if len(succeeded) != 3 { + t.Errorf("succeeded %v, want 3", succeeded) + } + + entries := cap.Entries() + // vm-stopped → 1 entry (compute.start) + // vm-running → 0 entries (no-op) + // vm-stale → 2 entries (compute.stop reason=stop-crash + compute.start reason=restart) + if len(entries) != 3 { + t.Fatalf("got %d entries, want 3 (vm-stopped: start; vm-stale: stop-crash + start; vm-running: none)", len(entries)) + } + byVM := map[string][]metering.Entry{} + for _, e := range entries { + byVM[e.VMID] = append(byVM[e.VMID], e) + } + if got := byVM["vm-running"]; len(got) != 0 { + t.Errorf("vm-running emitted %d entries; want 0", len(got)) + } + if got := byVM["vm-stopped"]; len(got) != 1 || got[0].Kind != metering.KindVMComputeStart || got[0].Reason != metering.ReasonBoot { + t.Errorf("vm-stopped: got %+v, want 1× compute.start reason=boot", got) + } + stale := byVM["vm-stale"] + if len(stale) != 2 { + t.Fatalf("vm-stale: got %d entries, want 2 (stop-crash close + restart open)", len(stale)) + } + if stale[0].Kind != metering.KindVMComputeStop || stale[0].Reason != metering.ReasonStopCrash { + t.Errorf("vm-stale[0]: got kind=%s reason=%q, want compute.stop reason=stop-crash", stale[0].Kind, stale[0].Reason) + } + if stale[1].Kind != metering.KindVMComputeStart || stale[1].Reason != metering.ReasonRestart { + t.Errorf("vm-stale[1]: got kind=%s reason=%q, want compute.start reason=restart", stale[1].Kind, stale[1].Reason) + } +} + +func TestFinalizeCreateEmitsStorageStart(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + // FinalizeCreate requires an existing placeholder. + seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) + + info := &types.VM{ + ID: "vm1", + Hypervisor: b.Typ, + Config: types.VMConfig{Config: types.Config{CPU: 2, Memory: 2 << 30, Storage: 20 << 30}}, + } + if err := b.FinalizeCreate(ctx, "vm1", info, nil, nil); err != nil { + t.Fatalf("FinalizeCreate: %v", err) + } + entries := cap.Entries() + if len(entries) != 1 { + t.Fatalf("got %d entries, want 1", len(entries)) + } + e := entries[0] + if e.Kind != metering.KindVMStorageStart || e.VMID != "vm1" || e.Reason != metering.ReasonBoot { + t.Errorf("got %+v, want storage.start vm1 reason boot", e) + } + if e.Shape.StorageBytes != 20<<30 { + t.Errorf("got storage %d, want %d", e.Shape.StorageBytes, int64(20<<30)) + } +} + +func TestEmitNilSafeWithoutRecorder(t *testing.T) { + // b.Metering left nil — meter() must return NopRecorder so emit doesn't panic. + dir := t.TempDir() + locker := flock.New(filepath.Join(dir, "index.lock")) + store := storejson.New[VMIndex](filepath.Join(dir, "index.json"), locker) + b := &Backend{Typ: "test-hv", DB: store, Locker: locker} + + ctx := t.Context() + seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, false) + if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { + t.Errorf("BatchMarkStarted with nil Metering panicked/failed: %v", err) + } +} diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 5077fb89..3b8a3d1e 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -8,6 +8,7 @@ import ( "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) @@ -57,10 +58,15 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop return loadErr } sockPath := SocketPath(rec.RunDir) + // stoppedByUs distinguishes "we just killed a live process" from "the DB + // record was still Running but the process had already crashed"; the + // former yields ReasonStopUser, the latter ReasonStopCrash. + stoppedByUs := false if runningErr := b.WithRunningVM(ctx, &rec, func(_ int) error { if !force { return fmt.Errorf("running (force required)") } + stoppedByUs = true return stopOne(ctx, id) }); runningErr != nil && !errors.Is(runningErr, ErrNotRunning) { return fmt.Errorf("stop before delete: %w", runningErr) @@ -89,15 +95,52 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop if rmErr := RemoveVMDirs(rec.RunDir, rec.LogDir); rmErr != nil { return fmt.Errorf("cleanup VM dirs: %w", rmErr) } - return b.DB.Update(ctx, func(idx *VMIndex) error { + var ( + shape metering.Shape + hadRunningInterval bool + ) + // Capture state and shape inside the same transaction that deletes the + // record so a concurrent UpdateStates can't shift the truth between read + // and emit. + if err := b.DB.Update(ctx, func(idx *VMIndex) error { r := idx.VMs[id] if r == nil { return ErrNotFound } + hadRunningInterval = r.State == types.VMStateRunning + shape = shapeFromConfig(r.Config) delete(idx.Names, r.Config.Name) delete(idx.VMs, id) return nil + }); err != nil { + return err + } + now := time.Now() + // DeleteAll bypasses StopAll, so the compute.stop event must be emitted + // here whenever the record carried an open Running interval. + if hadRunningInterval { + reason := metering.ReasonStopCrash + if stoppedByUs { + reason = metering.ReasonStopUser + } + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMComputeStop, + VMID: id, + Reason: reason, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, + }) + } + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStop, + VMID: id, + Reason: metering.ReasonVMRemove, + Hypervisor: b.Typ, + Shape: shape, + EmittedAt: now, }) + return nil }) } diff --git a/metering/file.go b/metering/file.go index 4c534d45..f8fe4214 100644 --- a/metering/file.go +++ b/metering/file.go @@ -9,6 +9,11 @@ import ( "github.com/projecteru2/core/log" ) +// POSIX guarantees a single write(2) to an O_APPEND file is atomic relative +// to other writes from any process; concatenating JSON+newline into one Write +// keeps the ledger valid even when multiple cocoon CLI processes append +// concurrently. (Mutex below only serializes the writes from a single process.) + // FileRecorder appends JSON-encoded entries (one per line) to a file under sync.Mutex. type FileRecorder struct { mu sync.Mutex @@ -25,21 +30,17 @@ func NewFileRecorder(ctx context.Context, path string) Recorder { return &FileRecorder{f: f} } -// Emit marshals e and appends one line; write errors are logged and swallowed so the caller's state machine is never blocked. +// Emit marshals e and appends one line atomically; write errors are logged and swallowed so the caller's state machine is never blocked. func (r *FileRecorder) Emit(ctx context.Context, e Entry) { data, err := json.Marshal(e) if err != nil { log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "marshal entry: %v", err) return } + data = append(data, '\n') r.mu.Lock() defer r.mu.Unlock() - // Two writes are safe under mu.Lock; without the mutex the newline could interleave with another emit. if _, err := r.f.Write(data); err != nil { log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "write entry: %v", err) - return - } - if _, err := r.f.WriteString("\n"); err != nil { - log.WithFunc("metering.FileRecorder.Emit").Warnf(ctx, "write newline: %v", err) } } diff --git a/metering/metering.go b/metering/metering.go index 7fb989cd..cf44c74b 100644 --- a/metering/metering.go +++ b/metering/metering.go @@ -62,3 +62,11 @@ type NopRecorder struct{} // Emit is a no-op. func (NopRecorder) Emit(context.Context, Entry) {} + +// OrNop returns r unchanged when non-nil, NopRecorder otherwise so emit sites never have to nil-check. +func OrNop(r Recorder) Recorder { + if r == nil { + return NopRecorder{} + } + return r +} diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 837ae9b6..cf529329 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -13,6 +13,7 @@ import ( "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/lock" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/storage" "github.com/cocoonstack/cocoon/utils" @@ -67,6 +68,7 @@ func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[sn type snapshotMeta struct { name string + hypervisor string lastAccessed time.Time sizeBytes int64 } @@ -83,7 +85,7 @@ type snapshotGCSnapshot struct { func (s snapshotGCSnapshot) UsedBlobIDs() map[string]struct{} { return s.blobIDs } -func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker, policy EvictionPolicy) gc.Module[snapshotGCSnapshot] { +func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker lock.Locker, policy EvictionPolicy, recorder metering.Recorder) gc.Module[snapshotGCSnapshot] { return gc.Module[snapshotGCSnapshot]{ Name: "snapshot", Locker: locker, @@ -108,6 +110,7 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker } snap.records[id] = snapshotMeta{ name: rec.Name, + hypervisor: rec.Hypervisor, lastAccessed: rec.LastAccessedAt, sizeBytes: rec.SizeBytes, } @@ -150,6 +153,7 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker }, Collect: func(ctx context.Context, ids []string, snap snapshotGCSnapshot) error { logger := log.WithFunc("gc.snapshot") + meter := metering.OrNop(recorder) var ( errs []error removed = make([]string, 0, len(ids)) @@ -165,6 +169,17 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker } logEvictRow(ctx, logger, "collected", id, snap.records[id], snap.reasons[id]) removed = append(removed, id) + // Only emit stop for real records that had a corresponding start; + // orphan dirs and stale-pending IDs never opened a snap.storage interval. + if m, ok := snap.records[id]; ok { + meter.Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStop, + SnapshotID: id, + Reason: metering.ReasonSnapRemove, + Hypervisor: m.hypervisor, + EmittedAt: time.Now(), + }) + } } if err := cleanResolvedRecords(store, removed); err != nil { errs = append(errs, fmt.Errorf("clean DB records: %w", err)) diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index ebec41af..7c9c3378 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -9,6 +9,7 @@ import ( "testing" "time" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/types" ) @@ -145,7 +146,7 @@ func TestGCModule_LRUEndToEnd(t *testing.T) { } policy := EvictionPolicy{Enabled: true, MaxAge: 24 * time.Hour} - mod := gcModule(lf.conf, lf.store, lf.locker, policy) + mod := gcModule(lf.conf, lf.store, lf.locker, policy, metering.NopRecorder{}) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatalf("ReadDB: %v", err) @@ -185,7 +186,7 @@ func TestGCModule_DryRunNoEviction(t *testing.T) { } policy := EvictionPolicy{Enabled: true, DryRun: true} - mod := gcModule(lf.conf, lf.store, lf.locker, policy) + mod := gcModule(lf.conf, lf.store, lf.locker, policy, metering.NopRecorder{}) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatal(err) @@ -208,7 +209,7 @@ func TestGCModule_BareSnapshotEvictsAll(t *testing.T) { } } - mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}, metering.NopRecorder{}) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatal(err) @@ -303,7 +304,7 @@ func TestGCModule_RemovalFailureKeepsDBRecord(t *testing.T) { } } - mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}) + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}, metering.NopRecorder{}) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatalf("ReadDB: %v", err) @@ -334,7 +335,7 @@ func TestGCModule_OrphanDirCleaned(t *testing.T) { t.Fatal(err) } - mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}) + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}, metering.NopRecorder{}) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatal(err) @@ -350,3 +351,93 @@ func TestGCModule_OrphanDirCleaned(t *testing.T) { t.Errorf("orphan dir should be removed, stat err: %v", err) } } + +func TestGCModule_EvictRealRecordEmitsSnapStorageStop(t *testing.T) { + // Real (non-pending, non-orphan) records that get LRU-evicted must emit + // snap.storage.stop so the ledger interval closes; otherwise GC silently + // leaks an open snapshot interval forever. + lf := newTestLF(t) + ctx := t.Context() + + for _, name := range []string{"snap-a", "snap-b"} { + id := testID(t) + if _, err := lf.Create(ctx, &types.SnapshotConfig{ID: id, Name: name, Hypervisor: "cloud-hypervisor"}, + makeTar(t, map[string][]byte{"cow.raw": []byte("x")})); err != nil { + t.Fatalf("Create %s: %v", name, err) + } + } + + cap := &metering.CaptureRecorder{} + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}, cap) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if len(ids) != 2 { + t.Fatalf("want 2 ids to evict, got %v", ids) + } + if err := mod.Collect(ctx, ids, snap); err != nil { + t.Fatal(err) + } + + entries := cap.Entries() + if len(entries) != 2 { + t.Fatalf("got %d entries, want 2 (one stop per evicted record)", len(entries)) + } + for _, e := range entries { + if e.Kind != metering.KindSnapStorageStop { + t.Errorf("kind = %s, want snap.storage.stop", e.Kind) + } + if e.Reason != metering.ReasonSnapRemove { + t.Errorf("reason = %q, want snap-rm", e.Reason) + } + if e.Hypervisor != "cloud-hypervisor" { + t.Errorf("hypervisor = %q, want cloud-hypervisor", e.Hypervisor) + } + } +} + +func TestGCModule_OrphanAndStalePendingDoNotEmit(t *testing.T) { + // Neither orphan dirs (no DB record at all) nor stale-pending (record + // exists but never reached snap.storage.start) opened a ledger interval; + // GC must not emit a phantom snap.storage.stop for them. + lf := newTestLF(t) + ctx := t.Context() + + orphanDir := filepath.Join(lf.conf.DataDir(), "ORPHAN_ID") + if err := os.MkdirAll(orphanDir, 0o750); err != nil { + t.Fatal(err) + } + stalePendingID := testID(t) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + idx.Snapshots[stalePendingID] = &snapshot.SnapshotRecord{ + Snapshot: types.Snapshot{ + SnapshotConfig: types.SnapshotConfig{ID: stalePendingID}, + CreatedAt: time.Now().Add(-48 * time.Hour), + }, + Pending: true, + DataDir: filepath.Join(lf.conf.DataDir(), stalePendingID), + } + return nil + }); err != nil { + t.Fatal(err) + } + + cap := &metering.CaptureRecorder{} + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}, cap) + snap, err := mod.ReadDB(ctx) + if err != nil { + t.Fatal(err) + } + ids := mod.Resolve(ctx, snap, map[string]any{}) + if !slices.Contains(ids, "ORPHAN_ID") || !slices.Contains(ids, stalePendingID) { + t.Fatalf("want both orphan and stale-pending picked, got %v", ids) + } + if err := mod.Collect(ctx, ids, snap); err != nil { + t.Fatal(err) + } + if got := cap.Entries(); len(got) != 0 { + t.Errorf("got %d entries; orphan/stale-pending must not emit stop", len(got)) + } +} diff --git a/snapshot/localfile/import.go b/snapshot/localfile/import.go index 37f0f5dc..1ce8a7bf 100644 --- a/snapshot/localfile/import.go +++ b/snapshot/localfile/import.go @@ -11,6 +11,7 @@ import ( "path/filepath" "time" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -82,6 +83,13 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description return "", err } + lf.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStart, + SnapshotID: id, + Hypervisor: cfg.Hypervisor, + Shape: metering.Shape{StorageBytes: size}, + EmittedAt: now, + }) return id, nil } diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 4ba0bd2a..7b3399e4 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -14,6 +14,7 @@ import ( "github.com/cocoonstack/cocoon/gc" "github.com/cocoonstack/cocoon/lock" "github.com/cocoonstack/cocoon/lock/flock" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/storage" storejson "github.com/cocoonstack/cocoon/storage/json" @@ -43,10 +44,12 @@ type LocalFile struct { conf *Config store storage.Store[snapshot.SnapshotIndex] locker lock.Locker + metering metering.Recorder gcPolicy EvictionPolicy } -func New(conf *config.Config, opts ...Option) (*LocalFile, error) { +// New builds a LocalFile snapshot backend; rec may be nil and falls back to NopRecorder on emit. +func New(conf *config.Config, rec metering.Recorder, opts ...Option) (*LocalFile, error) { if conf == nil { return nil, fmt.Errorf("config is nil") } @@ -56,13 +59,18 @@ func New(conf *config.Config, opts ...Option) (*LocalFile, error) { } locker := flock.New(cfg.IndexLock()) store := storejson.New[snapshot.SnapshotIndex](cfg.IndexFile(), locker) - lf := &LocalFile{conf: cfg, store: store, locker: locker} + lf := &LocalFile{conf: cfg, store: store, locker: locker, metering: rec} for _, opt := range opts { opt(lf) } return lf, nil } +// meter returns lf.metering or NopRecorder so emit sites don't repeat nil checks. +func (lf *LocalFile) meter() metering.Recorder { + return metering.OrNop(lf.metering) +} + func (lf *LocalFile) Type() string { return typ } // DataDir returns the local data directory and snapshot config for direct file access. @@ -74,7 +82,7 @@ func (lf *LocalFile) DataDir(ctx context.Context, ref string) (string, types.Sna return rec.DataDir, snapshotRecordToConfig(rec), nil } -// Create stores a snapshot from stream via placeholder→extract→finalize so a mid-flight crash leaves only a pending record for GC. +// Create stores a snapshot from stream via placeholder→extract→finalize so a mid-flight crash leaves only a pending record for GC; emits metering snap.storage.start on success. func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stream io.Reader) (_ string, err error) { id := cfg.ID if id == "" { @@ -138,6 +146,13 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre return "", fmt.Errorf("finalize snapshot: %w", err) } + lf.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStart, + SnapshotID: id, + Hypervisor: cfg.Hypervisor, + Shape: metering.Shape{StorageBytes: size}, + EmittedAt: finalizedAt, + }) return id, nil } @@ -166,6 +181,7 @@ func (lf *LocalFile) Inspect(ctx context.Context, ref string) (*types.Snapshot, } // Delete processes each id atomically (rm dir → DB update). A mid-loop failure leaves any rm-OK-then-DB-fail id as a stale DB record; GC reclaims it. +// Delete removes snapshots by ref; emits metering snap.storage.stop per deleted id. func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error) { var ids []string if err := lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { @@ -181,11 +197,13 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { return deleted, fmt.Errorf("remove data dir %s: %w", id, err) } + var hypType string if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { rec := idx.Snapshots[id] if rec == nil { return nil } + hypType = rec.Hypervisor if rec.Name != "" { delete(idx.Names, rec.Name) } @@ -194,6 +212,13 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error }); err != nil { return deleted, fmt.Errorf("delete DB record %s: %w", id, err) } + lf.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStop, + SnapshotID: id, + Reason: metering.ReasonSnapRemove, + Hypervisor: hypType, + EmittedAt: time.Now(), + }) deleted = append(deleted, id) } return deleted, nil @@ -208,7 +233,7 @@ func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotCon } func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { - gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy)) + gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy, lf.metering)) } // rollbackCreate removes a placeholder snapshot record from the DB. diff --git a/snapshot/localfile/localfile_test.go b/snapshot/localfile/localfile_test.go index edd65885..fc776519 100644 --- a/snapshot/localfile/localfile_test.go +++ b/snapshot/localfile/localfile_test.go @@ -14,6 +14,7 @@ import ( "testing" "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -29,9 +30,15 @@ func testID(t *testing.T) string { // newTestLF creates a LocalFile backed by a temp directory. func newTestLF(t *testing.T) *LocalFile { + t.Helper() + return newTestLFWithRecorder(t, metering.NopRecorder{}) +} + +// newTestLFWithRecorder lets tests inject a CaptureRecorder for emit assertions. +func newTestLFWithRecorder(t *testing.T, rec metering.Recorder) *LocalFile { t.Helper() dir := t.TempDir() - lf, err := New(&config.Config{RootDir: dir}) + lf, err := New(&config.Config{RootDir: dir}, rec) if err != nil { t.Fatalf("New: %v", err) } @@ -64,7 +71,7 @@ func makeTar(t *testing.T, files map[string][]byte) *bytes.Buffer { func TestNew(t *testing.T) { dir := t.TempDir() - lf, err := New(&config.Config{RootDir: dir}) + lf, err := New(&config.Config{RootDir: dir}, metering.NopRecorder{}) if err != nil { t.Fatalf("New: %v", err) } @@ -74,7 +81,7 @@ func TestNew(t *testing.T) { } func TestNew_NilConfig(t *testing.T) { - _, err := New(nil) + _, err := New(nil, metering.NopRecorder{}) if err == nil { t.Fatal("expected error for nil config") } @@ -82,6 +89,76 @@ func TestNew_NilConfig(t *testing.T) { // Create +func TestCreateAndDeleteEmitMetering(t *testing.T) { + cap := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, cap) + ctx := t.Context() + + id, err := lf.Create(ctx, &types.SnapshotConfig{ + ID: testID(t), + Name: "metered-snap", + Hypervisor: "cloud-hypervisor", + }, makeTar(t, map[string][]byte{"cow.raw": []byte("disk")})) + if err != nil { + t.Fatalf("Create: %v", err) + } + + entries := cap.Entries() + if len(entries) != 1 { + t.Fatalf("after Create: got %d entries, want 1", len(entries)) + } + if entries[0].Kind != metering.KindSnapStorageStart || entries[0].SnapshotID != id || + entries[0].Hypervisor != "cloud-hypervisor" || entries[0].Shape.StorageBytes <= 0 { + t.Errorf("snap.storage.start entry wrong: %+v", entries[0]) + } + + if _, err := lf.Delete(ctx, []string{id}); err != nil { + t.Fatalf("Delete: %v", err) + } + entries = cap.Entries() + if len(entries) != 2 { + t.Fatalf("after Delete: got %d entries, want 2", len(entries)) + } + if entries[1].Kind != metering.KindSnapStorageStop || entries[1].SnapshotID != id || + entries[1].Reason != metering.ReasonSnapRemove || entries[1].Hypervisor != "cloud-hypervisor" { + t.Errorf("snap.storage.stop entry wrong: %+v", entries[1]) + } +} + +func TestImportEmitsSnapStorageStart(t *testing.T) { + cap := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, cap) + ctx := t.Context() + + envelope, err := snapshot.MarshalEnvelope(types.SnapshotConfig{ + ID: "src-snap", + Name: "src-name", + Hypervisor: "cloud-hypervisor", + }) + if err != nil { + t.Fatalf("MarshalEnvelope: %v", err) + } + stream := makeTar(t, map[string][]byte{ + snapshot.SnapshotJSONName: envelope, + "cow.raw": []byte("disk-data"), + }) + + id, err := lf.Import(ctx, stream, "imported", "from test") + if err != nil { + t.Fatalf("Import: %v", err) + } + + entries := cap.Entries() + if len(entries) != 1 { + t.Fatalf("got %d entries, want 1 (snap.storage.start)", len(entries)) + } + e := entries[0] + if e.Kind != metering.KindSnapStorageStart || e.SnapshotID != id || + e.Hypervisor != "cloud-hypervisor" || e.Shape.StorageBytes <= 0 { + t.Errorf("entry wrong: %+v", e) + } +} + func TestCreate(t *testing.T) { lf := newTestLF(t) ctx := t.Context() From da4e12c1737c666755c06c18ebbb9a770ee60526 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 11:20:10 +0800 Subject: [PATCH 03/13] fix(metering): suppress phantom snap.storage.stop on concurrent rm race Two cocoon processes racing snapshot rm: flock serializes the store.Update closures, so the loser's closure sees a nil rec. Previously the loser still emitted a snap.storage.stop with an empty Hypervisor field, duplicating the winner's correct event. - Extract Delete's per-id loop body into deleteOne (also testable). - Track deletedRecord via a captured bool in the closure; emit only when this call actually committed the delete. Caller still gets the id in 'deleted' because the on-disk state is correct (data is gone). --- snapshot/localfile/localfile.go | 52 +++++++++++++++++++--------- snapshot/localfile/localfile_test.go | 50 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 7b3399e4..fa86d3a1 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -194,24 +194,43 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error var deleted []string for _, id := range ids { - if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { - return deleted, fmt.Errorf("remove data dir %s: %w", id, err) + if err := lf.deleteOne(ctx, id); err != nil { + return deleted, err } - var hypType string - if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { - rec := idx.Snapshots[id] - if rec == nil { - return nil - } - hypType = rec.Hypervisor - if rec.Name != "" { - delete(idx.Names, rec.Name) - } - delete(idx.Snapshots, id) + deleted = append(deleted, id) + } + return deleted, nil +} + +// deleteOne removes one snapshot's data dir + DB record. Idempotent under +// concurrent rm of the same id: if a rival process won the race to delete +// the record, the Update closure sees a nil rec, we still report success to +// the caller (data is gone), but we skip emit so the rival's stop event is +// the only one in the ledger (no phantom with an empty Hypervisor). +func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { + if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { + return fmt.Errorf("remove data dir %s: %w", id, err) + } + var ( + hypType string + deletedRecord bool + ) + if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { + rec := idx.Snapshots[id] + if rec == nil { return nil - }); err != nil { - return deleted, fmt.Errorf("delete DB record %s: %w", id, err) } + deletedRecord = true + hypType = rec.Hypervisor + if rec.Name != "" { + delete(idx.Names, rec.Name) + } + delete(idx.Snapshots, id) + return nil + }); err != nil { + return fmt.Errorf("delete DB record %s: %w", id, err) + } + if deletedRecord { lf.meter().Emit(ctx, metering.Entry{ Kind: metering.KindSnapStorageStop, SnapshotID: id, @@ -219,9 +238,8 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error Hypervisor: hypType, EmittedAt: time.Now(), }) - deleted = append(deleted, id) } - return deleted, nil + return nil } func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotConfig, io.ReadCloser, error) { diff --git a/snapshot/localfile/localfile_test.go b/snapshot/localfile/localfile_test.go index fc776519..2e072198 100644 --- a/snapshot/localfile/localfile_test.go +++ b/snapshot/localfile/localfile_test.go @@ -125,6 +125,56 @@ func TestCreateAndDeleteEmitMetering(t *testing.T) { } } +func TestDeleteOneIdempotentDoesNotEmitTwice(t *testing.T) { + // Two cocoon processes racing snapshot rm: flock serializes the store.Update + // closures, so the loser's closure sees a nil rec. The loser must still + // report success to its caller (the data is gone), but must NOT emit a + // phantom snap.storage.stop with an empty Hypervisor field. We exercise this + // by calling deleteOne twice on the same id (idempotent), simulating the + // loser running its loop body after the winner already committed. + cap := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, cap) + ctx := t.Context() + + id, err := lf.Create(ctx, &types.SnapshotConfig{ + ID: testID(t), Name: "raced", Hypervisor: "cloud-hypervisor", + }, makeTar(t, map[string][]byte{"cow.raw": []byte("x")})) + if err != nil { + t.Fatalf("Create: %v", err) + } + + if err := lf.deleteOne(ctx, id); err != nil { + t.Fatalf("first deleteOne: %v", err) + } + if err := lf.deleteOne(ctx, id); err != nil { + t.Fatalf("second deleteOne (idempotent): %v", err) + } + + // Ledger should hold exactly 2 entries: Create's start and the FIRST + // deleteOne's stop. The second call must not contribute a phantom event. + entries := cap.Entries() + if len(entries) != 2 { + t.Fatalf("got %d entries, want 2 (start + 1× stop); kinds = %v", len(entries), kinds(entries)) + } + if entries[0].Kind != metering.KindSnapStorageStart { + t.Errorf("entries[0] kind = %s, want snap.storage.start", entries[0].Kind) + } + if entries[1].Kind != metering.KindSnapStorageStop { + t.Errorf("entries[1] kind = %s, want snap.storage.stop", entries[1].Kind) + } + if entries[1].Hypervisor != "cloud-hypervisor" { + t.Errorf("stop entry has Hypervisor=%q; phantom emits leak as empty", entries[1].Hypervisor) + } +} + +func kinds(entries []metering.Entry) []metering.Kind { + out := make([]metering.Kind, len(entries)) + for i, e := range entries { + out[i] = e.Kind + } + return out +} + func TestImportEmitsSnapStorageStart(t *testing.T) { cap := &metering.CaptureRecorder{} lf := newTestLFWithRecorder(t, cap) From 11c2014981a2e1293135bf6450685f3522e8ad7f Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 11:36:05 +0800 Subject: [PATCH 04/13] refactor(metering): extract emit helpers, compress comments, tighten timestamp consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKILL.md rule-by-rule pass on the branch: Helpers extracted in hypervisor/metering.go: - makeEntry: collapses the 6-field Entry literal at 4 emit sites (UpdateStates, BatchMarkStarted ×2, FinalizeCreate). - emitAll: fans out a batch of entries through a single meter() lookup (BatchMarkStarted, UpdateStates). - emitOpenInterval: storage.start + compute.start pair used by both FinalizeClone and emitRestoreSuccess; takes now from caller to keep the timestamp consistent with adjacent close events. - emitDeleteClose: storage.stop unconditionally + compute.stop when the record had an open Running interval (DeleteAll body shrinks by ~20 lines). Time-skew fix: emitRestoreSuccess now captures one now and passes it to emitOpenInterval so storage.stop and the open pair share a timestamp. Both emitOpenInterval and emitDeleteClose cache rec := b.meter() before their emit loops (matches emitAll), removing per-emit interface nil-check. Comment cleanup across the branch: - Drop restate-code godocs (NopRecorder.Emit, CaptureRecorder.Emit, hypervisor.meter, hypervisor.shapeFromConfig). - Collapse multi-line godocs to one line where the second line was a paraphrase of the first (FinalizeCreate, FinalizeClone, FinalizeRestore, UpdateStates, StartAll, BatchMarkStarted, emitRestoreComputeStop, Delete, deleteOne, FileRecorder.Emit, metering/file.go header block). - Drop RestoreSequence inline comment that duplicated emitRestoreComputeStop's godoc. - Merge duplicate Delete godoc in snapshot/localfile/localfile.go. File ordering: snapshot/localfile/gc.go — backfillSizeBytes moved next to its sibling helpers (after pickLRU/logEvictRow, before cleanResolvedRecords), snapshotMeta moved up so types appear before the funcs that use them. --- hypervisor/clone.go | 27 ++------------ hypervisor/create.go | 12 ++----- hypervisor/metering.go | 49 +++++++++++++++++++++++-- hypervisor/restore.go | 24 +++---------- hypervisor/start.go | 7 ++-- hypervisor/state.go | 51 ++++++-------------------- hypervisor/stop.go | 36 ++++--------------- metering/capture.go | 3 +- metering/file.go | 8 ++--- metering/metering.go | 1 - snapshot/localfile/gc.go | 63 +++++++++++++++++---------------- snapshot/localfile/localfile.go | 10 ++---- 12 files changed, 111 insertions(+), 180 deletions(-) diff --git a/hypervisor/clone.go b/hypervisor/clone.go index 6d7b9640..289a637e 100644 --- a/hypervisor/clone.go +++ b/hypervisor/clone.go @@ -35,7 +35,7 @@ func (b *Backend) CloneSetup(ctx context.Context, vmID string, vmCfg *types.VMCo return runDir, logDir, now, cleanup, nil } -// AfterExtractFn finalizes a cloned VM after snapshot files are in place; sourceSnapshotID flows through to the metering Entry so downstream can trace lineage. +// AfterExtractFn finalizes a cloned VM after snapshot files are in place; sourceSnapshotID flows through for metering lineage. type AfterExtractFn func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) // DirectCloneBase clones from a local snapshot directory. Used when the snapshot lives on the same host (no tar streaming needed). @@ -85,9 +85,7 @@ func (b *Backend) CloneFromStream( return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now, snapshotConfig.ID) } -// FinalizeClone updates the cloned VM's record in place after restore-and-resume. -// Emits metering vm.storage.start + vm.compute.start with reason clone so the -// new VM has an opening interval even though it skipped Create/BatchMarkStarted. +// FinalizeClone persists the cloned VM record and emits the open-interval pair (storage.start + compute.start, reason=clone). func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}, sourceSnapshotID string) error { if err := b.DB.Update(ctx, func(idx *VMIndex) error { r, err := idx.GetRecord(vmID) @@ -104,25 +102,6 @@ func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM }); err != nil { return err } - now := time.Now() - shape := shapeFromConfig(info.Config) - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStart, - VMID: vmID, - SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonClone, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMComputeStart, - VMID: vmID, - SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonClone, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) + b.emitOpenInterval(ctx, info, metering.ReasonClone, sourceSnapshotID, time.Now()) return nil } diff --git a/hypervisor/create.go b/hypervisor/create.go index b5c16e08..2ecdba71 100644 --- a/hypervisor/create.go +++ b/hypervisor/create.go @@ -49,8 +49,7 @@ func (b *Backend) RollbackCreate(ctx context.Context, id, name string) { } } -// FinalizeCreate writes a populated VM record to DB, replacing the placeholder. -// Emits metering vm.storage.start once the record is persisted. +// FinalizeCreate persists the populated VM record (replacing the placeholder) and emits metering vm.storage.start. func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}) error { if err := b.DB.Update(ctx, func(idx *VMIndex) error { existing, err := idx.GetRecord(id) @@ -68,14 +67,7 @@ func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, }); err != nil { return err } - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStart, - VMID: id, - Reason: metering.ReasonBoot, - Hypervisor: b.Typ, - Shape: shapeFromConfig(info.Config), - EmittedAt: time.Now(), - }) + b.meter().Emit(ctx, b.makeEntry(metering.KindVMStorageStart, id, metering.ReasonBoot, shapeFromConfig(info.Config), time.Now())) return nil } diff --git a/hypervisor/metering.go b/hypervisor/metering.go index 0361cf00..e5904b24 100644 --- a/hypervisor/metering.go +++ b/hypervisor/metering.go @@ -1,16 +1,17 @@ package hypervisor import ( + "context" + "time" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" ) -// meter returns Backend.Metering or NopRecorder so emit sites can call .Emit without nil-checking. func (b *Backend) meter() metering.Recorder { return metering.OrNop(b.Metering) } -// shapeFromConfig builds a metering.Shape from a VMConfig's billable fields. func shapeFromConfig(c types.VMConfig) metering.Shape { return metering.Shape{ CPU: c.CPU, @@ -18,3 +19,47 @@ func shapeFromConfig(c types.VMConfig) metering.Shape { StorageBytes: c.Storage, } } + +// makeEntry builds a VM-scoped entry stamped with this backend's Hypervisor type; emit sites that don't need SourceSnapshotID use it to skip the field-by-field boilerplate. +func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Reason, shape metering.Shape, now time.Time) metering.Entry { + return metering.Entry{ + Kind: kind, VMID: vmID, Reason: reason, + Hypervisor: b.Typ, Shape: shape, EmittedAt: now, + } +} + +// emitAll fans out a batch of entries through one Recorder lookup. +func (b *Backend) emitAll(ctx context.Context, entries []metering.Entry) { + rec := b.meter() + for _, e := range entries { + rec.Emit(ctx, e) + } +} + +// emitOpenInterval emits the storage.start + compute.start pair that opens a fresh interval for cloned or restored VMs; the caller's now keeps the timestamp consistent with adjacent close events. +func (b *Backend) emitOpenInterval(ctx context.Context, vm *types.VM, reason metering.Reason, sourceSnapshotID string, now time.Time) { + rec := b.meter() + shape := shapeFromConfig(vm.Config) + for _, kind := range []metering.Kind{metering.KindVMStorageStart, metering.KindVMComputeStart} { + rec.Emit(ctx, metering.Entry{ + Kind: kind, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, + Reason: reason, Hypervisor: b.Typ, Shape: shape, EmittedAt: now, + }) + } +} + +// emitDeleteClose emits storage.stop unconditionally and compute.stop when the record had an open Running interval. +func (b *Backend) emitDeleteClose(ctx context.Context, vmID string, shape metering.Shape, computeReason metering.Reason, hadRunningInterval bool) { + now := time.Now() + rec := b.meter() + if hadRunningInterval { + rec.Emit(ctx, metering.Entry{ + Kind: metering.KindVMComputeStop, VMID: vmID, Reason: computeReason, + Hypervisor: b.Typ, Shape: shape, EmittedAt: now, + }) + } + rec.Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStop, VMID: vmID, Reason: metering.ReasonVMRemove, + Hypervisor: b.Typ, Shape: shape, EmittedAt: now, + }) +} diff --git a/hypervisor/restore.go b/hypervisor/restore.go index 5a163a16..20914f29 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -40,9 +40,7 @@ func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, return vmID, &rec, nil } -// FinalizeRestore updates DB and assembles the returned VM after restore. Metering -// emit lives in RestoreSequence/DirectRestoreSequence so the prior compute interval -// is closed at the kill boundary, not only on full-restore success. +// FinalizeRestore updates DB and assembles the returned VM; metering lives in (Direct)RestoreSequence so the close fires at the kill boundary, not only on full success. func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord, pid int) (*types.VM, error) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { @@ -69,10 +67,7 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types return &info, nil } -// emitRestoreComputeStop closes only the prior compute interval at the kill boundary. -// Storage is NOT closed here because the on-disk files survive: a downstream restore -// failure leaves the VM in Error state with its old storage intact, and vm rm will -// later close the storage interval with reason vm-rm. +// emitRestoreComputeStop closes only the compute interval at the kill boundary; storage stays open so a restore failure leaves on-disk files intact for vm rm to close later. func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldShape metering.Shape, sourceSnapshotID string) { b.meter().Emit(ctx, metering.Entry{ Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, @@ -80,21 +75,14 @@ func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldSh }) } -// emitRestoreSuccess emits the storage shape transition and reopens the compute -// interval; called only after the restore sequence has fully succeeded. +// emitRestoreSuccess closes the old storage interval and opens fresh storage+compute; called only after restore fully succeeds. func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { now := time.Now() - newShape := shapeFromConfig(vm.Config) b.meter().Emit(ctx, metering.Entry{ Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, }) - for _, kind := range []metering.Kind{metering.KindVMStorageStart, metering.KindVMComputeStart} { - b.meter().Emit(ctx, metering.Entry{ - Kind: kind, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: newShape, EmittedAt: now, - }) - } + b.emitOpenInterval(ctx, vm, metering.ReasonRestore, sourceSnapshotID, now) } // RestoreSequence is the shared restore skeleton. Staging happens before the kill so a preflight failure leaves the original VM running. @@ -120,10 +108,6 @@ func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec Restor if killErr := spec.Kill(ctx, vmID, rec); killErr != nil { return nil, killErr } - // kill succeeded → the prior compute interval is over regardless of restore - // outcome; emit compute.stop immediately. Storage stays open: if restore - // fails the on-disk files are still the old shape and vm rm later closes - // the storage interval with reason vm-rm. b.emitRestoreComputeStop(ctx, vmID, oldShape, spec.SourceSnapshotID) var result *types.VM diff --git a/hypervisor/start.go b/hypervisor/start.go index ced97b48..f1476cc9 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -12,11 +12,8 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// StartAll runs startOne for each ref and batch-flips the actually-launched set to Running. -// startOne returns (launched, err): launched=true iff a fresh process was started; -// false means PrepareStart was a no-op (truly already running). Only launched ids -// reach BatchMarkStarted, so an already-running VM doesn't open a duplicate interval -// while a stale-running record that was actually relaunched still does. +// StartAll runs startOne per ref; only ids that returned launched=true reach BatchMarkStarted, +// so already-running no-ops don't open duplicate intervals. func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) (bool, error)) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { diff --git a/hypervisor/state.go b/hypervisor/state.go index 28899cb7..2b8be1bf 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -88,8 +88,7 @@ func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume }) } -// UpdateStates batch-updates the State field for ids; sets StartedAt/StoppedAt as appropriate. -// Emits metering vm.compute.stop on Running→Stopped transitions; other state transitions do not emit. +// UpdateStates batch-updates State + StartedAt/StoppedAt; emits metering vm.compute.stop only on Running→Stopped. func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VMState) error { if len(ids) == 0 { return nil @@ -110,17 +109,9 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM r.StartedAt = &now case types.VMStateStopped: r.StoppedAt = &now - // Only emit when we actually closed a Running interval; idempotent - // stops (already-Stopped, Error→Stopped) would create phantom intervals. + // Only Running→Stopped closes a real interval; idempotent stops would emit a phantom. if oldState == types.VMStateRunning { - stopped = append(stopped, metering.Entry{ - Kind: metering.KindVMComputeStop, - VMID: id, - Reason: metering.ReasonStopUser, - Hypervisor: b.Typ, - Shape: shapeFromConfig(r.Config), - EmittedAt: now, - }) + stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopUser, shapeFromConfig(r.Config), now)) } } } @@ -128,9 +119,7 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM }); err != nil { return err } - for _, e := range stopped { - b.meter().Emit(ctx, e) - } + b.emitAll(ctx, stopped) return nil } @@ -141,13 +130,9 @@ func (b *Backend) MarkError(ctx context.Context, id string) { } } -// BatchMarkStarted flips ids to VMStateRunning and stamps FirstBooted=true in one DB write. -// Caller MUST pass only ids that were actually launched in this call (no-op -// already-Running VMs must be filtered out by the caller). If the DB still -// shows Running when an id arrives here, it's a stale-running record whose -// process had crashed; close that orphan interval with reason stop-crash -// before opening the fresh interval (no precise crash time available — use -// the relaunch boundary). +// BatchMarkStarted flips ids to VMStateRunning. Caller MUST pass only actually-launched ids; +// an id arriving here with DB State==Running is a stale-running record (process had crashed) — +// close that orphan with reason stop-crash before opening the fresh interval. func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { if len(ids) == 0 { return nil @@ -162,27 +147,13 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { } shape := shapeFromConfig(r.Config) if r.State == types.VMStateRunning { - emits = append(emits, metering.Entry{ - Kind: metering.KindVMComputeStop, - VMID: id, - Reason: metering.ReasonStopCrash, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) + emits = append(emits, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shape, now)) } reason := metering.ReasonBoot if r.FirstBooted { reason = metering.ReasonRestart } - emits = append(emits, metering.Entry{ - Kind: metering.KindVMComputeStart, - VMID: id, - Reason: reason, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) + emits = append(emits, b.makeEntry(metering.KindVMComputeStart, id, reason, shape, now)) r.State = types.VMStateRunning r.StartedAt = &now r.UpdatedAt = now @@ -192,9 +163,7 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { }); err != nil { return err } - for _, e := range emits { - b.meter().Emit(ctx, e) - } + b.emitAll(ctx, emits) return nil } diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 3b8a3d1e..59c97c8e 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -58,9 +58,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop return loadErr } sockPath := SocketPath(rec.RunDir) - // stoppedByUs distinguishes "we just killed a live process" from "the DB - // record was still Running but the process had already crashed"; the - // former yields ReasonStopUser, the latter ReasonStopCrash. + // stoppedByUs distinguishes user-stop (ReasonStopUser) from orphan-crash cleanup (ReasonStopCrash). stoppedByUs := false if runningErr := b.WithRunningVM(ctx, &rec, func(_ int) error { if !force { @@ -99,9 +97,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop shape metering.Shape hadRunningInterval bool ) - // Capture state and shape inside the same transaction that deletes the - // record so a concurrent UpdateStates can't shift the truth between read - // and emit. + // Capture in the same transaction as delete so a concurrent UpdateStates can't shift the truth. if err := b.DB.Update(ctx, func(idx *VMIndex) error { r := idx.VMs[id] if r == nil { @@ -115,31 +111,11 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop }); err != nil { return err } - now := time.Now() - // DeleteAll bypasses StopAll, so the compute.stop event must be emitted - // here whenever the record carried an open Running interval. - if hadRunningInterval { - reason := metering.ReasonStopCrash - if stoppedByUs { - reason = metering.ReasonStopUser - } - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMComputeStop, - VMID: id, - Reason: reason, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) + computeReason := metering.ReasonStopCrash + if stoppedByUs { + computeReason = metering.ReasonStopUser } - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStop, - VMID: id, - Reason: metering.ReasonVMRemove, - Hypervisor: b.Typ, - Shape: shape, - EmittedAt: now, - }) + b.emitDeleteClose(ctx, id, shape, computeReason, hadRunningInterval) return nil }) } diff --git a/metering/capture.go b/metering/capture.go index 3e877a60..dce92b4d 100644 --- a/metering/capture.go +++ b/metering/capture.go @@ -11,14 +11,13 @@ type CaptureRecorder struct { entries []Entry } -// Emit appends e to the buffer. func (r *CaptureRecorder) Emit(_ context.Context, e Entry) { r.mu.Lock() defer r.mu.Unlock() r.entries = append(r.entries, e) } -// Entries returns a snapshot copy of accumulated entries. +// Entries returns a snapshot copy so callers can mutate freely. func (r *CaptureRecorder) Entries() []Entry { r.mu.Lock() defer r.mu.Unlock() diff --git a/metering/file.go b/metering/file.go index f8fe4214..ed18da06 100644 --- a/metering/file.go +++ b/metering/file.go @@ -9,12 +9,8 @@ import ( "github.com/projecteru2/core/log" ) -// POSIX guarantees a single write(2) to an O_APPEND file is atomic relative -// to other writes from any process; concatenating JSON+newline into one Write -// keeps the ledger valid even when multiple cocoon CLI processes append -// concurrently. (Mutex below only serializes the writes from a single process.) - // FileRecorder appends JSON-encoded entries (one per line) to a file under sync.Mutex. +// POSIX guarantees single write(2) to O_APPEND is atomic across processes; the mutex serializes the in-process writes. type FileRecorder struct { mu sync.Mutex f *os.File @@ -30,7 +26,7 @@ func NewFileRecorder(ctx context.Context, path string) Recorder { return &FileRecorder{f: f} } -// Emit marshals e and appends one line atomically; write errors are logged and swallowed so the caller's state machine is never blocked. +// Emit logs and swallows write errors so the caller's state machine is never blocked. func (r *FileRecorder) Emit(ctx context.Context, e Entry) { data, err := json.Marshal(e) if err != nil { diff --git a/metering/metering.go b/metering/metering.go index cf44c74b..69135bd8 100644 --- a/metering/metering.go +++ b/metering/metering.go @@ -60,7 +60,6 @@ type Recorder interface { // NopRecorder discards every entry; zero value is usable. type NopRecorder struct{} -// Emit is a no-op. func (NopRecorder) Emit(context.Context, Entry) {} // OrNop returns r unchanged when non-nil, NopRecorder otherwise so emit sites never have to nil-check. diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index cf529329..663c2c58 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -35,37 +35,6 @@ func (p EvictionPolicy) hasCriteria() bool { return p.KeepLast > 0 || p.MaxAge > 0 || p.MaxSize > 0 } -func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { - logger := log.WithFunc("gc.snapshot") - var changed bool - for id, m := range records { - if m.sizeBytes > 0 { - continue - } - actual, err := utils.DirSize(conf.SnapshotDataDir(id)) - if err != nil { - logger.Warnf(ctx, "DirSize for %s: %v", id, err) - continue - } - m.sizeBytes = actual - records[id] = m - changed = true - } - if !changed { - return - } - if err := store.WriteRaw(func(idx *snapshot.SnapshotIndex) error { - for id, m := range records { - if r := idx.Snapshots[id]; r != nil && r.SizeBytes != m.sizeBytes { - r.SizeBytes = m.sizeBytes - } - } - return nil - }); err != nil { - logger.Warnf(ctx, "persist backfilled SizeBytes: %v", err) - } -} - type snapshotMeta struct { name string hypervisor string @@ -263,6 +232,38 @@ func logEvictRow(ctx context.Context, logger *log.Fields, verb, id string, m sna verb, id, m.name, m.sizeBytes, accessed, reason) } +// backfillSizeBytes fills in sizeBytes for any record whose SizeBytes wasn't persisted, then writes the resolved values back so future GC runs can skip the du. +func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { + logger := log.WithFunc("gc.snapshot") + var changed bool + for id, m := range records { + if m.sizeBytes > 0 { + continue + } + actual, err := utils.DirSize(conf.SnapshotDataDir(id)) + if err != nil { + logger.Warnf(ctx, "DirSize for %s: %v", id, err) + continue + } + m.sizeBytes = actual + records[id] = m + changed = true + } + if !changed { + return + } + if err := store.WriteRaw(func(idx *snapshot.SnapshotIndex) error { + for id, m := range records { + if r := idx.Snapshots[id]; r != nil && r.SizeBytes != m.sizeBytes { + r.SizeBytes = m.sizeBytes + } + } + return nil + }); err != nil { + logger.Warnf(ctx, "persist backfilled SizeBytes: %v", err) + } +} + // cleanResolvedRecords drops resolved records; pending only past grace. func cleanResolvedRecords(store storage.Store[snapshot.SnapshotIndex], ids []string) error { if len(ids) == 0 { diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index fa86d3a1..e41b1a70 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -66,7 +66,6 @@ func New(conf *config.Config, rec metering.Recorder, opts ...Option) (*LocalFile return lf, nil } -// meter returns lf.metering or NopRecorder so emit sites don't repeat nil checks. func (lf *LocalFile) meter() metering.Recorder { return metering.OrNop(lf.metering) } @@ -180,8 +179,7 @@ func (lf *LocalFile) Inspect(ctx context.Context, ref string) (*types.Snapshot, return &s, nil } -// Delete processes each id atomically (rm dir → DB update). A mid-loop failure leaves any rm-OK-then-DB-fail id as a stale DB record; GC reclaims it. -// Delete removes snapshots by ref; emits metering snap.storage.stop per deleted id. +// Delete removes each ref atomically (rm dir → DB update) and emits snap.storage.stop per deleted id; a mid-loop rm-OK-then-DB-fail leaves a stale DB record that GC reclaims. func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error) { var ids []string if err := lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { @@ -202,11 +200,7 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error return deleted, nil } -// deleteOne removes one snapshot's data dir + DB record. Idempotent under -// concurrent rm of the same id: if a rival process won the race to delete -// the record, the Update closure sees a nil rec, we still report success to -// the caller (data is gone), but we skip emit so the rival's stop event is -// the only one in the ledger (no phantom with an empty Hypervisor). +// deleteOne removes one snapshot atomically; idempotent under concurrent rm — if the rival wins the DB race we report success (data is gone) but skip emit, so the ledger holds exactly one stop event per snapshot. func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { return fmt.Errorf("remove data dir %s: %w", id, err) From 92a509083b328cf77296aba2cea021a895946500 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 11:59:04 +0800 Subject: [PATCH 05/13] refactor: share startOne/stopOne/NewBackend skeletons - hypervisor.StartSequence/StopOneSequence/NewBackend collapse the CH+FC per-id start/stop and ctor boilerplate into ~10-line backend shims - snapshot/localfile/metering.go centralizes the 4 Create/Import/Delete/GC snapshot.{start,stop} emit sites - compress 3 multi-line godoc blocks to single lines per project style --- hypervisor/backend.go | 33 +++++++++++++++++ hypervisor/cloudhypervisor/cloudhypervisor.go | 20 +++-------- hypervisor/cloudhypervisor/start.go | 35 +++++-------------- hypervisor/cloudhypervisor/stop.go | 24 +++++-------- hypervisor/firecracker/firecracker.go | 20 +++-------- hypervisor/firecracker/start.go | 35 +++++-------------- hypervisor/firecracker/stop.go | 24 +++++-------- hypervisor/start.go | 33 +++++++++++++++-- hypervisor/state.go | 4 +-- hypervisor/state_test.go | 2 ++ hypervisor/stop.go | 13 +++++++ metering/file.go | 3 +- snapshot/localfile/gc.go | 11 ++---- snapshot/localfile/import.go | 9 +---- snapshot/localfile/localfile.go | 16 ++------- snapshot/localfile/metering.go | 22 ++++++++++++ 16 files changed, 151 insertions(+), 153 deletions(-) create mode 100644 snapshot/localfile/metering.go diff --git a/hypervisor/backend.go b/hypervisor/backend.go index 320ff989..c90fabfc 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -2,14 +2,17 @@ package hypervisor import ( "context" + "fmt" "io" "net/http" "os/exec" "time" "github.com/cocoonstack/cocoon/lock" + "github.com/cocoonstack/cocoon/lock/flock" "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/storage" + storejson "github.com/cocoonstack/cocoon/storage/json" "github.com/cocoonstack/cocoon/types" ) @@ -50,6 +53,8 @@ type BackendConfig interface { SocketWaitTimeout() time.Duration EffectivePoolSize() int IndexFile() string + IndexLock() string + EnsureDirs() error RunDir() string LogDir() string VMRunDir(id string) string @@ -65,6 +70,21 @@ type Backend struct { Metering metering.Recorder } +// NewBackend wires the shared init boilerplate: EnsureDirs, flock, JSON store. Each backend's New just builds its Config and forwards it here. +func NewBackend(typ string, conf BackendConfig, rec metering.Recorder) (*Backend, error) { + if err := conf.EnsureDirs(); err != nil { + return nil, fmt.Errorf("ensure dirs: %w", err) + } + locker := flock.New(conf.IndexLock()) + return &Backend{ + Typ: typ, + Conf: conf, + DB: storejson.New[VMIndex](conf.IndexFile(), locker), + Locker: locker, + Metering: rec, + }, nil +} + // LaunchSpec is the per-call input to Backend.LaunchVMProcess. Shared // BinaryName / SocketWaitTimeout come from BackendConfig. type LaunchSpec struct { @@ -99,6 +119,19 @@ type DirectRestoreSpec struct { AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } +// StartSpec carries StartSequence inputs: Launch builds + exec the VMM, PostLaunch is optional config-via-REST that the backend rolls back via AbortLaunch on failure. +type StartSpec struct { + RuntimeFiles []string + Launch func(ctx context.Context, rec *VMRecord, sockPath string) (int, error) + PostLaunch func(ctx context.Context, rec *VMRecord, sockPath string, pid int) error +} + +// StopSpec carries StopOneSequence inputs: Shutdown picks the backend-specific path (force vs graceful) once WithRunningVM confirms the process is live. +type StopSpec struct { + RuntimeFiles []string + Shutdown func(ctx context.Context, rec *VMRecord, sockPath string, pid int) error +} + // CreateSpec carries CreateSequence inputs; Prepare returns final storage configs (COW + data disks). type CreateSpec struct { VMCfg *types.VMConfig diff --git a/hypervisor/cloudhypervisor/cloudhypervisor.go b/hypervisor/cloudhypervisor/cloudhypervisor.go index 04943582..e1b31945 100644 --- a/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -6,9 +6,7 @@ import ( "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/hypervisor" - "github.com/cocoonstack/cocoon/lock/flock" "github.com/cocoonstack/cocoon/metering" - storejson "github.com/cocoonstack/cocoon/storage/json" ) const typ = "cloud-hypervisor" @@ -32,21 +30,11 @@ func New(conf *config.Config, rec metering.Recorder) (*CloudHypervisor, error) { return nil, fmt.Errorf("config is nil") } cfg := NewConfig(conf) - if err := cfg.EnsureDirs(); err != nil { - return nil, fmt.Errorf("ensure dirs: %w", err) + backend, err := hypervisor.NewBackend(typ, cfg, rec) + if err != nil { + return nil, err } - locker := flock.New(cfg.IndexLock()) - store := storejson.New[hypervisor.VMIndex](cfg.IndexFile(), locker) - return &CloudHypervisor{ - Backend: &hypervisor.Backend{ - Typ: typ, - Conf: cfg, - DB: store, - Locker: locker, - Metering: rec, - }, - conf: cfg, - }, nil + return &CloudHypervisor{Backend: backend, conf: cfg}, nil } // Delete removes VMs. Running VMs require force=true (stops them first). diff --git a/hypervisor/cloudhypervisor/start.go b/hypervisor/cloudhypervisor/start.go index a951a879..b153a30e 100644 --- a/hypervisor/cloudhypervisor/start.go +++ b/hypervisor/cloudhypervisor/start.go @@ -2,7 +2,6 @@ package cloudhypervisor import ( "context" - "fmt" "os" "os/exec" "syscall" @@ -10,7 +9,6 @@ import ( "github.com/projecteru2/core/log" "github.com/cocoonstack/cocoon/hypervisor" - "github.com/cocoonstack/cocoon/types" ) func (ch *CloudHypervisor) Start(ctx context.Context, refs []string) ([]string, error) { @@ -18,30 +16,15 @@ func (ch *CloudHypervisor) Start(ctx context.Context, refs []string) ([]string, } func (ch *CloudHypervisor) startOne(ctx context.Context, id string) (bool, error) { - rec, err := ch.PrepareStart(ctx, id, runtimeFiles) - if err != nil { - return false, err - } - if rec == nil { - return false, nil // already running — no-op - } - if vErr := types.ValidateStorageConfigs(rec.StorageConfigs); vErr != nil { - ch.MarkError(ctx, id) - return false, fmt.Errorf("storage invariants violated: %w", vErr) - } - - sockPath := hypervisor.SocketPath(rec.RunDir) - consoleSock := hypervisor.ConsoleSockPath(rec.RunDir) - - vmCfg := buildVMConfig(ctx, rec, consoleSock) - args := buildCLIArgs(vmCfg, sockPath) - ch.saveCmdline(ctx, rec, args) - - if _, err = ch.launchProcess(ctx, rec, sockPath, args, rec.ResolvedNetnsPath()); err != nil { - ch.MarkError(ctx, id) - return false, fmt.Errorf("launch VM: %w", err) - } - return true, nil + return ch.StartSequence(ctx, id, hypervisor.StartSpec{ + RuntimeFiles: runtimeFiles, + Launch: func(ctx context.Context, rec *hypervisor.VMRecord, sockPath string) (int, error) { + vmCfg := buildVMConfig(ctx, rec, hypervisor.ConsoleSockPath(rec.RunDir)) + args := buildCLIArgs(vmCfg, sockPath) + ch.saveCmdline(ctx, rec, args) + return ch.launchProcess(ctx, rec, sockPath, args, rec.ResolvedNetnsPath()) + }, + }) } func (ch *CloudHypervisor) launchProcess(ctx context.Context, rec *hypervisor.VMRecord, socketPath string, args []string, netnsPath string) (int, error) { diff --git a/hypervisor/cloudhypervisor/stop.go b/hypervisor/cloudhypervisor/stop.go index c779ed3e..894a1c66 100644 --- a/hypervisor/cloudhypervisor/stop.go +++ b/hypervisor/cloudhypervisor/stop.go @@ -18,23 +18,17 @@ func (ch *CloudHypervisor) Stop(ctx context.Context, refs []string) ([]string, e } func (ch *CloudHypervisor) stopOne(ctx context.Context, id string) error { - rec, err := ch.LoadRecord(ctx, id) - if err != nil { - return err - } - - sockPath := hypervisor.SocketPath(rec.RunDir) - hc := utils.NewSocketHTTPClient(sockPath) stopTimeout := time.Duration(ch.conf.StopTimeoutSeconds) * time.Second - - shutdownErr := ch.WithRunningVM(ctx, &rec, func(pid int) error { - if isDirectBoot(rec.BootConfig) || stopTimeout < 0 /* --force */ { - return ch.forceTerminate(ctx, hc, id, sockPath, pid) - } - return ch.shutdownUEFI(ctx, hc, id, sockPath, pid, stopTimeout) + return ch.StopOneSequence(ctx, id, hypervisor.StopSpec{ + RuntimeFiles: runtimeFiles, + Shutdown: func(ctx context.Context, rec *hypervisor.VMRecord, sockPath string, pid int) error { + hc := utils.NewSocketHTTPClient(sockPath) + if isDirectBoot(rec.BootConfig) || stopTimeout < 0 /* --force */ { + return ch.forceTerminate(ctx, hc, rec.ID, sockPath, pid) + } + return ch.shutdownUEFI(ctx, hc, rec.ID, sockPath, pid, stopTimeout) + }, }) - - return ch.HandleStopResult(ctx, id, rec.RunDir, runtimeFiles, shutdownErr) } // shutdownUEFI shuts down a UEFI-boot VM via ACPI power-button with poll-and-escalate handled by the shared GracefulStop helper. diff --git a/hypervisor/firecracker/firecracker.go b/hypervisor/firecracker/firecracker.go index 67449d03..933ff546 100644 --- a/hypervisor/firecracker/firecracker.go +++ b/hypervisor/firecracker/firecracker.go @@ -6,9 +6,7 @@ import ( "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/hypervisor" - "github.com/cocoonstack/cocoon/lock/flock" "github.com/cocoonstack/cocoon/metering" - storejson "github.com/cocoonstack/cocoon/storage/json" ) const typ = "firecracker" @@ -33,21 +31,11 @@ func New(conf *config.Config, rec metering.Recorder) (*Firecracker, error) { return nil, fmt.Errorf("config is nil") } cfg := NewConfig(conf) - if err := cfg.EnsureDirs(); err != nil { - return nil, fmt.Errorf("ensure dirs: %w", err) + backend, err := hypervisor.NewBackend(typ, cfg, rec) + if err != nil { + return nil, err } - locker := flock.New(cfg.IndexLock()) - store := storejson.New[hypervisor.VMIndex](cfg.IndexFile(), locker) - return &Firecracker{ - Backend: &hypervisor.Backend{ - Typ: typ, - Conf: cfg, - DB: store, - Locker: locker, - Metering: rec, - }, - conf: cfg, - }, nil + return &Firecracker{Backend: backend, conf: cfg}, nil } // Delete removes VMs. Running VMs require force=true (stops them first). diff --git a/hypervisor/firecracker/start.go b/hypervisor/firecracker/start.go index 426f76e9..a0d66dd2 100644 --- a/hypervisor/firecracker/start.go +++ b/hypervisor/firecracker/start.go @@ -24,32 +24,15 @@ func (fc *Firecracker) Start(ctx context.Context, refs []string) ([]string, erro } func (fc *Firecracker) startOne(ctx context.Context, id string) (bool, error) { - rec, err := fc.PrepareStart(ctx, id, runtimeFiles) - if err != nil { - return false, err - } - if rec == nil { - return false, nil // already running — no-op - } - if vErr := types.ValidateStorageConfigs(rec.StorageConfigs); vErr != nil { - fc.MarkError(ctx, id) - return false, fmt.Errorf("storage invariants violated: %w", vErr) - } - - sockPath := hypervisor.SocketPath(rec.RunDir) - - pid, err := fc.launchProcess(ctx, rec, sockPath, rec.ResolvedNetnsPath()) - if err != nil { - fc.MarkError(ctx, id) - return false, fmt.Errorf("launch VM: %w", err) - } - - if err := fc.configureVM(ctx, utils.NewSocketHTTPClient(sockPath), rec); err != nil { - fc.AbortLaunch(ctx, pid, sockPath, rec.RunDir, runtimeFiles) - fc.MarkError(ctx, id) - return false, fmt.Errorf("configure VM: %w", err) - } - return true, nil + return fc.StartSequence(ctx, id, hypervisor.StartSpec{ + RuntimeFiles: runtimeFiles, + Launch: func(ctx context.Context, rec *hypervisor.VMRecord, sockPath string) (int, error) { + return fc.launchProcess(ctx, rec, sockPath, rec.ResolvedNetnsPath()) + }, + PostLaunch: func(ctx context.Context, rec *hypervisor.VMRecord, sockPath string, _ int) error { + return fc.configureVM(ctx, utils.NewSocketHTTPClient(sockPath), rec) + }, + }) } // configureVM sends pre-boot config via REST then InstanceStart. diff --git a/hypervisor/firecracker/stop.go b/hypervisor/firecracker/stop.go index 6ebae140..9dc6b599 100644 --- a/hypervisor/firecracker/stop.go +++ b/hypervisor/firecracker/stop.go @@ -15,24 +15,16 @@ func (fc *Firecracker) Stop(ctx context.Context, refs []string) ([]string, error } func (fc *Firecracker) stopOne(ctx context.Context, id string) error { - rec, err := fc.LoadRecord(ctx, id) - if err != nil { - return err - } - - sockPath := hypervisor.SocketPath(rec.RunDir) - hc := utils.NewSocketHTTPClient(sockPath) stopTimeout := time.Duration(fc.conf.StopTimeoutSeconds) * time.Second - - shutdownErr := fc.WithRunningVM(ctx, &rec, func(pid int) error { - // --force (StopTimeoutSeconds < 0): skip SendCtrlAltDel, immediate kill. - if stopTimeout < 0 { - return fc.forceTerminate(ctx, sockPath, pid) - } - return fc.gracefulStop(ctx, hc, id, sockPath, pid, stopTimeout) + return fc.StopOneSequence(ctx, id, hypervisor.StopSpec{ + RuntimeFiles: runtimeFiles, + Shutdown: func(ctx context.Context, rec *hypervisor.VMRecord, sockPath string, pid int) error { + if stopTimeout < 0 { // --force + return fc.forceTerminate(ctx, sockPath, pid) + } + return fc.gracefulStop(ctx, utils.NewSocketHTTPClient(sockPath), rec.ID, sockPath, pid, stopTimeout) + }, }) - - return fc.HandleStopResult(ctx, id, rec.RunDir, runtimeFiles, shutdownErr) } // gracefulStop sends SendCtrlAltDel with poll-and-escalate handled by the shared GracefulStop helper. diff --git a/hypervisor/start.go b/hypervisor/start.go index f1476cc9..ee4a311a 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -9,11 +9,11 @@ import ( "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" ) -// StartAll runs startOne per ref; only ids that returned launched=true reach BatchMarkStarted, -// so already-running no-ops don't open duplicate intervals. +// StartAll runs startOne per ref; only ids that returned launched=true reach BatchMarkStarted, so already-running no-ops don't open duplicate intervals. func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) (bool, error)) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { @@ -42,6 +42,35 @@ func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(con return succeeded, forEachErr } +// StartSequence runs the shared start skeleton (PrepareStart → validate → Launch → optional PostLaunch with AbortLaunch rollback) and returns whether a fresh process was launched. +func (b *Backend) StartSequence(ctx context.Context, id string, spec StartSpec) (bool, error) { + rec, err := b.PrepareStart(ctx, id, spec.RuntimeFiles) + if err != nil { + return false, err + } + if rec == nil { + return false, nil + } + if vErr := types.ValidateStorageConfigs(rec.StorageConfigs); vErr != nil { + b.MarkError(ctx, id) + return false, fmt.Errorf("storage invariants violated: %w", vErr) + } + sockPath := SocketPath(rec.RunDir) + pid, err := spec.Launch(ctx, rec, sockPath) + if err != nil { + b.MarkError(ctx, id) + return false, fmt.Errorf("launch VM: %w", err) + } + if spec.PostLaunch != nil { + if err := spec.PostLaunch(ctx, rec, sockPath, pid); err != nil { + b.AbortLaunch(ctx, pid, sockPath, rec.RunDir, spec.RuntimeFiles) + b.MarkError(ctx, id) + return false, fmt.Errorf("configure VM: %w", err) + } + } + return true, nil +} + // PrepareStart loads the record, verifies not-running, ensures dirs exist. func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []string) (*VMRecord, error) { rec, err := b.LoadRecord(ctx, id) diff --git a/hypervisor/state.go b/hypervisor/state.go index 2b8be1bf..36f3788d 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -130,9 +130,7 @@ func (b *Backend) MarkError(ctx context.Context, id string) { } } -// BatchMarkStarted flips ids to VMStateRunning. Caller MUST pass only actually-launched ids; -// an id arriving here with DB State==Running is a stale-running record (process had crashed) — -// close that orphan with reason stop-crash before opening the fresh interval. +// BatchMarkStarted flips ids to VMStateRunning; caller MUST filter no-op already-running. An id arriving here with DB State==Running is treated as stale-running (process crashed) and gets a stop-crash close before the fresh start. func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { if len(ids) == 0 { return nil diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index f481af22..78395b5d 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -28,6 +28,8 @@ func (stubBackendConfig) SocketWaitTimeout() time.Duration { } func (stubBackendConfig) EffectivePoolSize() int { return 1 } func (stubBackendConfig) IndexFile() string { panic("IndexFile: not implemented in stub") } +func (stubBackendConfig) IndexLock() string { panic("IndexLock: not implemented in stub") } +func (stubBackendConfig) EnsureDirs() error { panic("EnsureDirs: not implemented in stub") } func (stubBackendConfig) RunDir() string { panic("RunDir: not implemented in stub") } func (stubBackendConfig) LogDir() string { panic("LogDir: not implemented in stub") } func (stubBackendConfig) VMRunDir(string) string { panic("VMRunDir: not implemented in stub") } diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 59c97c8e..2f0fa402 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -33,6 +33,19 @@ func (b *Backend) GracefulStop(ctx context.Context, vmID string, pid int, timeou return escalate() } +// StopOneSequence runs the shared per-id stop skeleton (LoadRecord → WithRunningVM(Shutdown) → HandleStopResult) so backends only express their force-vs-graceful choice. +func (b *Backend) StopOneSequence(ctx context.Context, id string, spec StopSpec) error { + rec, err := b.LoadRecord(ctx, id) + if err != nil { + return err + } + sockPath := SocketPath(rec.RunDir) + shutdownErr := b.WithRunningVM(ctx, &rec, func(pid int) error { + return spec.Shutdown(ctx, &rec, sockPath, pid) + }) + return b.HandleStopResult(ctx, id, rec.RunDir, spec.RuntimeFiles, shutdownErr) +} + // StopAll mirrors StartAll: stopOne per ref, batch-flip succeeded to Stopped. func (b *Backend) StopAll(ctx context.Context, refs []string, stopOne func(context.Context, string) error) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) diff --git a/metering/file.go b/metering/file.go index ed18da06..1622266d 100644 --- a/metering/file.go +++ b/metering/file.go @@ -9,8 +9,7 @@ import ( "github.com/projecteru2/core/log" ) -// FileRecorder appends JSON-encoded entries (one per line) to a file under sync.Mutex. -// POSIX guarantees single write(2) to O_APPEND is atomic across processes; the mutex serializes the in-process writes. +// FileRecorder appends JSON-encoded entries one per line; POSIX makes single write(2) to O_APPEND atomic across processes, so the mutex only serializes in-process writes. type FileRecorder struct { mu sync.Mutex f *os.File diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 663c2c58..3700f407 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -138,16 +138,9 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker } logEvictRow(ctx, logger, "collected", id, snap.records[id], snap.reasons[id]) removed = append(removed, id) - // Only emit stop for real records that had a corresponding start; - // orphan dirs and stale-pending IDs never opened a snap.storage interval. + // Skip orphan dirs and stale-pending — they never opened a snap.storage interval. if m, ok := snap.records[id]; ok { - meter.Emit(ctx, metering.Entry{ - Kind: metering.KindSnapStorageStop, - SnapshotID: id, - Reason: metering.ReasonSnapRemove, - Hypervisor: m.hypervisor, - EmittedAt: time.Now(), - }) + emitSnapStop(ctx, meter, id, m.hypervisor) } } if err := cleanResolvedRecords(store, removed); err != nil { diff --git a/snapshot/localfile/import.go b/snapshot/localfile/import.go index 1ce8a7bf..df3ebfd5 100644 --- a/snapshot/localfile/import.go +++ b/snapshot/localfile/import.go @@ -11,7 +11,6 @@ import ( "path/filepath" "time" - "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/snapshot" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -83,13 +82,7 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description return "", err } - lf.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindSnapStorageStart, - SnapshotID: id, - Hypervisor: cfg.Hypervisor, - Shape: metering.Shape{StorageBytes: size}, - EmittedAt: now, - }) + emitSnapStart(ctx, lf.meter(), id, cfg.Hypervisor, size, now) return id, nil } diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index e41b1a70..6d0ddf48 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -145,13 +145,7 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre return "", fmt.Errorf("finalize snapshot: %w", err) } - lf.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindSnapStorageStart, - SnapshotID: id, - Hypervisor: cfg.Hypervisor, - Shape: metering.Shape{StorageBytes: size}, - EmittedAt: finalizedAt, - }) + emitSnapStart(ctx, lf.meter(), id, cfg.Hypervisor, size, finalizedAt) return id, nil } @@ -225,13 +219,7 @@ func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { return fmt.Errorf("delete DB record %s: %w", id, err) } if deletedRecord { - lf.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindSnapStorageStop, - SnapshotID: id, - Reason: metering.ReasonSnapRemove, - Hypervisor: hypType, - EmittedAt: time.Now(), - }) + emitSnapStop(ctx, lf.meter(), id, hypType) } return nil } diff --git a/snapshot/localfile/metering.go b/snapshot/localfile/metering.go new file mode 100644 index 00000000..8f7d9004 --- /dev/null +++ b/snapshot/localfile/metering.go @@ -0,0 +1,22 @@ +package localfile + +import ( + "context" + "time" + + "github.com/cocoonstack/cocoon/metering" +) + +func emitSnapStart(ctx context.Context, rec metering.Recorder, snapID, hypType string, size int64, at time.Time) { + rec.Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStart, SnapshotID: snapID, Hypervisor: hypType, + Shape: metering.Shape{StorageBytes: size}, EmittedAt: at, + }) +} + +func emitSnapStop(ctx context.Context, rec metering.Recorder, snapID, hypType string) { + rec.Emit(ctx, metering.Entry{ + Kind: metering.KindSnapStorageStop, SnapshotID: snapID, + Reason: metering.ReasonSnapRemove, Hypervisor: hypType, EmittedAt: time.Now(), + }) +} From f1cdc5ec3fd2cfe4740347e83ee06040ea9a08fc Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 12:26:20 +0800 Subject: [PATCH 06/13] =?UTF-8?q?fix(metering):=20close=20compute=20interv?= =?UTF-8?q?al=20on=20Running=E2=86=92Error=20+=20restore-kill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - UpdateStates emits vm.compute.stop on Running→Error (stop-crash) so MarkError can't silently swallow an open interval; the downstream rm --force path won't double-fire because State=Error already closed. - emitRestoreComputeStop flips State→Stopped alongside the emit so a restore failure path (MarkError on Populate/AfterExtract error) sees oldState=Stopped and skips the re-emit. - Layout sweep enforcing SKILL.md (public-above-private, single const block, struct methods grouped, standalone utils after methods) — 7 files: hypervisor/{metering,restore,clone,utils}.go, snapshot/localfile/localfile.go, cmd/vm/{run,status}.go. --- cmd/vm/run.go | 30 ++++++++++---------- cmd/vm/status.go | 34 +++++++++++------------ hypervisor/clone.go | 6 ++-- hypervisor/metering.go | 20 ++++++-------- hypervisor/restore.go | 49 +++++++++++++++++++++------------ hypervisor/state.go | 8 ++++-- hypervisor/state_test.go | 49 ++++++++++++++++++++++++++------- hypervisor/utils.go | 2 -- metering/capture.go | 7 +++++ snapshot/localfile/localfile.go | 32 ++++++++++----------- 10 files changed, 143 insertions(+), 94 deletions(-) diff --git a/cmd/vm/run.go b/cmd/vm/run.go index 295956c9..947e1355 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -304,21 +304,6 @@ func (h Handler) cloneFromSrcDir(ctx context.Context, cmd *cobra.Command, conf * return nil } -// snapshotSource picks the clone/restore source: --from-dir or args[baseArgs]. Exactly one of (fromDir, snapRef) is non-empty. -func snapshotSource(cmd *cobra.Command, args []string, baseArgs int) (string, string, error) { - fromDir, _ := cmd.Flags().GetString("from-dir") - if fromDir != "" { - if len(args) > baseArgs { - return "", "", fmt.Errorf("--from-dir and positional SNAPSHOT are mutually exclusive") - } - return fromDir, "", nil - } - if len(args) <= baseArgs { - return "", "", fmt.Errorf("snapshot is required (or use --from-dir)") - } - return "", args[baseArgs], nil -} - func (h Handler) prepareClone(ctx context.Context, cmd *cobra.Command, conf *config.Config, cfg types.SnapshotConfig) (*types.VMConfig, string, network.Network, types.NetSetup, error) { vmCfg, err := cmdcore.CloneVMConfigFromFlags(cmd, cfg) if err != nil { @@ -451,6 +436,21 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t return ctx, info, hyper, nil } +// snapshotSource picks the clone/restore source: --from-dir or args[baseArgs]. Exactly one of (fromDir, snapRef) is non-empty. +func snapshotSource(cmd *cobra.Command, args []string, baseArgs int) (string, string, error) { + fromDir, _ := cmd.Flags().GetString("from-dir") + if fromDir != "" { + if len(args) > baseArgs { + return "", "", fmt.Errorf("--from-dir and positional SNAPSHOT are mutually exclusive") + } + return fromDir, "", nil + } + if len(args) <= baseArgs { + return "", "", fmt.Errorf("snapshot is required (or use --from-dir)") + } + return "", args[baseArgs], nil +} + // tapQueues: FC=1, CH=CPU count. func tapQueues(cpu int, useFC bool) int { if useFC { diff --git a/cmd/vm/status.go b/cmd/vm/status.go index fba47329..20a0276e 100644 --- a/cmd/vm/status.go +++ b/cmd/vm/status.go @@ -60,23 +60,6 @@ func (h Handler) List(cmd *cobra.Command, _ []string) error { return renderVMList(vms, format) } -// renderVMList emits vms as JSON or table; "No VMs found." for empty in table mode. -func renderVMList(vms []*types.VM, format string) error { - if format == "json" { - if vms == nil { - vms = []*types.VM{} - } - return cmdcore.OutputJSON(vms) - } - if len(vms) == 0 { - fmt.Println("No VMs found.") - return nil - } - w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) - printVMTable(w, vms) - return w.Flush() -} - func (h Handler) Status(cmd *cobra.Command, args []string) error { ctx, conf, err := h.Init(cmd) if err != nil { @@ -131,6 +114,23 @@ func statusOnce(ctx context.Context, hypers []hypervisor.Hypervisor, filters []s return renderVMList(vms, format) } +// renderVMList emits vms as JSON or table; "No VMs found." for empty in table mode. +func renderVMList(vms []*types.VM, format string) error { + if format == "json" { + if vms == nil { + vms = []*types.VM{} + } + return cmdcore.OutputJSON(vms) + } + if len(vms) == 0 { + fmt.Println("No VMs found.") + return nil + } + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + printVMTable(w, vms) + return w.Flush() +} + func mergeWatchChannels(ctx context.Context, hypers []hypervisor.Hypervisor) <-chan struct{} { var channels []<-chan struct{} for _, h := range hypers { diff --git a/hypervisor/clone.go b/hypervisor/clone.go index 289a637e..b4d2edf6 100644 --- a/hypervisor/clone.go +++ b/hypervisor/clone.go @@ -11,6 +11,9 @@ import ( "github.com/cocoonstack/cocoon/utils" ) +// AfterExtractFn finalizes a cloned VM after snapshot files are in place; sourceSnapshotID flows through for metering lineage. +type AfterExtractFn func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) + // CloneSetup is the shared pre-clone sequence: validate CPU, reserve a placeholder, ensure dirs, return a cleanup that rolls back both. func (b *Backend) CloneSetup(ctx context.Context, vmID string, vmCfg *types.VMConfig, snapshotConfig *types.SnapshotConfig) (runDir, logDir string, now time.Time, cleanup func(), err error) { if err = ValidateHostCPU(vmCfg.CPU); err != nil { @@ -35,9 +38,6 @@ func (b *Backend) CloneSetup(ctx context.Context, vmID string, vmCfg *types.VMCo return runDir, logDir, now, cleanup, nil } -// AfterExtractFn finalizes a cloned VM after snapshot files are in place; sourceSnapshotID flows through for metering lineage. -type AfterExtractFn func(ctx context.Context, vmID string, vmCfg *types.VMConfig, net types.NetSetup, runDir, logDir string, now time.Time, sourceSnapshotID string) (*types.VM, error) - // DirectCloneBase clones from a local snapshot directory. Used when the snapshot lives on the same host (no tar streaming needed). func (b *Backend) DirectCloneBase( ctx context.Context, vmID string, vmCfg *types.VMConfig, diff --git a/hypervisor/metering.go b/hypervisor/metering.go index e5904b24..96ec8b72 100644 --- a/hypervisor/metering.go +++ b/hypervisor/metering.go @@ -12,15 +12,6 @@ func (b *Backend) meter() metering.Recorder { return metering.OrNop(b.Metering) } -func shapeFromConfig(c types.VMConfig) metering.Shape { - return metering.Shape{ - CPU: c.CPU, - MemBytes: c.Memory, - StorageBytes: c.Storage, - } -} - -// makeEntry builds a VM-scoped entry stamped with this backend's Hypervisor type; emit sites that don't need SourceSnapshotID use it to skip the field-by-field boilerplate. func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Reason, shape metering.Shape, now time.Time) metering.Entry { return metering.Entry{ Kind: kind, VMID: vmID, Reason: reason, @@ -28,7 +19,6 @@ func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Rea } } -// emitAll fans out a batch of entries through one Recorder lookup. func (b *Backend) emitAll(ctx context.Context, entries []metering.Entry) { rec := b.meter() for _, e := range entries { @@ -48,7 +38,7 @@ func (b *Backend) emitOpenInterval(ctx context.Context, vm *types.VM, reason met } } -// emitDeleteClose emits storage.stop unconditionally and compute.stop when the record had an open Running interval. +// emitDeleteClose emits storage.stop unconditionally and compute.stop only when the record had an open Running interval. func (b *Backend) emitDeleteClose(ctx context.Context, vmID string, shape metering.Shape, computeReason metering.Reason, hadRunningInterval bool) { now := time.Now() rec := b.meter() @@ -63,3 +53,11 @@ func (b *Backend) emitDeleteClose(ctx context.Context, vmID string, shape meteri Hypervisor: b.Typ, Shape: shape, EmittedAt: now, }) } + +func shapeFromConfig(c types.VMConfig) metering.Shape { + return metering.Shape{ + CPU: c.CPU, + MemBytes: c.Memory, + StorageBytes: c.Storage, + } +} diff --git a/hypervisor/restore.go b/hypervisor/restore.go index 20914f29..4439c5c4 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -8,6 +8,8 @@ import ( "os" "time" + "github.com/projecteru2/core/log" + "github.com/cocoonstack/cocoon/metering" "github.com/cocoonstack/cocoon/types" "github.com/cocoonstack/cocoon/utils" @@ -67,24 +69,6 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types return &info, nil } -// emitRestoreComputeStop closes only the compute interval at the kill boundary; storage stays open so a restore failure leaves on-disk files intact for vm rm to close later. -func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldShape metering.Shape, sourceSnapshotID string) { - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: time.Now(), - }) -} - -// emitRestoreSuccess closes the old storage interval and opens fresh storage+compute; called only after restore fully succeeds. -func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { - now := time.Now() - b.meter().Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, - }) - b.emitOpenInterval(ctx, vm, metering.ReasonRestore, sourceSnapshotID, now) -} - // RestoreSequence is the shared restore skeleton. Staging happens before the kill so a preflight failure leaves the original VM running. func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec RestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { @@ -176,6 +160,35 @@ func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec return result, nil } +// emitRestoreComputeStop closes the compute interval and flips State→Stopped so a later MarkError won't re-emit; storage stays open until vm rm. +func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldShape metering.Shape, sourceSnapshotID string) { + now := time.Now() + if err := b.DB.Update(ctx, func(idx *VMIndex) error { + if r := idx.VMs[vmID]; r != nil { + r.State = types.VMStateStopped + r.StoppedAt = &now + r.UpdatedAt = now + } + return nil + }); err != nil { + log.WithFunc(b.Typ+".emitRestoreComputeStop").Warnf(ctx, "mark stopped after kill %s: %v", vmID, err) + } + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, + }) +} + +// emitRestoreSuccess closes the old storage interval and opens fresh storage+compute; called only after restore fully succeeds. +func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { + now := time.Now() + b.meter().Emit(ctx, metering.Entry{ + Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, + Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, + }) + b.emitOpenInterval(ctx, vm, metering.ReasonRestore, sourceSnapshotID, now) +} + // PrepareStagingDir extracts the snapshot tar into a sibling staging dir. func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cleanup func(), err error) { stagingDir = runDir + ".restore-staging" diff --git a/hypervisor/state.go b/hypervisor/state.go index 36f3788d..2c092f60 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -88,7 +88,7 @@ func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume }) } -// UpdateStates batch-updates State + StartedAt/StoppedAt; emits metering vm.compute.stop only on Running→Stopped. +// UpdateStates batch-updates State + StartedAt/StoppedAt; emits vm.compute.stop on Running→{Stopped,Error}. func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VMState) error { if len(ids) == 0 { return nil @@ -109,10 +109,14 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM r.StartedAt = &now case types.VMStateStopped: r.StoppedAt = &now - // Only Running→Stopped closes a real interval; idempotent stops would emit a phantom. if oldState == types.VMStateRunning { stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopUser, shapeFromConfig(r.Config), now)) } + case types.VMStateError: + r.StoppedAt = &now + if oldState == types.VMStateRunning { + stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shapeFromConfig(r.Config), now)) + } } } return nil diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 78395b5d..767ad757 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -108,12 +108,11 @@ func TestBatchMarkStartedReasonRestartWhenAlreadyBooted(t *testing.T) { } } -func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { +func TestUpdateStatesEmitsOnRunningToStoppedOrError(t *testing.T) { b, cap := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) - // Created→Stopped: no Running interval to close, must not emit. if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped from created): %v", err) } @@ -121,7 +120,6 @@ func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { t.Errorf("Created→Stopped emitted %d; want 0 (no Running interval to close)", len(got)) } - // Stopped→Running: not a stop, must not emit. if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { t.Fatalf("UpdateStates(running): %v", err) } @@ -129,16 +127,14 @@ func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { t.Errorf("Stopped→Running emitted %d; want 0", len(got)) } - // Running→Stopped: this is the only path that closes a Running interval. if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped): %v", err) } entries := cap.Entries() - if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop { - t.Fatalf("got %+v, want one compute.stop", entries) + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopUser { + t.Fatalf("Running→Stopped: got %+v, want one compute.stop reason=user", entries) } - // Stopped→Stopped: idempotent, must not duplicate the event. if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped idempotent): %v", err) } @@ -146,15 +142,29 @@ func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { t.Errorf("Stopped→Stopped should not re-emit; got %d entries total", len(got)) } - // Set Running again, then go through Error (not Stopped). Error must not emit. if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { t.Fatalf("UpdateStates(running again): %v", err) } if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { t.Fatalf("UpdateStates(error): %v", err) } - if got := cap.Entries(); len(got) != 1 { - t.Errorf("Error state must not emit; got %d entries total", len(got)) + entries = cap.Entries() + if len(entries) != 2 || entries[1].Kind != metering.KindVMComputeStop || entries[1].Reason != metering.ReasonStopCrash { + t.Fatalf("Running→Error: got %+v, want compute.stop reason=stop-crash as 2nd entry", entries) + } + + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { + t.Fatalf("UpdateStates(error idempotent): %v", err) + } + if got := cap.Entries(); len(got) != 2 { + t.Errorf("Error→Error must not re-emit; got %d entries total", len(got)) + } + seedVMRecord(t, b, "vm2", 1, 1<<30, 10<<30, false) + if err := b.UpdateStates(ctx, []string{"vm2"}, types.VMStateError); err != nil { + t.Fatalf("UpdateStates(vm2 error from created): %v", err) + } + if got := cap.Entries(); len(got) != 2 { + t.Errorf("Created→Error must not emit; got %d entries total", len(got)) } } @@ -373,6 +383,25 @@ func TestFinalizeCreateEmitsStorageStart(t *testing.T) { } } +func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, true) + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { + t.Fatalf("UpdateStates(running): %v", err) + } + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { + t.Fatalf("UpdateStates(error): %v", err) + } + cap.Reset() + + b.emitDeleteClose(ctx, "vm1", metering.Shape{CPU: 2, MemBytes: 2 << 30, StorageBytes: 20 << 30}, metering.ReasonStopCrash, false) + entries := cap.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMStorageStop { + t.Fatalf("post-Error delete: got %+v, want one storage.stop", entries) + } +} + func TestEmitNilSafeWithoutRecorder(t *testing.T) { // b.Metering left nil — meter() must return NopRecorder so emit doesn't panic. dir := t.TempDir() diff --git a/hypervisor/utils.go b/hypervisor/utils.go index a716b54f..b3ba50c0 100644 --- a/hypervisor/utils.go +++ b/hypervisor/utils.go @@ -34,9 +34,7 @@ const ( SnapshotFileMeta // SnapshotFileSkip means the file should not be cloned. SnapshotFileSkip -) -const ( // MinDataDiskSize is the minimum user data disk size; mkfs.ext4 is unstable below this on small sparse files. MinDataDiskSize int64 = 16 << 20 diff --git a/metering/capture.go b/metering/capture.go index dce92b4d..0a06b614 100644 --- a/metering/capture.go +++ b/metering/capture.go @@ -25,3 +25,10 @@ func (r *CaptureRecorder) Entries() []Entry { copy(out, r.entries) return out } + +// Reset drops buffered entries. +func (r *CaptureRecorder) Reset() { + r.mu.Lock() + defer r.mu.Unlock() + r.entries = nil +} diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 6d0ddf48..8ef6e859 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -66,10 +66,6 @@ func New(conf *config.Config, rec metering.Recorder, opts ...Option) (*LocalFile return lf, nil } -func (lf *LocalFile) meter() metering.Recorder { - return metering.OrNop(lf.metering) -} - func (lf *LocalFile) Type() string { return typ } // DataDir returns the local data directory and snapshot config for direct file access. @@ -194,6 +190,22 @@ func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error return deleted, nil } +func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotConfig, io.ReadCloser, error) { + rec, err := lf.lookupRecord(ctx, ref, true) + if err != nil { + return types.SnapshotConfig{}, nil, err + } + return snapshotRecordToConfig(rec), utils.TarDirStream(rec.DataDir, nil), nil +} + +func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { + gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy, lf.metering)) +} + +func (lf *LocalFile) meter() metering.Recorder { + return metering.OrNop(lf.metering) +} + // deleteOne removes one snapshot atomically; idempotent under concurrent rm — if the rival wins the DB race we report success (data is gone) but skip emit, so the ledger holds exactly one stop event per snapshot. func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { @@ -224,18 +236,6 @@ func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { return nil } -func (lf *LocalFile) Restore(ctx context.Context, ref string) (types.SnapshotConfig, io.ReadCloser, error) { - rec, err := lf.lookupRecord(ctx, ref, true) - if err != nil { - return types.SnapshotConfig{}, nil, err - } - return snapshotRecordToConfig(rec), utils.TarDirStream(rec.DataDir, nil), nil -} - -func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { - gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy, lf.metering)) -} - // rollbackCreate removes a placeholder snapshot record from the DB. func (lf *LocalFile) rollbackCreate(ctx context.Context, id, name string) { if err := lf.store.Update(ctx, func(idx *snapshot.SnapshotIndex) error { From a86286beb12f0ebfd3d235cf9e9e52a34e76c8ed Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 12:45:42 +0800 Subject: [PATCH 07/13] fix(metering): set StoppedAt on stale-relaunch + drop OrNop indirection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - BatchMarkStarted now sets r.StoppedAt = &now when closing a stale-running interval so DB introspection matches the ledger. - NewBackend / localfile.New default rec→NopRecorder when nil, so hot emit paths can call b.Metering.Emit / lf.metering.Emit directly without going through the OrNop helper. OrNop is removed. - Compress 10 verbose godoc/inline comments introduced this branch. --- hypervisor/backend.go | 5 ++++- hypervisor/clone.go | 2 +- hypervisor/create.go | 5 ++--- hypervisor/metering.go | 15 +++++--------- hypervisor/restore.go | 6 +++--- hypervisor/start.go | 2 +- hypervisor/state.go | 3 ++- hypervisor/state_test.go | 36 ++++++++++++++++++++++----------- hypervisor/stop.go | 1 - metering/file.go | 2 +- metering/metering.go | 8 -------- snapshot/localfile/gc.go | 3 +-- snapshot/localfile/import.go | 2 +- snapshot/localfile/localfile.go | 11 +++++----- 14 files changed, 50 insertions(+), 51 deletions(-) diff --git a/hypervisor/backend.go b/hypervisor/backend.go index c90fabfc..d022f208 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -70,11 +70,14 @@ type Backend struct { Metering metering.Recorder } -// NewBackend wires the shared init boilerplate: EnsureDirs, flock, JSON store. Each backend's New just builds its Config and forwards it here. +// NewBackend wires shared init: EnsureDirs, flock, JSON store, nil-recorder fallback. func NewBackend(typ string, conf BackendConfig, rec metering.Recorder) (*Backend, error) { if err := conf.EnsureDirs(); err != nil { return nil, fmt.Errorf("ensure dirs: %w", err) } + if rec == nil { + rec = metering.NopRecorder{} + } locker := flock.New(conf.IndexLock()) return &Backend{ Typ: typ, diff --git a/hypervisor/clone.go b/hypervisor/clone.go index b4d2edf6..0354b186 100644 --- a/hypervisor/clone.go +++ b/hypervisor/clone.go @@ -85,7 +85,7 @@ func (b *Backend) CloneFromStream( return afterExtract(ctx, vmID, vmCfg, net, runDir, logDir, now, snapshotConfig.ID) } -// FinalizeClone persists the cloned VM record and emits the open-interval pair (storage.start + compute.start, reason=clone). +// FinalizeClone persists the record and emits the clone open-interval pair. func (b *Backend) FinalizeClone(ctx context.Context, vmID string, info *types.VM, bootCfg *types.BootConfig, blobIDs map[string]struct{}, sourceSnapshotID string) error { if err := b.DB.Update(ctx, func(idx *VMIndex) error { r, err := idx.GetRecord(vmID) diff --git a/hypervisor/create.go b/hypervisor/create.go index 2ecdba71..92b99879 100644 --- a/hypervisor/create.go +++ b/hypervisor/create.go @@ -36,7 +36,6 @@ func (b *Backend) ReserveVM(ctx context.Context, id string, vmCfg *types.VMConfi }) } -// RollbackCreate removes a placeholder VM record from the DB. func (b *Backend) RollbackCreate(ctx context.Context, id, name string) { if err := b.DB.Update(ctx, func(idx *VMIndex) error { delete(idx.VMs, id) @@ -67,11 +66,11 @@ func (b *Backend) FinalizeCreate(ctx context.Context, id string, info *types.VM, }); err != nil { return err } - b.meter().Emit(ctx, b.makeEntry(metering.KindVMStorageStart, id, metering.ReasonBoot, shapeFromConfig(info.Config), time.Now())) + b.Metering.Emit(ctx, b.makeEntry(metering.KindVMStorageStart, id, metering.ReasonBoot, shapeFromConfig(info.Config), time.Now())) return nil } -// CreateSequence is the shared placeholder→finalize create skeleton; a mid-flight crash rolls back DB + rundir so GC has nothing to reconcile. +// CreateSequence is the shared placeholder→finalize create skeleton. func (b *Backend) CreateSequence(ctx context.Context, id string, spec CreateSpec) (_ *types.VM, err error) { if err = ValidateHostCPU(spec.VMCfg.CPU); err != nil { return nil, err diff --git a/hypervisor/metering.go b/hypervisor/metering.go index 96ec8b72..80a32171 100644 --- a/hypervisor/metering.go +++ b/hypervisor/metering.go @@ -8,10 +8,6 @@ import ( "github.com/cocoonstack/cocoon/types" ) -func (b *Backend) meter() metering.Recorder { - return metering.OrNop(b.Metering) -} - func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Reason, shape metering.Shape, now time.Time) metering.Entry { return metering.Entry{ Kind: kind, VMID: vmID, Reason: reason, @@ -20,15 +16,14 @@ func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Rea } func (b *Backend) emitAll(ctx context.Context, entries []metering.Entry) { - rec := b.meter() for _, e := range entries { - rec.Emit(ctx, e) + b.Metering.Emit(ctx, e) } } -// emitOpenInterval emits the storage.start + compute.start pair that opens a fresh interval for cloned or restored VMs; the caller's now keeps the timestamp consistent with adjacent close events. +// emitOpenInterval fires the storage.start + compute.start pair; caller-provided now keeps adjacent stop/start timestamps aligned. func (b *Backend) emitOpenInterval(ctx context.Context, vm *types.VM, reason metering.Reason, sourceSnapshotID string, now time.Time) { - rec := b.meter() + rec := b.Metering shape := shapeFromConfig(vm.Config) for _, kind := range []metering.Kind{metering.KindVMStorageStart, metering.KindVMComputeStart} { rec.Emit(ctx, metering.Entry{ @@ -38,10 +33,10 @@ func (b *Backend) emitOpenInterval(ctx context.Context, vm *types.VM, reason met } } -// emitDeleteClose emits storage.stop unconditionally and compute.stop only when the record had an open Running interval. +// emitDeleteClose fires storage.stop unconditionally; compute.stop only when an interval was open. func (b *Backend) emitDeleteClose(ctx context.Context, vmID string, shape metering.Shape, computeReason metering.Reason, hadRunningInterval bool) { now := time.Now() - rec := b.meter() + rec := b.Metering if hadRunningInterval { rec.Emit(ctx, metering.Entry{ Kind: metering.KindVMComputeStop, VMID: vmID, Reason: computeReason, diff --git a/hypervisor/restore.go b/hypervisor/restore.go index 4439c5c4..cdd9e378 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -42,7 +42,7 @@ func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, return vmID, &rec, nil } -// FinalizeRestore updates DB and assembles the returned VM; metering lives in (Direct)RestoreSequence so the close fires at the kill boundary, not only on full success. +// FinalizeRestore updates DB and assembles the returned VM. func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord, pid int) (*types.VM, error) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { @@ -173,7 +173,7 @@ func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldSh }); err != nil { log.WithFunc(b.Typ+".emitRestoreComputeStop").Warnf(ctx, "mark stopped after kill %s: %v", vmID, err) } - b.meter().Emit(ctx, metering.Entry{ + b.Metering.Emit(ctx, metering.Entry{ Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, }) @@ -182,7 +182,7 @@ func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldSh // emitRestoreSuccess closes the old storage interval and opens fresh storage+compute; called only after restore fully succeeds. func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { now := time.Now() - b.meter().Emit(ctx, metering.Entry{ + b.Metering.Emit(ctx, metering.Entry{ Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, }) diff --git a/hypervisor/start.go b/hypervisor/start.go index ee4a311a..3ed0aee5 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -42,7 +42,7 @@ func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(con return succeeded, forEachErr } -// StartSequence runs the shared start skeleton (PrepareStart → validate → Launch → optional PostLaunch with AbortLaunch rollback) and returns whether a fresh process was launched. +// StartSequence runs the shared start skeleton; returns whether a fresh process was launched. func (b *Backend) StartSequence(ctx context.Context, id string, spec StartSpec) (bool, error) { rec, err := b.PrepareStart(ctx, id, spec.RuntimeFiles) if err != nil { diff --git a/hypervisor/state.go b/hypervisor/state.go index 2c092f60..06b5ccdf 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -134,7 +134,7 @@ func (b *Backend) MarkError(ctx context.Context, id string) { } } -// BatchMarkStarted flips ids to VMStateRunning; caller MUST filter no-op already-running. An id arriving here with DB State==Running is treated as stale-running (process crashed) and gets a stop-crash close before the fresh start. +// BatchMarkStarted flips ids to VMStateRunning; State==Running entrants are stale-running (close stop-crash, then open fresh). func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { if len(ids) == 0 { return nil @@ -150,6 +150,7 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { shape := shapeFromConfig(r.Config) if r.State == types.VMStateRunning { emits = append(emits, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shape, now)) + r.StoppedAt = &now } reason := metering.ReasonBoot if r.FirstBooted { diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 767ad757..89b18b30 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -13,9 +13,20 @@ import ( "github.com/cocoonstack/cocoon/types" ) +func newDiskStubConfig(t *testing.T) stubBackendConfig { + dir := t.TempDir() + return stubBackendConfig{ + indexFile: filepath.Join(dir, "index.json"), + indexLock: filepath.Join(dir, "index.lock"), + } +} + // stubBackendConfig satisfies BackendConfig for tests that only exercise the // metering wiring; unused methods panic so accidental dependence shows up loud. -type stubBackendConfig struct{} +type stubBackendConfig struct { + indexFile string + indexLock string +} func (stubBackendConfig) BinaryName() string { panic("BinaryName: not implemented in stub") } func (stubBackendConfig) PIDFileName() string { panic("PIDFileName: not implemented in stub") } @@ -27,9 +38,9 @@ func (stubBackendConfig) SocketWaitTimeout() time.Duration { panic("SocketWaitTimeout: not implemented in stub") } func (stubBackendConfig) EffectivePoolSize() int { return 1 } -func (stubBackendConfig) IndexFile() string { panic("IndexFile: not implemented in stub") } -func (stubBackendConfig) IndexLock() string { panic("IndexLock: not implemented in stub") } -func (stubBackendConfig) EnsureDirs() error { panic("EnsureDirs: not implemented in stub") } +func (c stubBackendConfig) IndexFile() string { return c.indexFile } +func (c stubBackendConfig) IndexLock() string { return c.indexLock } +func (stubBackendConfig) EnsureDirs() error { return nil } func (stubBackendConfig) RunDir() string { panic("RunDir: not implemented in stub") } func (stubBackendConfig) LogDir() string { panic("LogDir: not implemented in stub") } func (stubBackendConfig) VMRunDir(string) string { panic("VMRunDir: not implemented in stub") } @@ -402,16 +413,17 @@ func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) { } } -func TestEmitNilSafeWithoutRecorder(t *testing.T) { - // b.Metering left nil — meter() must return NopRecorder so emit doesn't panic. - dir := t.TempDir() - locker := flock.New(filepath.Join(dir, "index.lock")) - store := storejson.New[VMIndex](filepath.Join(dir, "index.json"), locker) - b := &Backend{Typ: "test-hv", DB: store, Locker: locker} - +func TestNewBackendNilRecorderDefaultsToNop(t *testing.T) { + b, err := NewBackend("test-hv", newDiskStubConfig(t), nil) + if err != nil { + t.Fatalf("NewBackend(rec=nil): %v", err) + } + if _, ok := b.Metering.(metering.NopRecorder); !ok { + t.Fatalf("nil recorder should default to NopRecorder, got %T", b.Metering) + } ctx := t.Context() seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, false) if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { - t.Errorf("BatchMarkStarted with nil Metering panicked/failed: %v", err) + t.Errorf("BatchMarkStarted with NopRecorder: %v", err) } } diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 2f0fa402..7f3db7d8 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -71,7 +71,6 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop return loadErr } sockPath := SocketPath(rec.RunDir) - // stoppedByUs distinguishes user-stop (ReasonStopUser) from orphan-crash cleanup (ReasonStopCrash). stoppedByUs := false if runningErr := b.WithRunningVM(ctx, &rec, func(_ int) error { if !force { diff --git a/metering/file.go b/metering/file.go index 1622266d..a6b06bd1 100644 --- a/metering/file.go +++ b/metering/file.go @@ -15,7 +15,7 @@ type FileRecorder struct { f *os.File } -// NewFileRecorder opens path append-only; on open failure logs a warning and returns NopRecorder so callers never see nil. +// NewFileRecorder opens path append-only; returns NopRecorder on failure. func NewFileRecorder(ctx context.Context, path string) Recorder { f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600) //nolint:gosec // internal runtime path if err != nil { diff --git a/metering/metering.go b/metering/metering.go index 69135bd8..489fd83c 100644 --- a/metering/metering.go +++ b/metering/metering.go @@ -61,11 +61,3 @@ type Recorder interface { type NopRecorder struct{} func (NopRecorder) Emit(context.Context, Entry) {} - -// OrNop returns r unchanged when non-nil, NopRecorder otherwise so emit sites never have to nil-check. -func OrNop(r Recorder) Recorder { - if r == nil { - return NopRecorder{} - } - return r -} diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 3700f407..54acc6d1 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -122,7 +122,6 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker }, Collect: func(ctx context.Context, ids []string, snap snapshotGCSnapshot) error { logger := log.WithFunc("gc.snapshot") - meter := metering.OrNop(recorder) var ( errs []error removed = make([]string, 0, len(ids)) @@ -140,7 +139,7 @@ func gcModule(conf *Config, store storage.Store[snapshot.SnapshotIndex], locker removed = append(removed, id) // Skip orphan dirs and stale-pending — they never opened a snap.storage interval. if m, ok := snap.records[id]; ok { - emitSnapStop(ctx, meter, id, m.hypervisor) + emitSnapStop(ctx, recorder, id, m.hypervisor) } } if err := cleanResolvedRecords(store, removed); err != nil { diff --git a/snapshot/localfile/import.go b/snapshot/localfile/import.go index df3ebfd5..8513bbf3 100644 --- a/snapshot/localfile/import.go +++ b/snapshot/localfile/import.go @@ -82,7 +82,7 @@ func (lf *LocalFile) Import(ctx context.Context, r io.Reader, name, description return "", err } - emitSnapStart(ctx, lf.meter(), id, cfg.Hypervisor, size, now) + emitSnapStart(ctx, lf.metering, id, cfg.Hypervisor, size, now) return id, nil } diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 8ef6e859..697fadc3 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -57,6 +57,9 @@ func New(conf *config.Config, rec metering.Recorder, opts ...Option) (*LocalFile if err := cfg.EnsureDirs(); err != nil { return nil, fmt.Errorf("ensure dirs: %w", err) } + if rec == nil { + rec = metering.NopRecorder{} + } locker := flock.New(cfg.IndexLock()) store := storejson.New[snapshot.SnapshotIndex](cfg.IndexFile(), locker) lf := &LocalFile{conf: cfg, store: store, locker: locker, metering: rec} @@ -141,7 +144,7 @@ func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stre return "", fmt.Errorf("finalize snapshot: %w", err) } - emitSnapStart(ctx, lf.meter(), id, cfg.Hypervisor, size, finalizedAt) + emitSnapStart(ctx, lf.metering, id, cfg.Hypervisor, size, finalizedAt) return id, nil } @@ -202,10 +205,6 @@ func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy, lf.metering)) } -func (lf *LocalFile) meter() metering.Recorder { - return metering.OrNop(lf.metering) -} - // deleteOne removes one snapshot atomically; idempotent under concurrent rm — if the rival wins the DB race we report success (data is gone) but skip emit, so the ledger holds exactly one stop event per snapshot. func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { @@ -231,7 +230,7 @@ func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { return fmt.Errorf("delete DB record %s: %w", id, err) } if deletedRecord { - emitSnapStop(ctx, lf.meter(), id, hypType) + emitSnapStop(ctx, lf.metering, id, hypType) } return nil } From 711f3a436efb9217f12a369a508a9d9972511f77 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 12:53:27 +0800 Subject: [PATCH 08/13] refactor: compress verbose godoc/inline introduced this branch drops or shortens ~20 godoc/inline comments that restated identifiers or carried design narration that belongs in the PR body, including: hypervisor backend Spec types, restore.go helpers, create.go sequences, metering package doc, FileRecorder/Emit, snapshot/localfile Create/Delete/deleteOne/gc helpers. --- hypervisor/backend.go | 23 +++++++++++------------ hypervisor/restore.go | 10 +++------- hypervisor/start.go | 4 ++-- hypervisor/stop.go | 2 +- metering/file.go | 4 ++-- metering/metering.go | 4 +--- snapshot/localfile/gc.go | 2 +- snapshot/localfile/localfile.go | 6 +++--- 8 files changed, 24 insertions(+), 31 deletions(-) diff --git a/hypervisor/backend.go b/hypervisor/backend.go index d022f208..c91d809c 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -88,29 +88,28 @@ func NewBackend(typ string, conf BackendConfig, rec metering.Recorder) (*Backend }, nil } -// LaunchSpec is the per-call input to Backend.LaunchVMProcess. Shared -// BinaryName / SocketWaitTimeout come from BackendConfig. +// LaunchSpec is the per-call input to Backend.LaunchVMProcess. type LaunchSpec struct { Cmd *exec.Cmd PIDPath string SockPath string - NetnsPath string // empty = host netns - OnFail func() // optional cleanup on any error path + NetnsPath string + OnFail func() } -// RestoreSpec carries the backend-specific hooks for Backend.RestoreSequence. +// RestoreSpec carries backend hooks for Backend.RestoreSequence. type RestoreSpec struct { VMCfg *types.VMConfig Snapshot io.Reader - SourceSnapshotID string // for metering lineage; emitted on the restore close+open events + SourceSnapshotID string Preflight func(stagingDir string, rec *VMRecord) error Kill func(ctx context.Context, vmID string, rec *VMRecord) error - Wrap func(rec *VMRecord, fn func() error) error // optional disk lock around merge+afterExtract - BeforeMerge func(rec *VMRecord) error // e.g. FC removes stale COW + Wrap func(rec *VMRecord, fn func() error) error + BeforeMerge func(rec *VMRecord) error AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } -// DirectRestoreSpec is RestoreSpec for a local srcDir rather than a tar; Populate replaces staging+merge. +// DirectRestoreSpec is RestoreSpec for a local srcDir; Populate replaces staging+merge. type DirectRestoreSpec struct { VMCfg *types.VMConfig SrcDir string @@ -122,20 +121,20 @@ type DirectRestoreSpec struct { AfterExtract func(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord) (*types.VM, error) } -// StartSpec carries StartSequence inputs: Launch builds + exec the VMM, PostLaunch is optional config-via-REST that the backend rolls back via AbortLaunch on failure. +// StartSpec carries StartSequence inputs. type StartSpec struct { RuntimeFiles []string Launch func(ctx context.Context, rec *VMRecord, sockPath string) (int, error) PostLaunch func(ctx context.Context, rec *VMRecord, sockPath string, pid int) error } -// StopSpec carries StopOneSequence inputs: Shutdown picks the backend-specific path (force vs graceful) once WithRunningVM confirms the process is live. +// StopSpec carries StopOneSequence inputs. type StopSpec struct { RuntimeFiles []string Shutdown func(ctx context.Context, rec *VMRecord, sockPath string, pid int) error } -// CreateSpec carries CreateSequence inputs; Prepare returns final storage configs (COW + data disks). +// CreateSpec carries CreateSequence inputs. type CreateSpec struct { VMCfg *types.VMConfig StorageConfigs []*types.StorageConfig diff --git a/hypervisor/restore.go b/hypervisor/restore.go index cdd9e378..6c463a63 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -15,7 +15,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// KillForRestore stops the running VM via the backend-specific terminate hook and clears runtime files. func (b *Backend) KillForRestore(ctx context.Context, vmID string, rec *VMRecord, terminate func(pid int) error, runtimeFiles []string) error { killErr := b.WithRunningVM(ctx, rec, terminate) if killErr != nil && !errors.Is(killErr, ErrNotRunning) { @@ -26,7 +25,6 @@ func (b *Backend) KillForRestore(ctx context.Context, vmID string, rec *VMRecord return nil } -// ResolveForRestore resolves vmRef and validates the VM is running. func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, *VMRecord, error) { vmID, err := b.ResolveRef(ctx, vmRef) if err != nil { @@ -42,7 +40,6 @@ func (b *Backend) ResolveForRestore(ctx context.Context, vmRef string) (string, return vmID, &rec, nil } -// FinalizeRestore updates DB and assembles the returned VM. func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types.VMConfig, rec *VMRecord, pid int) (*types.VM, error) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { @@ -69,7 +66,7 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types return &info, nil } -// RestoreSequence is the shared restore skeleton. Staging happens before the kill so a preflight failure leaves the original VM running. +// RestoreSequence is the shared restore skeleton (preflight before kill). func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec RestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { return nil, err @@ -120,7 +117,7 @@ func (b *Backend) RestoreSequence(ctx context.Context, vmRef string, spec Restor return result, nil } -// DirectRestoreSequence restores from a local snapshot directory; Populate replaces the tar staging+merge step used by RestoreSequence. +// DirectRestoreSequence restores from a local snapshot directory. func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec DirectRestoreSpec) (*types.VM, error) { if err := ValidateHostCPU(spec.VMCfg.CPU); err != nil { return nil, err @@ -179,7 +176,7 @@ func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldSh }) } -// emitRestoreSuccess closes the old storage interval and opens fresh storage+compute; called only after restore fully succeeds. +// emitRestoreSuccess closes old storage and opens fresh storage+compute. func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { now := time.Now() b.Metering.Emit(ctx, metering.Entry{ @@ -189,7 +186,6 @@ func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape b.emitOpenInterval(ctx, vm, metering.ReasonRestore, sourceSnapshotID, now) } -// PrepareStagingDir extracts the snapshot tar into a sibling staging dir. func PrepareStagingDir(runDir string, snapshot io.Reader) (stagingDir string, cleanup func(), err error) { stagingDir = runDir + ".restore-staging" if err = os.RemoveAll(stagingDir); err != nil { diff --git a/hypervisor/start.go b/hypervisor/start.go index 3ed0aee5..ec9fa61e 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -13,7 +13,7 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// StartAll runs startOne per ref; only ids that returned launched=true reach BatchMarkStarted, so already-running no-ops don't open duplicate intervals. +// StartAll runs startOne per ref; only launched=true ids reach BatchMarkStarted. func (b *Backend) StartAll(ctx context.Context, refs []string, startOne func(context.Context, string) (bool, error)) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { @@ -95,7 +95,7 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st return &rec, nil } -// LaunchVMProcess starts spec.Cmd and waits for the API socket; any post-Start error kills the process + removes the PID file. Caller reaps via cmd.Wait(). +// LaunchVMProcess starts spec.Cmd and waits for the API socket; any post-Start error kills the process + removes the PID file. func (b *Backend) LaunchVMProcess(ctx context.Context, spec LaunchSpec) (pid int, err error) { started := false pidWritten := false diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 7f3db7d8..9a4b5411 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -59,7 +59,7 @@ func (b *Backend) StopAll(ctx context.Context, refs []string, stopOne func(conte return succeeded, forEachErr } -// DeleteAll removes VMs by ref; dir cleanup runs before DB delete so a failed cleanup leaves a retry-able record (vs an orphan rundir with no index entry). +// DeleteAll removes VMs by ref; dir cleanup before DB delete keeps a failed cleanup retry-able. func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stopOne func(context.Context, string) error) ([]string, error) { ids, err := b.ResolveRefs(ctx, refs) if err != nil { diff --git a/metering/file.go b/metering/file.go index a6b06bd1..028cf3e9 100644 --- a/metering/file.go +++ b/metering/file.go @@ -9,7 +9,7 @@ import ( "github.com/projecteru2/core/log" ) -// FileRecorder appends JSON-encoded entries one per line; POSIX makes single write(2) to O_APPEND atomic across processes, so the mutex only serializes in-process writes. +// FileRecorder appends JSON-encoded entries one per line under sync.Mutex; cross-process atomicity comes from O_APPEND. type FileRecorder struct { mu sync.Mutex f *os.File @@ -25,7 +25,7 @@ func NewFileRecorder(ctx context.Context, path string) Recorder { return &FileRecorder{f: f} } -// Emit logs and swallows write errors so the caller's state machine is never blocked. +// Emit logs and swallows write errors to never block callers. func (r *FileRecorder) Emit(ctx context.Context, e Entry) { data, err := json.Marshal(e) if err != nil { diff --git a/metering/metering.go b/metering/metering.go index 489fd83c..23612d75 100644 --- a/metering/metering.go +++ b/metering/metering.go @@ -1,6 +1,4 @@ -// Package metering emits append-only VM/snapshot lifecycle endpoints -// for downstream usage/billing pipelines. cocoon emits raw signals only; -// tenant attribution and pricing are layered above. +// Package metering emits append-only VM/snapshot lifecycle endpoints; tenant attribution lives upstream. package metering import ( diff --git a/snapshot/localfile/gc.go b/snapshot/localfile/gc.go index 54acc6d1..55af97c3 100644 --- a/snapshot/localfile/gc.go +++ b/snapshot/localfile/gc.go @@ -224,7 +224,7 @@ func logEvictRow(ctx context.Context, logger *log.Fields, verb, id string, m sna verb, id, m.name, m.sizeBytes, accessed, reason) } -// backfillSizeBytes fills in sizeBytes for any record whose SizeBytes wasn't persisted, then writes the resolved values back so future GC runs can skip the du. +// backfillSizeBytes computes + persists SizeBytes for records missing it so future GC skips du. func backfillSizeBytes(ctx context.Context, conf *Config, store storage.Store[snapshot.SnapshotIndex], records map[string]snapshotMeta) { logger := log.WithFunc("gc.snapshot") var changed bool diff --git a/snapshot/localfile/localfile.go b/snapshot/localfile/localfile.go index 697fadc3..8c93ffa5 100644 --- a/snapshot/localfile/localfile.go +++ b/snapshot/localfile/localfile.go @@ -80,7 +80,7 @@ func (lf *LocalFile) DataDir(ctx context.Context, ref string) (string, types.Sna return rec.DataDir, snapshotRecordToConfig(rec), nil } -// Create stores a snapshot from stream via placeholder→extract→finalize so a mid-flight crash leaves only a pending record for GC; emits metering snap.storage.start on success. +// Create stores a snapshot via placeholder→extract→finalize; a mid-flight crash leaves a pending record for GC. func (lf *LocalFile) Create(ctx context.Context, cfg *types.SnapshotConfig, stream io.Reader) (_ string, err error) { id := cfg.ID if id == "" { @@ -172,7 +172,7 @@ func (lf *LocalFile) Inspect(ctx context.Context, ref string) (*types.Snapshot, return &s, nil } -// Delete removes each ref atomically (rm dir → DB update) and emits snap.storage.stop per deleted id; a mid-loop rm-OK-then-DB-fail leaves a stale DB record that GC reclaims. +// Delete removes each ref (rm dir → DB update); a mid-loop rm-OK-then-DB-fail leaves a stale DB record for GC. func (lf *LocalFile) Delete(ctx context.Context, refs []string) ([]string, error) { var ids []string if err := lf.store.With(ctx, func(idx *snapshot.SnapshotIndex) error { @@ -205,7 +205,7 @@ func (lf *LocalFile) RegisterGC(orch *gc.Orchestrator) { gc.Register(orch, gcModule(lf.conf, lf.store, lf.locker, lf.gcPolicy, lf.metering)) } -// deleteOne removes one snapshot atomically; idempotent under concurrent rm — if the rival wins the DB race we report success (data is gone) but skip emit, so the ledger holds exactly one stop event per snapshot. +// deleteOne is idempotent under concurrent rm; the rival's emit is skipped so the ledger keeps exactly one stop per snapshot. func (lf *LocalFile) deleteOne(ctx context.Context, id string) error { if err := os.RemoveAll(lf.conf.SnapshotDataDir(id)); err != nil { return fmt.Errorf("remove data dir %s: %w", id, err) From 03645058080bcb2e5a88cef68b9356a4714cc878 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 12:59:10 +0800 Subject: [PATCH 09/13] refactor: trim verbose comments in pre-existing cmd/* + hypervisor/{ch,fc} drops or compresses ~30 godoc/inline comments that restated identifiers or carried design narration: cmd/core/helpers.go, cmd/snapshot/handler.go, cmd/vm/{lifecycle,run,status}.go, hypervisor/cloudhypervisor/{clone,stop}.go, hypervisor/firecracker/clone.go. Removes TODO(inspect) marker (track in issue tracker instead). --- cmd/core/helpers.go | 24 ++++-------------------- cmd/snapshot/handler.go | 8 +------- cmd/vm/lifecycle.go | 17 +++-------------- cmd/vm/run.go | 12 +++--------- cmd/vm/status.go | 8 +++----- hypervisor/cloudhypervisor/clone.go | 4 ++-- hypervisor/cloudhypervisor/stop.go | 1 - hypervisor/firecracker/clone.go | 10 ++++------ 8 files changed, 20 insertions(+), 64 deletions(-) diff --git a/cmd/core/helpers.go b/cmd/core/helpers.go index a3baee15..ee8108d3 100644 --- a/cmd/core/helpers.go +++ b/cmd/core/helpers.go @@ -41,13 +41,11 @@ var hypervisorFactories = []hypervisorFactory{ }}, } -// hypervisorFactory keeps backend lookup and iteration order together. type hypervisorFactory struct { typ config.HypervisorType ctor func(context.Context, *config.Config) (hypervisor.Hypervisor, error) } -// BaseHandler provides shared config access for all command handlers. type BaseHandler struct { ConfProvider func() *config.Config } @@ -75,7 +73,6 @@ func (h BaseHandler) Conf() (*config.Config, error) { return conf, nil } -// CommandContext returns cmd.Context() or Background (test-only fallback). func CommandContext(cmd *cobra.Command) context.Context { if cmd != nil && cmd.Context() != nil { return cmd.Context() @@ -160,7 +157,7 @@ func ListAllVMs(ctx context.Context, hypers []hypervisor.Hypervisor) ([]*types.V return all, nil } -// RouteRefs resolves user refs to (hypervisor → full VM IDs); downstream callers never re-resolve. +// RouteRefs resolves user refs to (hypervisor → full VM IDs). func RouteRefs(ctx context.Context, hypers []hypervisor.Hypervisor, refs []string) (map[hypervisor.Hypervisor][]string, error) { result := map[hypervisor.Hypervisor][]string{} for _, ref := range refs { @@ -338,7 +335,6 @@ func VMConfigFromFlags(cmd *cobra.Command, image string) (*types.VMConfig, error return cfg, nil } -// CloneVMConfigFromFlags builds VMConfig for clone (inherits from snapshot). func CloneVMConfigFromFlags(cmd *cobra.Command, snapCfg types.SnapshotConfig) (*types.VMConfig, error) { vmName, _ := cmd.Flags().GetString("name") flagNetwork, _ := cmd.Flags().GetString("network") @@ -354,7 +350,6 @@ func CloneVMConfigFromFlags(cmd *cobra.Command, snapCfg types.SnapshotConfig) (* onDemand, _ := cmd.Flags().GetBool("on-demand") - // Validate runs in prepareClone, after the default name is filled in. return &types.VMConfig{ Name: vmName, Config: types.Config{ @@ -389,7 +384,6 @@ func RestoreVMConfigFromFlags(cmd *cobra.Command, vm *types.VM, snapCfg types.Sn Name: vm.Config.Name, OnDemand: onDemand, } - // Guard against tampered --from-dir --force envelopes. if err := result.Validate(); err != nil { return nil, fmt.Errorf("snapshot config: %w", err) } @@ -402,7 +396,6 @@ func EnsureFirmwarePath(conf *config.Config, bootCfg *types.BootConfig) { } } -// ReconcileState detects stale running records via process liveness. func ReconcileState(vm *types.VM) string { if vm.State == types.VMStateRunning && !utils.IsProcessAlive(vm.PID) { return "stopped (stale)" @@ -420,18 +413,16 @@ func AddFormatFlag(cmd *cobra.Command) { cmd.Flags().StringP("format", "o", "table", `output format: "table" or "json"`) } -// AddOutputFlag adds --output/-o for lifecycle commands. Empty default keeps the human-readable log output; "json" emits a parseable result on stdout. func AddOutputFlag(cmd *cobra.Command) { cmd.Flags().StringP("output", "o", "", `emit "json" for machine-readable output`) } -// WantJSON reports whether --output=json was requested. func WantJSON(cmd *cobra.Command) bool { out, _ := cmd.Flags().GetString("output") return out == "json" } -// MaybeOutputJSON emits JSON iff --output=json; (true, _) means handled and the caller should stop logging. +// MaybeOutputJSON emits JSON iff --output=json; (true, _) means caller should stop logging. func MaybeOutputJSON(cmd *cobra.Command, v any) (bool, error) { if !WantJSON(cmd) { return false, nil @@ -439,7 +430,6 @@ func MaybeOutputJSON(cmd *cobra.Command, v any) (bool, error) { return true, OutputJSON(v) } -// OutputFormatted outputs as JSON or table based on --format flag. func OutputFormatted(cmd *cobra.Command, data any, tableFn func(w *tabwriter.Writer)) error { format, _ := cmd.Flags().GetString("format") if format == "json" { @@ -542,11 +532,9 @@ func resolveVMOwner(ctx context.Context, hypers []hypervisor.Hypervisor, ref str return owner, resolved, err } -// sanitizeVMName derives a safe VM name from an image reference. func sanitizeVMName(image string) string { ref, err := name.ParseReference(image) if err != nil { - // Unparseable — fall back to simple replace. n := strings.ReplaceAll(image, "/", "-") n = strings.ReplaceAll(n, ":", "-") n = "cocoon-" + n @@ -556,14 +544,10 @@ func sanitizeVMName(image string) string { return n } - // RepositoryStr() strips the registry hostname. - // Docker Hub official images get "library/" prepended — strip it. - repo := ref.Context().RepositoryStr() - repo = strings.TrimPrefix(repo, "library/") - + repo := strings.TrimPrefix(ref.Context().RepositoryStr(), "library/") n := "cocoon-" + strings.ReplaceAll(repo, "/", "-") - // Append tag (but not digest — it's too long and not human-readable). + // Skip digest (too long); use tag if not latest. if tag, ok := ref.(name.Tag); ok && tag.TagStr() != "latest" { n += "-" + tag.TagStr() } diff --git a/cmd/snapshot/handler.go b/cmd/snapshot/handler.go index 56537513..8844b95d 100644 --- a/cmd/snapshot/handler.go +++ b/cmd/snapshot/handler.go @@ -42,7 +42,6 @@ func (h Handler) Save(cmd *cobra.Command, args []string) error { name, _ := cmd.Flags().GetString("name") description, _ := cmd.Flags().GetString("description") - // Pre-check: reject if the snapshot name is already taken. if name != "" { if _, inspectErr := snapBackend.Inspect(ctx, name); inspectErr == nil { return fmt.Errorf("snapshot name %q already exists", name) @@ -59,8 +58,7 @@ func (h Handler) Save(cmd *cobra.Command, args []string) error { } defer stream.Close() //nolint:errcheck - // Close stream on context cancellation to unblock the pipe immediately, - // so Ctrl+C doesn't hang while streaming large snapshot data. + // Close stream on ctx cancel so Ctrl+C doesn't hang on the pipe. stop := context.AfterFunc(ctx, func() { stream.Close() //nolint:errcheck,gosec }) @@ -90,7 +88,6 @@ func (h Handler) List(cmd *cobra.Command, _ []string) error { return err } - // Optional: filter by VM ownership. vmRef, _ := cmd.Flags().GetString("vm") var filterIDs map[string]struct{} if vmRef != "" { @@ -114,7 +111,6 @@ func (h Handler) List(cmd *cobra.Command, _ []string) error { return fmt.Errorf("list: %w", err) } - // Apply VM filter if specified. if filterIDs != nil { filtered := snapshots[:0] for _, s := range snapshots { @@ -209,7 +205,6 @@ func (h Handler) Export(cmd *cobra.Command, args []string) (err error) { }) defer stop() - // Stream to stdout when output is "-". if output == "-" { if _, err = io.Copy(os.Stdout, stream); err != nil { return fmt.Errorf("write archive: %w", err) @@ -217,7 +212,6 @@ func (h Handler) Export(cmd *cobra.Command, args []string) (err error) { return nil } - // Derive default output filename from snapshot name or ID. if output == "" { snap, inspectErr := snapBackend.Inspect(ctx, ref) if inspectErr != nil { diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index 2eddc7b2..14b2d65d 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -27,20 +27,15 @@ import ( const ( // logHeadSigLen spans CH/FC's boot timestamp on line 1. logHeadSigLen = 64 - // logFollowDebounce coalesces fsnotify events on the log file before - // the catch-up io.Copy fires. + // logFollowDebounce coalesces fsnotify events before catch-up io.Copy fires. logFollowDebounce = 100 * time.Millisecond ) -// attachedDevices is the inspect-only view of runtime hot-plugged devices. -// Cocoon never persists this structure; it is read from CH vm.info on demand. type attachedDevices struct { Fs []fs.Attached `json:"fs,omitempty"` Devices []vfio.Attached `json:"devices,omitempty"` } -// inspectOutput wraps types.VM with an extra runtime field. Defined in the -// CLI layer to keep types.VM free of inspect-only fields. type inspectOutput struct { *types.VM AttachedDevices *attachedDevices `json:"attached_devices,omitempty"` @@ -61,7 +56,6 @@ func (h Handler) Start(cmd *cobra.Command, args []string) error { return err } - // Recover network for all backends before starting. for hyper, refs := range routed { h.recoverNetwork(ctx, conf, hyper, refs) } @@ -239,7 +233,6 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { return fmt.Errorf("vm(s) deleted but network cleanup failed: %w", delErr) } } - // Also clean up bridge TAPs (no-op if none exist). bridgenet.CleanupTAPs(allDeleted) } @@ -258,17 +251,15 @@ func (h Handler) RM(cmd *cobra.Command, args []string) error { func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper hypervisor.Hypervisor, refs []string) { logger := log.WithFunc("cmd.vm.recoverNetwork") - // Lazy-init CNI provider (may fail if not configured — OK for bridge-only setups). + // Lazy CNI; OK to skip for bridge-only setups. var cniProvider network.Network if p, err := cmdcore.InitNetwork(conf); err == nil { cniProvider = p } - // Cache bridge providers by device name to avoid redundant netlink lookups. bridgeProviders := map[string]network.Network{} // Single List → byID map avoids one Inspect-per-ref under DB lock. - // Refs are pre-resolved full IDs (RouteRefs returns vm.ID), so byID is sufficient. all, err := hyper.List(ctx) if err != nil { logger.Warnf(ctx, "list VMs for recovery: %v", err) @@ -288,7 +279,6 @@ func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper if backend == "" { continue } - // Bridge 0-NIC: no TAP, no netns — nothing to recover. if backend == types.BackendBridge && len(vm.NetworkConfigs) == 0 { continue } @@ -371,7 +361,6 @@ func batchRoutedCmd(ctx context.Context, cmd *cobra.Command, name, pastTense str } // collectAttachedDevices reads fs/vfio devices; errors are logged and dropped so inspect tolerates a flaky vm.info. -// TODO(inspect): each Lister calls vm.info separately; combine via extend/ Lister to halve the round-trips. func collectAttachedDevices(ctx context.Context, hyper hypervisor.Hypervisor, ref string) *attachedDevices { logger := log.WithFunc("cmd.vm.inspect") out := &attachedDevices{} @@ -434,7 +423,7 @@ func streamLog(ctx context.Context, path string, follow bool, tail int) error { if !ok { return nil } - // Stop/start re-opens O_TRUNC; head bytes shift because CH/FC stamp a unique boot timestamp on line 1, so sig mismatch catches new generations even at the same length. + // Sig mismatch catches O_TRUNC re-opens (CH/FC stamp a unique boot timestamp on line 1). newSig, _ := utils.FileHead(f, logHeadSigLen) if !bytes.Equal(newSig, sig) { if _, err := f.Seek(0, io.SeekStart); err != nil { diff --git a/cmd/vm/run.go b/cmd/vm/run.go index 947e1355..c35dad82 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -53,7 +53,6 @@ func (h Handler) Run(cmd *cobra.Command, args []string) error { return fmt.Errorf("start VM %s: %w", vm.ID, err) } if wantJSON { - // Re-inspect for post-start state; on err, fall back to pre-start vm so JSON isn't silently stale. info, inspectErr := hyper.Inspect(ctx, vm.ID) switch { case inspectErr != nil: @@ -89,7 +88,6 @@ func (h Handler) Clone(cmd *cobra.Command, args []string) error { return err } - // Infer hypervisor backend from the snapshot's Hypervisor field. snapInfo, err := snapBackend.Inspect(ctx, snapRef) if err != nil { return fmt.Errorf("inspect snapshot %s: %w", snapRef, err) @@ -261,8 +259,7 @@ func (h Handler) cloneFromDir(ctx context.Context, cmd *cobra.Command, conf *con if err != nil { return fmt.Errorf("load envelope: %w", err) } - // Local copy so flipping the backend selection doesn't leak to the caller's - // shared *config.Config (CLI is fine, daemons embedding cocoon would notice). + // Local copy keeps backend flip from leaking to the caller's shared *config.Config. localConf := *conf if cfg.Hypervisor != "" { localConf.UseFirecracker = cfg.Hypervisor == string(config.HypervisorFirecracker) @@ -317,7 +314,6 @@ func (h Handler) prepareClone(ctx context.Context, cmd *cobra.Command, conf *con return nil, "", nil, types.NetSetup{}, err } - // Auto-pull base image if --pull is set (cross-node clone). if pull, _ := cmd.Flags().GetBool("pull"); pull && vmCfg.Image != "" && vmCfg.ImageType != "" { backends, initErr := cmdcore.InitImageBackends(ctx, conf) if initErr != nil { @@ -391,7 +387,6 @@ func (h Handler) createVM(cmd *cobra.Command, image string) (context.Context, *t return nil, nil, nil, err } - // Validate backend/boot-mode constraints before initializing backends. if conf.UseFirecracker && vmCfg.Windows { return nil, nil, nil, fmt.Errorf("--fc and --windows are mutually exclusive: Firecracker does not support Windows guests") } @@ -484,7 +479,7 @@ func initNetwork(ctx context.Context, conf *config.Config, vmID string, nics int if nics <= 0 { return netProvider, setup, nil } - // Override CPU for TAP queue count (FC=1, CH=per-vCPU); network reads vmCfg.CPU. + // FC needs 1 TAP queue, CH needs per-vCPU; network reads vmCfg.CPU. origCPU := vmCfg.CPU vmCfg.CPU = queues configs, err := netProvider.Add(ctx, vmID, vmCfg, network.AddRange(0, nics)...) @@ -529,8 +524,7 @@ func printPostCloneHints(vm *types.VM) { fmt.Println(" # Release memory for balloon") fmt.Println(" echo 3 > /proc/sys/vm/drop_caches") - // FC clone: guest MAC is baked in vmstate (source VM's MAC). - // Must change guest MAC before networkd config takes effect. + // FC clone: guest MAC is baked in vmstate; change it before networkd config. if vm.Hypervisor == string(config.HypervisorFirecracker) { printFCMACHints(vm.NetworkConfigs) } diff --git a/cmd/vm/status.go b/cmd/vm/status.go index 20a0276e..0dadf1c2 100644 --- a/cmd/vm/status.go +++ b/cmd/vm/status.go @@ -21,7 +21,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// statusWatchDebounce coalesces fsnotify events on the per-backend index file during `vm status` polling. const statusWatchDebounce = 200 * time.Millisecond type vmEvent struct { @@ -103,7 +102,7 @@ func (h Handler) Status(cmd *cobra.Command, args []string) error { return nil } -// statusOnce prints a single snapshot then returns; propagates ListAllVMs error (loop callers swallow). +// statusOnce prints one snapshot; propagates ListAllVMs error (loop callers swallow). func statusOnce(ctx context.Context, hypers []hypervisor.Hypervisor, filters []string, format string) error { vms, err := cmdcore.ListAllVMs(ctx, hypers) if err != nil { @@ -114,7 +113,6 @@ func statusOnce(ctx context.Context, hypers []hypervisor.Hypervisor, filters []s return renderVMList(vms, format) } -// renderVMList emits vms as JSON or table; "No VMs found." for empty in table mode. func renderVMList(vms []*types.VM, format string) error { if format == "json" { if vms == nil { @@ -233,7 +231,7 @@ func statusEventLoopJSON(ctx context.Context, hypers []hypervisor.Hypervisor, fi }) } -// statusEventDiffLoop snapshots every tick, diffs vs previous, emits ADDED/MODIFIED/DELETED. Carries both snap and vm so emitters pick either. +// statusEventDiffLoop snapshots every tick, diffs vs previous, emits ADDED/MODIFIED/DELETED. func statusEventDiffLoop(ctx context.Context, hypers []hypervisor.Hypervisor, filters []string, watchCh <-chan struct{}, tick <-chan time.Time, emitter eventEmitter) { type entry struct { snap vmSnapshot @@ -292,7 +290,7 @@ func printEventRow(w *tabwriter.Writer, event string, snap vmSnapshot) { snap.ip, snap.image) } -// listAndFilter swallows backend errors with a warn so a transient hiccup can't break the polling tick; one-shot callers must use cmdcore.ListAllVMs directly. +// listAndFilter warns on backend errors so polling ticks don't break; one-shot callers use cmdcore.ListAllVMs directly. func listAndFilter(ctx context.Context, hypers []hypervisor.Hypervisor, filters []string) []*types.VM { vms, err := cmdcore.ListAllVMs(ctx, hypers) if err != nil { diff --git a/hypervisor/cloudhypervisor/clone.go b/hypervisor/cloudhypervisor/clone.go index 1f8a544f..ff807fe9 100644 --- a/hypervisor/cloudhypervisor/clone.go +++ b/hypervisor/cloudhypervisor/clone.go @@ -239,7 +239,7 @@ func hasCidataRole(sc *types.StorageConfig) bool { return sc.Role == types.StorageRoleCidata } -// restorePatchStorageConfigs strips ensureCloneCidata's appended cidata when the snapshot lacked one, so patchCHConfig matches chCfg.Disks; cidata gets hot-plugged. +// restorePatchStorageConfigs drops the appended cidata when the snapshot lacked one (cidata gets hot-plugged). func restorePatchStorageConfigs(storageConfigs []*types.StorageConfig, directBoot, windows, hadCidataInSnapshot bool) []*types.StorageConfig { if directBoot || windows || hadCidataInSnapshot { return storageConfigs @@ -285,7 +285,7 @@ func buildCmdline(storageConfigs []*types.StorageConfig, networkConfigs []*types ) } -// buildStateReplacements maps source disk paths → clone paths for state.json patching; slices to min length so an appended cidata doesn't desync (MACs go via NIC hot-swap). +// buildStateReplacements maps source→clone disk paths for state.json; min-length slice keeps appended cidata aligned. func buildStateReplacements(chCfg *chVMConfig, storageConfigs []*types.StorageConfig) map[string]string { n := min(len(chCfg.Disks), len(storageConfigs)) m := make(map[string]string, n) diff --git a/hypervisor/cloudhypervisor/stop.go b/hypervisor/cloudhypervisor/stop.go index 894a1c66..d9c906a9 100644 --- a/hypervisor/cloudhypervisor/stop.go +++ b/hypervisor/cloudhypervisor/stop.go @@ -47,7 +47,6 @@ func (ch *CloudHypervisor) forceTerminate(ctx context.Context, hc *http.Client, return utils.TerminateProcess(ctx, pid, ch.conf.BinaryName(), socketPath, ch.conf.TerminateGracePeriod()) } -// isDirectBoot returns true when the VM was started with a direct kernel boot (OCI images). False means UEFI boot (cloudimg). func isDirectBoot(boot *types.BootConfig) bool { return boot != nil && boot.KernelPath != "" } diff --git a/hypervisor/firecracker/clone.go b/hypervisor/firecracker/clone.go index 9ceb4aa9..ca7b4055 100644 --- a/hypervisor/firecracker/clone.go +++ b/hypervisor/firecracker/clone.go @@ -70,7 +70,7 @@ func (fc *Firecracker) cloneAfterExtract(ctx context.Context, vmID string, vmCfg bootCfg.Cmdline = buildCmdline(storageConfigs, networkConfigs, vmCfg.Name, dns) } - // FC's snapshot/load wants source-absolute drive paths; symlink-redirect the source COW until upstream supports drive overrides. + // FC snapshot/load wants source-absolute drive paths; symlink-redirect the source COW. sockPath := hypervisor.SocketPath(runDir) var pid int if cloneErr := withSourceWritableDisksLocked(meta.StorageConfigs, func() error { @@ -123,7 +123,7 @@ func (fc *Firecracker) restoreAndResumeClone( } }() - // network_overrides repoints FC at the clone's TAP (FC recreates from vmstate); vsock_override retargets the snapshot UDS. + // network_overrides repoints FC at the clone's TAP; vsock_override retargets the snapshot UDS. netOverrides := buildNetworkOverrides(networkConfigs) if err = loadSnapshotFC(ctx, sockPath, runDir, netOverrides, hypervisor.VsockSockPath(runDir)); err != nil { return fmt.Errorf("snapshot/load: %w", err) @@ -135,7 +135,7 @@ func (fc *Firecracker) restoreAndResumeClone( return nil } -// rebuildCloneStorage rewrites paths per role: Layer→source (shared), COW→cowPath, Data→clone runDir; cidata rejected (FC has no cloudimg). +// rebuildCloneStorage rewrites paths per role (Layer→source, COW→cowPath, Data→runDir); cidata rejected. func rebuildCloneStorage(meta *hypervisor.SnapshotMeta, cowPath string) ([]*types.StorageConfig, error) { runDir := filepath.Dir(cowPath) configs := hypervisor.CloneStorageConfigs(meta.StorageConfigs) @@ -155,7 +155,6 @@ func rebuildCloneStorage(meta *hypervisor.SnapshotMeta, cowPath string) ([]*type return configs, nil } -// createDriveRedirects symlinks source COW → clone COW so FC snapshot/load finds the drive at the expected source path. func createDriveRedirects(srcConfigs, dstConfigs []*types.StorageConfig) ([]driveRedirect, error) { var redirects []driveRedirect for i, src := range srcConfigs { @@ -205,8 +204,7 @@ func cleanupDriveRedirects(redirects []driveRedirect) { } } -// recoverStaleBackup restores a backup file left by a crashed clone. -// Caller must hold the COW lock. +// recoverStaleBackup restores a crashed-clone backup; caller must hold the COW lock. func recoverStaleBackup(cowPath string) { backup := cowPath + cloneBackupSuffix if _, err := os.Stat(backup); err != nil { From 83a5efac71111f06b7ed0a0268a8b0cf05d88fef Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 13:19:22 +0800 Subject: [PATCH 10/13] fix(metering): emit compute.stop only from confirmed-dead paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UpdateStates(Error) no longer emits. Many MarkError callers (Shutdown errors, ctx-cancel, AbortLaunch best-effort) can't prove the process is dead, so closing the interval there risked a still-alive VM with no compute account. PrepareStart now closes the stale interval (DB Running + process not alive) via a new closeStaleComputeInterval helper — the only point at start time where we can prove the old VMM is gone. BatchMarkStarted's stale-close branch stays as a safety net. --- hypervisor/start.go | 4 +- hypervisor/state.go | 24 +++++++++-- hypervisor/state_test.go | 87 ++++++++++++++++++++++------------------ 3 files changed, 72 insertions(+), 43 deletions(-) diff --git a/hypervisor/start.go b/hypervisor/start.go index ec9fa61e..11ac697b 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -83,7 +83,9 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st case runErr == nil: return nil, nil // already running case errors.Is(runErr, ErrNotRunning): - // expected — proceed to start + if rec.State == types.VMStateRunning { + b.closeStaleComputeInterval(ctx, &rec) + } default: return nil, fmt.Errorf("reconcile running VM %s: %w", id, runErr) } diff --git a/hypervisor/state.go b/hypervisor/state.go index 06b5ccdf..47eb2335 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -88,7 +88,7 @@ func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume }) } -// UpdateStates batch-updates State + StartedAt/StoppedAt; emits vm.compute.stop on Running→{Stopped,Error}. +// UpdateStates batch-updates State + StartedAt/StoppedAt; only Running→Stopped emits compute.stop (Error paths can't prove the process is dead). func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VMState) error { if len(ids) == 0 { return nil @@ -114,9 +114,6 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM } case types.VMStateError: r.StoppedAt = &now - if oldState == types.VMStateRunning { - stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shapeFromConfig(r.Config), now)) - } } } return nil @@ -186,3 +183,22 @@ func (b *Backend) CleanStalePlaceholders(_ context.Context, ids []string) error return nil }) } + +// closeStaleComputeInterval flips Running→Stopped and emits stop-crash; precondition: caller confirmed the process is dead. +func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord) { + now := time.Now() + if err := b.DB.Update(ctx, func(idx *VMIndex) error { + r := idx.VMs[rec.ID] + if r == nil || r.State != types.VMStateRunning { + return nil + } + r.State = types.VMStateStopped + r.StoppedAt = &now + r.UpdatedAt = now + return nil + }); err != nil { + log.WithFunc(b.Typ+".closeStaleComputeInterval").Warnf(ctx, "flip %s to stopped: %v", rec.ID, err) + return + } + b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now)) +} diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 89b18b30..70ec6702 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -28,23 +28,18 @@ type stubBackendConfig struct { indexLock string } -func (stubBackendConfig) BinaryName() string { panic("BinaryName: not implemented in stub") } -func (stubBackendConfig) PIDFileName() string { panic("PIDFileName: not implemented in stub") } -func (stubBackendConfig) TerminateGracePeriod() time.Duration { - panic("TerminateGracePeriod: not implemented in stub") -} - -func (stubBackendConfig) SocketWaitTimeout() time.Duration { - panic("SocketWaitTimeout: not implemented in stub") -} -func (stubBackendConfig) EffectivePoolSize() int { return 1 } -func (c stubBackendConfig) IndexFile() string { return c.indexFile } -func (c stubBackendConfig) IndexLock() string { return c.indexLock } -func (stubBackendConfig) EnsureDirs() error { return nil } -func (stubBackendConfig) RunDir() string { panic("RunDir: not implemented in stub") } -func (stubBackendConfig) LogDir() string { panic("LogDir: not implemented in stub") } -func (stubBackendConfig) VMRunDir(string) string { panic("VMRunDir: not implemented in stub") } -func (stubBackendConfig) VMLogDir(string) string { panic("VMLogDir: not implemented in stub") } +func (stubBackendConfig) BinaryName() string { return "stub-vmm" } +func (stubBackendConfig) PIDFileName() string { return "stub.pid" } +func (stubBackendConfig) TerminateGracePeriod() time.Duration { return time.Second } +func (stubBackendConfig) SocketWaitTimeout() time.Duration { return time.Second } +func (stubBackendConfig) EffectivePoolSize() int { return 1 } +func (c stubBackendConfig) IndexFile() string { return c.indexFile } +func (c stubBackendConfig) IndexLock() string { return c.indexLock } +func (stubBackendConfig) EnsureDirs() error { return nil } +func (stubBackendConfig) RunDir() string { panic("RunDir: not implemented in stub") } +func (stubBackendConfig) LogDir() string { panic("LogDir: not implemented in stub") } +func (stubBackendConfig) VMRunDir(string) string { panic("VMRunDir: not implemented in stub") } +func (stubBackendConfig) VMLogDir(string) string { panic("VMLogDir: not implemented in stub") } func newMeteringTestBackend(t *testing.T) (*Backend, *metering.CaptureRecorder) { t.Helper() @@ -119,7 +114,7 @@ func TestBatchMarkStartedReasonRestartWhenAlreadyBooted(t *testing.T) { } } -func TestUpdateStatesEmitsOnRunningToStoppedOrError(t *testing.T) { +func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { b, cap := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) @@ -128,7 +123,7 @@ func TestUpdateStatesEmitsOnRunningToStoppedOrError(t *testing.T) { t.Fatalf("UpdateStates(stopped from created): %v", err) } if got := cap.Entries(); len(got) != 0 { - t.Errorf("Created→Stopped emitted %d; want 0 (no Running interval to close)", len(got)) + t.Errorf("Created→Stopped emitted %d; want 0", len(got)) } if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { @@ -146,36 +141,52 @@ func TestUpdateStatesEmitsOnRunningToStoppedOrError(t *testing.T) { t.Fatalf("Running→Stopped: got %+v, want one compute.stop reason=user", entries) } - if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { - t.Fatalf("UpdateStates(stopped idempotent): %v", err) - } - if got := cap.Entries(); len(got) != 1 { - t.Errorf("Stopped→Stopped should not re-emit; got %d entries total", len(got)) - } - if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { t.Fatalf("UpdateStates(running again): %v", err) } + cap.Reset() if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { t.Fatalf("UpdateStates(error): %v", err) } - entries = cap.Entries() - if len(entries) != 2 || entries[1].Kind != metering.KindVMComputeStop || entries[1].Reason != metering.ReasonStopCrash { - t.Fatalf("Running→Error: got %+v, want compute.stop reason=stop-crash as 2nd entry", entries) + if got := cap.Entries(); len(got) != 0 { + t.Errorf("Running→Error must not emit; got %d entries", len(got)) } +} - if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { - t.Fatalf("UpdateStates(error idempotent): %v", err) +func TestPrepareStartClosesStaleInterval(t *testing.T) { + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + dir := t.TempDir() + if err := b.DB.Update(ctx, func(idx *VMIndex) error { + idx.VMs["vm1"].RunDir = dir + idx.VMs["vm1"].LogDir = dir + return nil + }); err != nil { + t.Fatalf("set dirs: %v", err) + } + cap.Reset() + + rec, err := b.PrepareStart(ctx, "vm1", nil) + if err != nil { + t.Fatalf("PrepareStart: %v", err) + } + if rec == nil { + t.Fatal("PrepareStart returned nil (treated as already-running)") } - if got := cap.Entries(); len(got) != 2 { - t.Errorf("Error→Error must not re-emit; got %d entries total", len(got)) + entries := cap.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopCrash { + t.Fatalf("got %+v, want one compute.stop reason=stop-crash", entries) + } + loaded, err := b.LoadRecord(ctx, "vm1") + if err != nil { + t.Fatalf("LoadRecord: %v", err) } - seedVMRecord(t, b, "vm2", 1, 1<<30, 10<<30, false) - if err := b.UpdateStates(ctx, []string{"vm2"}, types.VMStateError); err != nil { - t.Fatalf("UpdateStates(vm2 error from created): %v", err) + if loaded.State != types.VMStateStopped { + t.Errorf("State=%s, want Stopped", loaded.State) } - if got := cap.Entries(); len(got) != 2 { - t.Errorf("Created→Error must not emit; got %d entries total", len(got)) + if loaded.StoppedAt == nil { + t.Error("StoppedAt nil") } } From 4018f61801548181ba5cd1661cc7187294efd650 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 14:01:12 +0800 Subject: [PATCH 11/13] fix(metering): use StoppedAt sentinel for open compute interval + audit sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metering: - UpdateStates(Error) no longer writes StoppedAt — leaves it nil so the compute interval stays open in the ledger. Many MarkError callers (Shutdown error, ctx-cancel, AbortLaunch best-effort) can't prove the process is dead. - closeStaleComputeInterval + DeleteAll + BatchMarkStarted now gate on hasOpenComputeInterval(rec) rather than rec.State == Running. After MarkError leaves State=Error with an open interval, the next start or rm --force confirms the process is dead and closes the account. - New tests cover Running→MarkError→PrepareStart and Running→MarkError→rm --force paths. audit sweep (~80 pre-existing comments deleted/compressed): - images/baseconfig.go, images/{cloudimg,oci}/{config,image,*.go,oci.go} - hypervisor/inspect.go, hypervisor/cloudhypervisor/extend.go - network/bridge/{bridge_other,bridge_linux}.go, network/cni/config.go - console/relay.go, lock/flock/flock.go, storage/json/json.go - gc/orchestrator.go - cmd/{vm,images,others,snapshot}/commands.go --- cmd/images/commands.go | 2 - cmd/others/commands.go | 2 - cmd/snapshot/commands.go | 2 - cmd/vm/commands.go | 2 - console/relay.go | 6 +-- gc/orchestrator.go | 13 ++----- hypervisor/cloudhypervisor/extend.go | 13 +------ hypervisor/inspect.go | 8 +--- hypervisor/start.go | 2 +- hypervisor/state.go | 22 ++++++++--- hypervisor/state_test.go | 58 ++++++++++++++++++++++++++++ hypervisor/stop.go | 2 +- images/baseconfig.go | 30 ++++---------- images/cloudimg/cloudimg.go | 9 +---- images/cloudimg/config.go | 7 +--- images/cloudimg/image.go | 26 ++++--------- images/oci/config.go | 9 ----- images/oci/image.go | 24 ++++-------- images/oci/oci.go | 18 +++------ lock/flock/flock.go | 6 +-- network/bridge/bridge_linux.go | 20 ++++------ network/bridge/bridge_other.go | 24 +++--------- network/cni/config.go | 23 +++-------- storage/json/json.go | 11 ++---- 24 files changed, 138 insertions(+), 201 deletions(-) diff --git a/cmd/images/commands.go b/cmd/images/commands.go index e75df363..33c4f926 100644 --- a/cmd/images/commands.go +++ b/cmd/images/commands.go @@ -6,7 +6,6 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" ) -// Actions defines image management operations. type Actions interface { Pull(cmd *cobra.Command, args []string) error Import(cmd *cobra.Command, args []string) error @@ -15,7 +14,6 @@ type Actions interface { Inspect(cmd *cobra.Command, args []string) error } -// Command builds the "image" parent command with all subcommands. func Command(h Actions) *cobra.Command { imageCmd := &cobra.Command{ Use: "image", diff --git a/cmd/others/commands.go b/cmd/others/commands.go index 2e959936..8c6a1ebb 100644 --- a/cmd/others/commands.go +++ b/cmd/others/commands.go @@ -7,13 +7,11 @@ import ( "github.com/spf13/cobra" ) -// Actions defines cross-cutting system operations. type Actions interface { GC(cmd *cobra.Command, args []string) error Version(cmd *cobra.Command, args []string) error } -// Commands builds system command set (gc, version, completion). func Commands(h Actions) []*cobra.Command { gcCmd := &cobra.Command{ Use: "gc", diff --git a/cmd/snapshot/commands.go b/cmd/snapshot/commands.go index 8aa5fcf7..72a47e0f 100644 --- a/cmd/snapshot/commands.go +++ b/cmd/snapshot/commands.go @@ -6,7 +6,6 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" ) -// Actions defines snapshot management operations. type Actions interface { Save(cmd *cobra.Command, args []string) error List(cmd *cobra.Command, args []string) error @@ -16,7 +15,6 @@ type Actions interface { Import(cmd *cobra.Command, args []string) error } -// Command builds the "snapshot" parent command with all subcommands. func Command(h Actions) *cobra.Command { snapshotCmd := &cobra.Command{ Use: "snapshot", diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index 5c97ede4..041468c9 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -8,7 +8,6 @@ import ( cmdcore "github.com/cocoonstack/cocoon/cmd/core" ) -// Actions defines VM lifecycle operations. type Actions interface { Create(cmd *cobra.Command, args []string) error Run(cmd *cobra.Command, args []string) error @@ -31,7 +30,6 @@ type Actions interface { NetResize(cmd *cobra.Command, args []string) error } -// Command builds the "vm" parent command with all subcommands. func Command(h Actions) *cobra.Command { vmCmd := &cobra.Command{ Use: "vm", diff --git a/console/relay.go b/console/relay.go index 6d8c5c9c..1ca9f261 100644 --- a/console/relay.go +++ b/console/relay.go @@ -10,7 +10,7 @@ import ( "github.com/moby/term" ) -// Relay runs bidirectional I/O with escape-sequence detection; caller closes rw after Relay returns to unblock the second goroutine. +// Relay runs bidirectional I/O with escape-sequence detection. Caller must close rw to unblock the unfinished goroutine. func Relay(rw io.ReadWriter, escapeKeys []byte) error { errCh := make(chan error, 2) //nolint:mnd @@ -28,8 +28,6 @@ func Relay(rw io.ReadWriter, escapeKeys []byte) error { errCh <- err }() - // Wait for the first goroutine to finish. The caller's defer conn.Close() - // unblocks the other goroutine after Relay returns. err := <-errCh if isCleanExit(err) { return nil @@ -37,7 +35,6 @@ func Relay(rw io.ReadWriter, escapeKeys []byte) error { return err } -// FormatEscapeChar returns a human-readable representation of the escape byte. func FormatEscapeChar(b byte) string { if b >= 1 && b <= 0x1F { return "^" + string(rune(b+'@')) @@ -77,7 +74,6 @@ func validateEscapeByte(b byte) (byte, error) { return b, nil } -// isCleanExit returns true for errors that indicate a normal disconnect. func isCleanExit(err error) bool { if err == nil { return true diff --git a/gc/orchestrator.go b/gc/orchestrator.go index 62d2a008..90b7a920 100644 --- a/gc/orchestrator.go +++ b/gc/orchestrator.go @@ -17,21 +17,18 @@ type Orchestrator struct { modules []runner } -// New creates an empty Orchestrator. func New() *Orchestrator { return &Orchestrator{} } -// Register adds a typed Module; package-level (not a method) because Go methods can't have type params. +// Register is package-level because Go methods can't have type params. func Register[S any](o *Orchestrator, m Module[S]) { o.modules = append(o.modules, m) } -// Run executes one GC cycle: lock all modules, snapshot, resolve, collect. -// Fail-closed: any busy lock aborts the cycle so cross-module decisions stay consistent. +// Run executes one GC cycle: lock all → snapshot → resolve → collect. Fail-closed on busy locks to keep cross-module decisions consistent. func (o *Orchestrator) Run(ctx context.Context) error { start := time.Now() logger := log.WithFunc("gc.Run") - // Acquire all locks up front; hold until GC finishes. var locked []runner var skipped []string for _, m := range o.modules { @@ -54,12 +51,10 @@ func (o *Orchestrator) Run(ctx context.Context) error { } }() - // Fail-closed: skip aborts the cycle so cross-module references (e.g. VMs pinning blobs) aren't violated. if len(skipped) > 0 { return fmt.Errorf("gc aborted: modules skipped (lock busy): %s", strings.Join(skipped, ", ")) } - // Phase 1: snapshot all locked modules. snapshots := make(map[string]any, len(locked)) for _, m := range locked { snap, err := m.readSnapshot(ctx) @@ -69,7 +64,6 @@ func (o *Orchestrator) Run(ctx context.Context) error { snapshots[m.getName()] = snap } - // Phase 2: resolve deletion targets (cross-module via snapshots). targets := make(map[string][]string) for _, m := range locked { if ids := m.resolveTargets(ctx, snapshots[m.getName()], snapshots); len(ids) > 0 { @@ -77,7 +71,6 @@ func (o *Orchestrator) Run(ctx context.Context) error { } } - // Phase 3: collect (skip modules with no targets). var errs []error summary := make(map[string]int, len(locked)) failures := 0 @@ -97,7 +90,7 @@ func (o *Orchestrator) Run(ctx context.Context) error { return errors.Join(errs...) } -// formatSummary renders the per-module collection counts as `m1=N m2=M`, sorted by module name. +// formatSummary renders counts as `m1=N m2=M`, sorted. func formatSummary(s map[string]int) string { if len(s) == 0 { return "nothing to collect" diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index 2a33e5d6..1530c63d 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -25,7 +25,6 @@ var ( _ netresize.Resizer = (*CloudHypervisor)(nil) ) -// FsAttach hot-plugs a vhost-user-fs device onto a running CH VM. func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.Spec) (string, error) { if err := spec.Normalize(); err != nil { return "", err @@ -53,7 +52,6 @@ func (ch *CloudHypervisor) FsAttach(ctx context.Context, vmRef string, spec fs.S }) } -// FsDetach removes a previously attached vhost-user-fs device by tag. func (ch *CloudHypervisor) FsDetach(ctx context.Context, vmRef, tag string) error { if tag == "" { return fmt.Errorf("tag is required") @@ -68,7 +66,6 @@ func (ch *CloudHypervisor) FsDetach(ctx context.Context, vmRef, tag string) erro }) } -// FsList enumerates currently attached vhost-user-fs devices. func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attached, error) { return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []fs.Attached { out := make([]fs.Attached, 0, len(info.Config.Fs)) @@ -79,7 +76,6 @@ func (ch *CloudHypervisor) FsList(ctx context.Context, vmRef string) ([]fs.Attac }) } -// DeviceAttach hot-plugs a VFIO PCI passthrough device onto a running CH VM. func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec vfio.Spec) (string, error) { path, err := spec.NormalizedPath() if err != nil { @@ -109,7 +105,6 @@ func (ch *CloudHypervisor) DeviceAttach(ctx context.Context, vmRef string, spec }) } -// DeviceDetach removes a previously attached VFIO device by id. func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) error { if id == "" { return fmt.Errorf("id is required") @@ -124,7 +119,6 @@ func (ch *CloudHypervisor) DeviceDetach(ctx context.Context, vmRef, id string) e }) } -// DeviceList enumerates currently attached VFIO PCI passthrough devices. func (ch *CloudHypervisor) DeviceList(ctx context.Context, vmRef string) ([]vfio.Attached, error) { return listWith(ctx, ch, vmRef, func(info *chVMInfoResponse) []vfio.Attached { out := make([]vfio.Attached, 0, len(info.Config.Devices)) @@ -148,7 +142,6 @@ func (ch *CloudHypervisor) inspectRunning(ctx context.Context, vmRef string) (*h return hc, info, nil } -// attachWith is the shared skeleton for hot-add operations. func (ch *CloudHypervisor) attachWith( ctx context.Context, vmRef, endpoint string, body any, fallbackID string, @@ -184,7 +177,6 @@ func (ch *CloudHypervisor) attachWith( return fallbackID, nil } -// detachWith is the shared skeleton for hot-remove operations. func (ch *CloudHypervisor) detachWith( ctx context.Context, vmRef string, findID func(*chVMInfoResponse) (string, error), @@ -203,13 +195,12 @@ func (ch *CloudHypervisor) detachWith( return nil } -// runningVMClient resolves vmRef, asserts the CH process is alive (matches Backend.WithRunningVM), and returns an http.Client on its API socket. +// runningVMClient asserts the CH process is alive and returns an http.Client on its API socket. func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { hc, _, _, err := ch.runningVMClientWithRecord(ctx, vmRef) return hc, err } -// runningVMClientWithRecord is runningVMClient + the resolved vmID and record. func (ch *CloudHypervisor) runningVMClientWithRecord(ctx context.Context, vmRef string) (*http.Client, string, hypervisor.VMRecord, error) { vmID, err := ch.ResolveRef(ctx, vmRef) if err != nil { @@ -233,7 +224,7 @@ func (ch *CloudHypervisor) runningVMClientWithRecord(ctx context.Context, vmRef return utils.NewSocketHTTPClient(sockPath), vmID, rec, nil } -// listWith is the inspect-time enumeration skeleton; stopped VMs return nil (not an error) so inspect can omit the field cleanly. +// listWith returns nil (not error) for stopped VMs so inspect can omit the field. func listWith[A any]( ctx context.Context, ch *CloudHypervisor, vmRef string, extract func(*chVMInfoResponse) []A, diff --git a/hypervisor/inspect.go b/hypervisor/inspect.go index b3ce0075..5c914034 100644 --- a/hypervisor/inspect.go +++ b/hypervisor/inspect.go @@ -9,7 +9,6 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Inspect returns VM info for a single VM by ref (ID, name, or prefix). func (b *Backend) Inspect(ctx context.Context, ref string) (*types.VM, error) { var result *types.VM return result, b.DB.With(ctx, func(idx *VMIndex) error { @@ -22,7 +21,6 @@ func (b *Backend) Inspect(ctx context.Context, ref string) (*types.VM, error) { }) } -// List returns VM info for all known VMs. func (b *Backend) List(ctx context.Context) ([]*types.VM, error) { var result []*types.VM return result, b.DB.With(ctx, func(idx *VMIndex) error { @@ -31,7 +29,6 @@ func (b *Backend) List(ctx context.Context) ([]*types.VM, error) { }) } -// ToVM converts a stored VMRecord into the runtime types.VM exposed to callers. func (b *Backend) ToVM(rec *VMRecord) *types.VM { info := rec.VM // value copy info.Hypervisor = b.Typ @@ -47,7 +44,6 @@ func (b *Backend) ToVM(rec *VMRecord) *types.VM { return &info } -// ResolveRef resolves a single ref (ID, name, or prefix) to an exact VM ID. func (b *Backend) ResolveRef(ctx context.Context, ref string) (string, error) { var id string return id, b.DB.With(ctx, func(idx *VMIndex) error { @@ -57,7 +53,7 @@ func (b *Backend) ResolveRef(ctx context.Context, ref string) (string, error) { }) } -// ResolveRefs batch-resolves refs to exact VM IDs under a single lock. +// ResolveRefs batch-resolves under a single lock. func (b *Backend) ResolveRefs(ctx context.Context, refs []string) ([]string, error) { var ids []string return ids, b.DB.With(ctx, func(idx *VMIndex) error { @@ -67,7 +63,7 @@ func (b *Backend) ResolveRefs(ctx context.Context, refs []string) ([]string, err }) } -// LoadRecord returns a value-copy of the VMRecord for an exact VM ID. +// LoadRecord returns a value-copy of the VMRecord. func (b *Backend) LoadRecord(ctx context.Context, id string) (VMRecord, error) { var rec VMRecord return rec, b.DB.With(ctx, func(idx *VMIndex) error { diff --git a/hypervisor/start.go b/hypervisor/start.go index 11ac697b..5bfd29ba 100644 --- a/hypervisor/start.go +++ b/hypervisor/start.go @@ -83,7 +83,7 @@ func (b *Backend) PrepareStart(ctx context.Context, id string, runtimeFiles []st case runErr == nil: return nil, nil // already running case errors.Is(runErr, ErrNotRunning): - if rec.State == types.VMStateRunning { + if hasOpenComputeInterval(&rec) { b.closeStaleComputeInterval(ctx, &rec) } default: diff --git a/hypervisor/state.go b/hypervisor/state.go index 47eb2335..84a209df 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -113,7 +113,7 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopUser, shapeFromConfig(r.Config), now)) } case types.VMStateError: - r.StoppedAt = &now + // Don't write StoppedAt — many MarkError paths can't prove the process is dead, so the compute interval stays open in the ledger until a confirmed-dead path (closeStaleComputeInterval / DeleteAll) closes it. } } return nil @@ -145,7 +145,7 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { continue } shape := shapeFromConfig(r.Config) - if r.State == types.VMStateRunning { + if hasOpenComputeInterval(r) { emits = append(emits, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shape, now)) r.StoppedAt = &now } @@ -184,21 +184,31 @@ func (b *Backend) CleanStalePlaceholders(_ context.Context, ids []string) error }) } -// closeStaleComputeInterval flips Running→Stopped and emits stop-crash; precondition: caller confirmed the process is dead. +// closeStaleComputeInterval emits stop-crash and writes StoppedAt; precondition: caller confirmed the process is dead. func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord) { now := time.Now() if err := b.DB.Update(ctx, func(idx *VMIndex) error { r := idx.VMs[rec.ID] - if r == nil || r.State != types.VMStateRunning { + if r == nil || !hasOpenComputeInterval(r) { return nil } - r.State = types.VMStateStopped + if r.State == types.VMStateRunning { + r.State = types.VMStateStopped + } r.StoppedAt = &now r.UpdatedAt = now return nil }); err != nil { - log.WithFunc(b.Typ+".closeStaleComputeInterval").Warnf(ctx, "flip %s to stopped: %v", rec.ID, err) + log.WithFunc(b.Typ+".closeStaleComputeInterval").Warnf(ctx, "close interval for %s: %v", rec.ID, err) return } b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now)) } + +// hasOpenComputeInterval reports whether the VM still has an unclosed compute interval in the ledger. +func hasOpenComputeInterval(r *VMRecord) bool { + if r == nil || r.StartedAt == nil { + return false + } + return r.StoppedAt == nil || r.StartedAt.After(*r.StoppedAt) +} diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 70ec6702..8082e33b 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -153,6 +153,62 @@ func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { } } +func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { + // Running→Error must leave the interval open (UpdateStates(Error) doesn't write StoppedAt). The next PrepareStart confirms the process is dead and closes the interval. + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + dir := t.TempDir() + if err := b.DB.Update(ctx, func(idx *VMIndex) error { + idx.VMs["vm1"].RunDir = dir + idx.VMs["vm1"].LogDir = dir + return nil + }); err != nil { + t.Fatalf("set dirs: %v", err) + } + + b.MarkError(ctx, "vm1") + if got := cap.Entries(); len(got) != 0 { + t.Fatalf("MarkError emitted %d; want 0", len(got)) + } + loaded, _ := b.LoadRecord(ctx, "vm1") + if loaded.StoppedAt != nil { + t.Errorf("MarkError must not write StoppedAt; got %v", loaded.StoppedAt) + } + + rec, err := b.PrepareStart(ctx, "vm1", nil) + if err != nil { + t.Fatalf("PrepareStart: %v", err) + } + if rec == nil { + t.Fatal("PrepareStart returned nil") + } + entries := cap.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopCrash { + t.Fatalf("got %+v, want one compute.stop reason=stop-crash", entries) + } +} + +func TestDeleteForceClosesIntervalAfterMarkError(t *testing.T) { + // rm --force on an Error VM with a still-open interval must emit compute.stop, not just storage.stop. + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + b.MarkError(ctx, "vm1") + cap.Reset() + + loaded, _ := b.LoadRecord(ctx, "vm1") + if !hasOpenComputeInterval(&loaded) { + t.Fatal("interval should still be open after MarkError") + } + + b.emitDeleteClose(ctx, "vm1", shapeFromConfig(loaded.Config), metering.ReasonStopCrash, hasOpenComputeInterval(&loaded)) + entries := cap.Entries() + if len(entries) != 2 { + t.Fatalf("got %d entries, want 2 (compute.stop + storage.stop)", len(entries)) + } +} + func TestPrepareStartClosesStaleInterval(t *testing.T) { b, cap := newMeteringTestBackend(t) ctx := t.Context() @@ -225,7 +281,9 @@ func seedRunningVM(t *testing.T, b *Backend, id string, cpu int, mem, storage in t.Helper() seedVMRecord(t, b, id, cpu, mem, storage, true) if err := b.DB.Update(t.Context(), func(idx *VMIndex) error { + now := time.Now() idx.VMs[id].State = types.VMStateRunning + idx.VMs[id].StartedAt = &now return nil }); err != nil { t.Fatalf("set running: %v", err) diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 9a4b5411..2ed34925 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -115,7 +115,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop if r == nil { return ErrNotFound } - hadRunningInterval = r.State == types.VMStateRunning + hadRunningInterval = hasOpenComputeInterval(r) shape = shapeFromConfig(r.Config) delete(idx.Names, r.Config.Name) delete(idx.VMs, id) diff --git a/images/baseconfig.go b/images/baseconfig.go index 9c442522..c7dc19a3 100644 --- a/images/baseconfig.go +++ b/images/baseconfig.go @@ -7,38 +7,24 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// BaseConfig holds the common directory layout shared by all image backends. -// Each backend embeds BaseConfig and adds type-specific paths. +// BaseConfig is the directory layout shared by all image backends. type BaseConfig struct { Root *config.Config - Subdir string // backend subdirectory under RootDir, e.g. "oci" or "cloudimg" - BlobExt string // blob file extension, e.g. ".erofs" or ".qcow2" + Subdir string + BlobExt string } -// BackendDir returns the root directory for this image backend. func (c *BaseConfig) BackendDir() string { return filepath.Join(c.Root.RootDir, c.Subdir) } +func (c *BaseConfig) DBDir() string { return filepath.Join(c.BackendDir(), "db") } +func (c *BaseConfig) TempDir() string { return filepath.Join(c.BackendDir(), "temp") } +func (c *BaseConfig) BlobsDir() string { return filepath.Join(c.BackendDir(), "blobs") } +func (c *BaseConfig) IndexFile() string { return filepath.Join(c.DBDir(), "images.json") } +func (c *BaseConfig) IndexLock() string { return filepath.Join(c.DBDir(), "images.lock") } -// DBDir returns the database directory path. -func (c *BaseConfig) DBDir() string { return filepath.Join(c.BackendDir(), "db") } - -// TempDir returns the temporary working directory path. -func (c *BaseConfig) TempDir() string { return filepath.Join(c.BackendDir(), "temp") } - -// BlobsDir returns the blob storage directory path. -func (c *BaseConfig) BlobsDir() string { return filepath.Join(c.BackendDir(), "blobs") } - -// IndexFile returns the path to the image index JSON file. -func (c *BaseConfig) IndexFile() string { return filepath.Join(c.DBDir(), "images.json") } - -// IndexLock returns the path to the image index lock file. -func (c *BaseConfig) IndexLock() string { return filepath.Join(c.DBDir(), "images.lock") } - -// BlobPath returns the full path for a blob with the given digest hex. func (c *BaseConfig) BlobPath(hex string) string { return filepath.Join(c.BlobsDir(), hex+c.BlobExt) } -// EnsureBaseDirs creates the common directories (db, temp, blobs). func (c *BaseConfig) EnsureBaseDirs() error { return utils.EnsureDirs(c.DBDir(), c.TempDir(), c.BlobsDir()) } diff --git a/images/cloudimg/cloudimg.go b/images/cloudimg/cloudimg.go index ea7a6e61..d82962b0 100644 --- a/images/cloudimg/cloudimg.go +++ b/images/cloudimg/cloudimg.go @@ -30,7 +30,6 @@ type CloudImg struct { ops images.Ops[imageIndex, imageEntry] } -// New creates a new cloud image backend. func New(ctx context.Context, conf *config.Config) (*CloudImg, error) { cfg := NewConfig(conf) if err := cfg.EnsureDirs(); err != nil { @@ -55,10 +54,8 @@ func New(ctx context.Context, conf *config.Config) (*CloudImg, error) { return c, nil } -// Type returns the image backend identifier. func (c *CloudImg) Type() string { return typ } -// Pull downloads a cloud image and stores it in the blob cache. func (c *CloudImg) Pull(ctx context.Context, url string, force bool, tracker progress.Tracker) error { _, err, _ := c.pullGroup.Do(url, func() (any, error) { return nil, pull(ctx, c.conf, c.store, url, force, tracker) @@ -66,7 +63,6 @@ func (c *CloudImg) Pull(ctx context.Context, url string, force bool, tracker pro return err } -// Import imports local qcow2 file(s) as a cloud image. func (c *CloudImg) Import(ctx context.Context, name string, tracker progress.Tracker, file ...string) error { if len(file) == 1 { return importQcow2File(ctx, c.conf, c.store, name, tracker, file[0]) @@ -74,22 +70,19 @@ func (c *CloudImg) Import(ctx context.Context, name string, tracker progress.Tra return importQcow2Concat(ctx, c.conf, c.store, name, tracker, file...) } -// ImportFromReader imports a qcow2 image from a reader (stdin, gzip stream, etc.). func (c *CloudImg) ImportFromReader(ctx context.Context, name string, tracker progress.Tracker, r io.Reader) error { return importQcow2Reader(ctx, c.conf, c.store, name, tracker, r) } -// Inspect returns the record for a single image. Returns (nil, nil) if not found. +// Inspect returns (nil, nil) if not found. func (c *CloudImg) Inspect(ctx context.Context, id string) (*types.Image, error) { return c.ops.Inspect(ctx, id) } -// List returns all locally stored cloud images. func (c *CloudImg) List(ctx context.Context) ([]*types.Image, error) { return c.ops.List(ctx) } -// Delete removes images from the index. func (c *CloudImg) Delete(ctx context.Context, ids []string) ([]string, error) { return c.ops.Delete(ctx, ids) } diff --git a/images/cloudimg/config.go b/images/cloudimg/config.go index 7c727a20..a9bc6fd2 100644 --- a/images/cloudimg/config.go +++ b/images/cloudimg/config.go @@ -7,29 +7,26 @@ import ( "github.com/cocoonstack/cocoon/images" ) -// Config holds cloud image backend specific configuration, embedding the shared BaseConfig. type Config struct { images.BaseConfig } -// NewConfig creates a Config from a global config. func NewConfig(conf *config.Config) *Config { return &Config{BaseConfig: images.BaseConfig{ Root: conf, Subdir: "cloudimg", BlobExt: ".qcow2", }} } -// EnsureDirs creates all required directories for the cloudimg backend. func (c *Config) EnsureDirs() error { return c.EnsureBaseDirs() } -// FirmwarePath returns the path to the UEFI firmware blob (CLOUDHV.fd). +// FirmwarePath returns the UEFI firmware blob (CLOUDHV.fd) under conf.RootDir/firmware. func (c *Config) FirmwarePath() string { return filepath.Join(c.Root.RootDir, "firmware", "CLOUDHV.fd") } -// tmpBlobPath returns temp blob path for digest; naming ensures safe last-writer-wins collision handling. +// tmpBlobPath uses a hidden prefix so a partial write is safe under last-writer-wins. func (c *Config) tmpBlobPath(digestHex string) string { return filepath.Join(c.TempDir(), ".tmp-"+digestHex+".qcow2") } diff --git a/images/cloudimg/image.go b/images/cloudimg/image.go index 65f8a20d..d73805d4 100644 --- a/images/cloudimg/image.go +++ b/images/cloudimg/image.go @@ -11,20 +11,17 @@ type imageIndex struct { } type imageEntry struct { - Ref string `json:"ref"` // Original URL. - ContentSum images.Digest `json:"content_sum"` // SHA-256 of downloaded content. - Size int64 `json:"size"` // qcow2 blob size on disk. + Ref string `json:"ref"` + ContentSum images.Digest `json:"content_sum"` + Size int64 `json:"size"` CreatedAt time.Time `json:"created_at"` } -// Lookup finds an image entry by URL or content digest. -// Returns the ref key, entry, and whether it was found. +// Lookup finds an entry by URL or content digest. func (idx *imageIndex) Lookup(id string) (string, *imageEntry, bool) { - // Exact URL match. if entry, ok := idx.Images[id]; ok && entry != nil { return id, entry, true } - // Search by content digest. for ref, entry := range idx.Images { if entry != nil && (entry.ContentSum.String() == id || entry.ContentSum.Hex() == id) { return ref, entry, true @@ -33,23 +30,14 @@ func (idx *imageIndex) Lookup(id string) (string, *imageEntry, bool) { return "", nil, false } -// LookupRefs returns all ref keys matching id for DeleteByID. -// Delegates to shared images.LookupRefs (no normalizers needed for URLs). func (idx *imageIndex) LookupRefs(id string) []string { return images.LookupRefs(idx.Images, id) } -// EntryID returns content checksum as unique entry identifier. -func (e imageEntry) EntryID() string { return e.ContentSum.String() } - -// EntryRef returns image reference string. -func (e imageEntry) EntryRef() string { return e.Ref } - -// EntryCreatedAt returns when this entry was created. +func (e imageEntry) EntryID() string { return e.ContentSum.String() } +func (e imageEntry) EntryRef() string { return e.Ref } func (e imageEntry) EntryCreatedAt() time.Time { return e.CreatedAt } - -// DigestHexes returns hex-encoded content digest. -func (e imageEntry) DigestHexes() []string { return []string{e.ContentSum.Hex()} } +func (e imageEntry) DigestHexes() []string { return []string{e.ContentSum.Hex()} } func imageSizer(e *imageEntry) int64 { return e.Size diff --git a/images/oci/config.go b/images/oci/config.go index b02c14ca..ab6080fa 100644 --- a/images/oci/config.go +++ b/images/oci/config.go @@ -8,19 +8,16 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Config holds OCI image backend specific configuration, embedding the shared BaseConfig. type Config struct { images.BaseConfig } -// NewConfig creates an OCI Config from a global config. func NewConfig(conf *config.Config) *Config { return &Config{BaseConfig: images.BaseConfig{ Root: conf, Subdir: "oci", BlobExt: ".erofs", }} } -// EnsureDirs creates all required directories for the OCI backend. func (c *Config) EnsureDirs() error { if err := c.EnsureBaseDirs(); err != nil { return err @@ -28,22 +25,16 @@ func (c *Config) EnsureDirs() error { return utils.EnsureDirs(c.BootBaseDir()) } -// OCI-specific paths. - -// BootBaseDir returns the root directory for extracted boot files. func (c *Config) BootBaseDir() string { return filepath.Join(c.BackendDir(), "boot") } -// BootDir returns the boot directory for a specific layer digest. func (c *Config) BootDir(layerDigestHex string) string { return filepath.Join(c.BootBaseDir(), layerDigestHex) } -// KernelPath returns the vmlinuz path for a specific layer digest. func (c *Config) KernelPath(layerDigestHex string) string { return filepath.Join(c.BootDir(layerDigestHex), "vmlinuz") } -// InitrdPath returns the initrd.img path for a specific layer digest. func (c *Config) InitrdPath(layerDigestHex string) string { return filepath.Join(c.BootDir(layerDigestHex), "initrd.img") } diff --git a/images/oci/image.go b/images/oci/image.go index afcecdb8..cd13d545 100644 --- a/images/oci/image.go +++ b/images/oci/image.go @@ -12,14 +12,14 @@ type imageIndex struct { images.Index[imageEntry] } -// Paths are not stored; they are derived from digests and config at runtime. +// Paths derive from digests at runtime; not stored. type imageEntry struct { Ref string `json:"ref"` ManifestDigest images.Digest `json:"manifest_digest"` Layers []layerEntry `json:"layers"` - KernelLayer images.Digest `json:"kernel_layer"` // digest of layer containing vmlinuz - InitrdLayer images.Digest `json:"initrd_layer"` // digest of layer containing initrd.img - Size int64 `json:"size"` // total on-disk size of all artifacts + KernelLayer images.Digest `json:"kernel_layer"` + InitrdLayer images.Digest `json:"initrd_layer"` + Size int64 `json:"size"` CreatedAt time.Time `json:"created_at"` } @@ -27,13 +27,11 @@ type layerEntry struct { Digest images.Digest `json:"digest"` } -// Lookup finds an image entry by ref (exact or normalized) or manifest digest. -// Returns the ref key, entry, and whether it was found. +// Lookup finds an entry by ref (exact or normalized) or manifest digest. func (idx *imageIndex) Lookup(id string) (string, *imageEntry, bool) { if entry, ok := idx.Images[id]; ok && entry != nil { return id, entry, true } - // Try normalizing as an image reference (e.g., "ubuntu:24.04" -> "docker.io/library/ubuntu:24.04"). if parsed, err := name.ParseReference(id); err == nil { normalized := parsed.String() if entry, ok := idx.Images[normalized]; ok && entry != nil { @@ -48,8 +46,6 @@ func (idx *imageIndex) Lookup(id string) (string, *imageEntry, bool) { return "", nil, false } -// LookupRefs returns all ref keys matching id for DeleteByID. -// Delegates to shared images.LookupRefs with OCI reference normalization. func (idx *imageIndex) LookupRefs(id string) []string { return images.LookupRefs(idx.Images, id, func(s string) (string, bool) { parsed, err := name.ParseReference(s) @@ -60,16 +56,10 @@ func (idx *imageIndex) LookupRefs(id string) []string { }) } -// EntryID returns manifest digest as unique entry identifier. -func (e imageEntry) EntryID() string { return e.ManifestDigest.String() } - -// EntryRef returns image reference string. -func (e imageEntry) EntryRef() string { return e.Ref } - -// EntryCreatedAt returns when this entry was created. +func (e imageEntry) EntryID() string { return e.ManifestDigest.String() } +func (e imageEntry) EntryRef() string { return e.Ref } func (e imageEntry) EntryCreatedAt() time.Time { return e.CreatedAt } -// DigestHexes returns hex-encoded digests of all layers. func (e imageEntry) DigestHexes() []string { hexes := make([]string, len(e.Layers)) for i, l := range e.Layers { diff --git a/images/oci/oci.go b/images/oci/oci.go index efcf96a2..048c112a 100644 --- a/images/oci/oci.go +++ b/images/oci/oci.go @@ -25,7 +25,7 @@ const ( var _ images.Images = (*OCI)(nil) -// OCI implements the images.Images interface using OCI container images converted to EROFS filesystems for use with Cloud Hypervisor. +// OCI converts OCI container layers to EROFS for Cloud Hypervisor. type OCI struct { conf *Config store storage.Store[imageIndex] @@ -34,7 +34,6 @@ type OCI struct { ops images.Ops[imageIndex, imageEntry] } -// New creates a new OCI image backend. func New(ctx context.Context, conf *config.Config) (*OCI, error) { if conf == nil { return nil, fmt.Errorf("config is nil") @@ -62,10 +61,9 @@ func New(ctx context.Context, conf *config.Config) (*OCI, error) { return o, nil } -// Type returns the image backend identifier. func (o *OCI) Type() string { return typ } -// Pull downloads an OCI image from a container registry, extracts boot files (kernel, initrd), and converts each layer to EROFS concurrently. +// Pull downloads an OCI image, extracts boot files, and converts each layer to EROFS concurrently. func (o *OCI) Pull(ctx context.Context, image string, _ bool, tracker progress.Tracker) error { _, err, _ := o.pullGroup.Do(image, func() (any, error) { return nil, pull(ctx, o.conf, o.store, image, tracker) @@ -73,34 +71,30 @@ func (o *OCI) Pull(ctx context.Context, image string, _ bool, tracker progress.T return err } -// Import imports local tar files as an OCI image. -// Each tar file becomes one EROFS layer (ordered by the files slice). +// Import: each tar file becomes one EROFS layer in the order of the files slice. func (o *OCI) Import(ctx context.Context, name string, tracker progress.Tracker, file ...string) error { return importTarLayers(ctx, o.conf, o.store, name, tracker, file...) } -// ImportFromReader imports a single tar layer from a reader (stdin, gzip stream, etc.). func (o *OCI) ImportFromReader(ctx context.Context, name string, tracker progress.Tracker, r io.Reader) error { return importTarFromReader(ctx, o.conf, o.store, name, tracker, r) } -// Inspect returns the record for a single image. Returns (nil, nil) if not found. +// Inspect returns (nil, nil) if not found. func (o *OCI) Inspect(ctx context.Context, id string) (*types.Image, error) { return o.ops.Inspect(ctx, id) } -// List returns all locally stored images. func (o *OCI) List(ctx context.Context) ([]*types.Image, error) { return o.ops.List(ctx) } -// Delete removes images from the index. -// Returns the list of actually deleted refs. Images not found are logged and skipped. +// Delete returns actually-deleted refs; not-found ids are logged and skipped. func (o *OCI) Delete(ctx context.Context, ids []string) ([]string, error) { return o.ops.Delete(ctx, ids) } -// Config generates StorageConfig + BootConfig for the given VMs; paths derive from layer digests, refs are normalized, errors if a blob is missing. +// Config builds StorageConfig + BootConfig from layer digests; errors if any blob is missing. func (o *OCI) Config(ctx context.Context, vms []*types.VMConfig) (result [][]*types.StorageConfig, boot []*types.BootConfig, err error) { err = o.store.With(ctx, func(idx *imageIndex) error { result = make([][]*types.StorageConfig, len(vms)) diff --git a/lock/flock/flock.go b/lock/flock/flock.go index 8dd7d820..a6d55bac 100644 --- a/lock/flock/flock.go +++ b/lock/flock/flock.go @@ -21,12 +21,10 @@ type Lock struct { fl *flock.Flock // active flock fd, non-nil while held } -// New creates a Lock for the given path. func New(path string) *Lock { return &Lock{path: path, ch: make(chan struct{}, 1)} } -// Lock acquires the lock, blocking until available or ctx is canceled. func (l *Lock) Lock(ctx context.Context) error { select { case l.ch <- struct{}{}: @@ -45,8 +43,7 @@ func (l *Lock) Lock(ctx context.Context) error { return nil } -// TryLock attempts a non-blocking acquisition. -// Returns (false, nil) if the lock is currently held by another caller. +// TryLock returns (false, nil) if the lock is currently held by another caller. func (l *Lock) TryLock(_ context.Context) (bool, error) { select { case l.ch <- struct{}{}: @@ -58,7 +55,6 @@ func (l *Lock) TryLock(_ context.Context) (bool, error) { }) } -// Unlock releases the lock. func (l *Lock) Unlock(_ context.Context) error { var err error if l.fl != nil { diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go index 7214a751..27fa1e43 100644 --- a/network/bridge/bridge_linux.go +++ b/network/bridge/bridge_linux.go @@ -21,14 +21,14 @@ const typ = "bridge" var _ network.Network = (*Bridge)(nil) -// Bridge implements network.Network as TAP-on-bridge. Requires a pre-existing bridge with DHCP + routing (e.g. cocoon-net's cni0). +// Bridge is TAP-on-bridge; requires a pre-existing bridge with DHCP + routing. type Bridge struct { conf *config.Config bridgeDev string bridgeIdx int } -// New creates a Bridge network provider. The bridge device must exist. +// New: the bridge device must already exist. func New(conf *config.Config, bridgeDev string) (*Bridge, error) { if conf == nil { return nil, fmt.Errorf("config is nil") @@ -50,10 +50,8 @@ func New(conf *config.Config, bridgeDev string) (*Bridge, error) { }, nil } -// Type returns the provider identifier. func (b *Bridge) Type() string { return typ } -// Verify checks whether the TAP for a VM exists. func (b *Bridge) Verify(_ context.Context, vmID string) error { if _, err := netlink.LinkByName(tapName(vmID, 0)); err != nil { return fmt.Errorf("tap %s: %w", tapName(vmID, 0), err) @@ -61,12 +59,11 @@ func (b *Bridge) Verify(_ context.Context, vmID string) error { return nil } -// Prepare is a no-op for bridge mode. +// Prepare is a no-op (bridge has no netns). func (b *Bridge) Prepare(_ context.Context, _ string, _ *types.VMConfig) (string, error) { return "", nil } -// Add allocates TAP devices on the bridge for the given specs. func (b *Bridge) Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, specs ...network.AddSpec) (configs []*types.NetworkConfig, retErr error) { if len(specs) == 0 { return nil, nil @@ -131,33 +128,30 @@ func (b *Bridge) Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, sp return configs, nil } -// Remove deletes the TAP devices for the given indices. func (b *Bridge) Remove(_ context.Context, vmID string, indices ...int) error { return tearDownTAPs(vmID, indices, false) } -// Delete removes TAP devices for the given VMs. func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { return CleanupTAPs(vmIDs), nil } -// Inspect is not supported — bridge mode has no persistent records. +// Inspect: bridge has no persistent records. func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { return nil, nil } -// List is not supported — bridge mode has no persistent records. +// List: bridge has no persistent records. func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { return nil, nil } -// RegisterGC registers the bridge GC module that reclaims orphan bt* TAP devices. +// RegisterGC reclaims orphan bt* TAP devices. func (b *Bridge) RegisterGC(orch *gc.Orchestrator) { gc.Register(orch, GCModule(b.conf.RootDir)) } -// CleanupTAPs probes and removes bridge TAP devices for the given VM IDs. -// No-op per VM if none exist; safe without a Bridge instance. +// CleanupTAPs removes bridge TAP devices per VM ID; safe without a Bridge instance. func CleanupTAPs(vmIDs []string) []string { cleaned := make([]string, 0, len(vmIDs)) for _, vmID := range vmIDs { diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go index 5bdfc38e..c2b30287 100644 --- a/network/bridge/bridge_other.go +++ b/network/bridge/bridge_other.go @@ -1,5 +1,6 @@ //go:build !linux +// Package bridge: non-Linux stubs. All Bridge methods return errUnsupported; CleanupTAPs is a no-op. package bridge import ( @@ -15,46 +16,33 @@ import ( var errUnsupported = fmt.Errorf("bridge TAP networking requires Linux (running on %s)", runtime.GOOS) -// Bridge is a placeholder for non-Linux. type Bridge struct{} -// New returns an error on non-Linux. func New(_ *config.Config, _ string) (*Bridge, error) { return nil, errUnsupported } -// Type returns the provider identifier. -func (b *Bridge) Type() string { return "bridge" } - -// Verify is not supported. +func (b *Bridge) Type() string { return "bridge" } func (b *Bridge) Verify(_ context.Context, _ string) error { return errUnsupported } +func (b *Bridge) Remove(_ context.Context, _ string, _ ...int) error { + return errUnsupported +} +func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} -// Prepare is not supported. func (b *Bridge) Prepare(_ context.Context, _ string, _ *types.VMConfig) (string, error) { return "", errUnsupported } -// Add is not supported. func (b *Bridge) Add(_ context.Context, _ string, _ *types.VMConfig, _ ...network.AddSpec) ([]*types.NetworkConfig, error) { return nil, errUnsupported } -// Remove is not supported. -func (b *Bridge) Remove(_ context.Context, _ string, _ ...int) error { return errUnsupported } - -// Delete is not supported. func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } -// Inspect is not supported. func (b *Bridge) Inspect(_ context.Context, _ string) (*types.Network, error) { return nil, errUnsupported } -// List is not supported. func (b *Bridge) List(_ context.Context) ([]*types.Network, error) { return nil, errUnsupported } -// RegisterGC is a no-op. -func (b *Bridge) RegisterGC(_ *gc.Orchestrator) {} - -// CleanupTAPs is a no-op on non-Linux. func CleanupTAPs(_ []string) []string { return nil } diff --git a/network/cni/config.go b/network/cni/config.go index 8b528394..172f5136 100644 --- a/network/cni/config.go +++ b/network/cni/config.go @@ -9,41 +9,28 @@ import ( const ( netnsBasePath = "/var/run/netns" - // netnsPrefix prevents GC from deleting netns created by other tools - // (docker, containerd, etc.). Only netns matching this prefix are managed. + // netnsPrefix scopes GC to cocoon-owned netns (so docker/containerd entries survive). netnsPrefix = "cocoon-" ) -// Config holds CNI network provider specific configuration, embedding the global config. type Config struct { *config.Config } -// EnsureDirs creates all static directories required by the CNI network provider. func (c *Config) EnsureDirs() error { - return utils.EnsureDirs( - c.dbDir(), - ) + return utils.EnsureDirs(c.dbDir()) } -// IndexFile returns the path to the network index JSON file. func (c *Config) IndexFile() string { return filepath.Join(c.dbDir(), "networks.json") } - -// IndexLock returns the path to the network index lock file. func (c *Config) IndexLock() string { return filepath.Join(c.dbDir(), "networks.lock") } +func (c *Config) CacheDir() string { return filepath.Join(c.dir(), "cache") } +func (c *Config) dir() string { return filepath.Join(c.RootDir, "cni") } +func (c *Config) dbDir() string { return filepath.Join(c.dir(), "db") } -// CacheDir returns the CNI result cache directory path. -func (c *Config) CacheDir() string { return filepath.Join(c.dir(), "cache") } - -func (c *Config) dir() string { return filepath.Join(c.RootDir, "cni") } -func (c *Config) dbDir() string { return filepath.Join(c.dir(), "db") } - -// netnsPath returns the named netns path for a VM. func netnsPath(vmID string) string { return filepath.Join(netnsBasePath, netnsPrefix+vmID) } -// netnsName returns the named netns name (without path) for a VM. func netnsName(vmID string) string { return netnsPrefix + vmID } diff --git a/storage/json/json.go b/storage/json/json.go index 5dab72b1..5181d965 100644 --- a/storage/json/json.go +++ b/storage/json/json.go @@ -19,12 +19,11 @@ type Store[T any] struct { locker lock.Locker } -// New creates a JSON file-backed store with the given lock. func New[T any](filePath string, locker lock.Locker) *Store[T] { return &Store[T]{filePath: filePath, locker: locker} } -// ReadRaw loads the JSON file and passes the decoded data to fn without locking. +// ReadRaw loads the JSON file unlocked. func (s *Store[T]) ReadRaw(fn func(*T) error) error { data, err := s.load() if err != nil { @@ -33,7 +32,7 @@ func (s *Store[T]) ReadRaw(fn func(*T) error) error { return fn(data) } -// WriteRaw loads, mutates via fn, and atomically writes back without locking. +// WriteRaw loads, mutates, atomically writes back — unlocked. func (s *Store[T]) WriteRaw(fn func(*T) error) error { data, err := s.load() if err != nil { @@ -48,22 +47,20 @@ func (s *Store[T]) WriteRaw(fn func(*T) error) error { return nil } -// With acquires the lock, loads the data, and passes it to fn read-only. +// With runs fn read-only under the store lock. func (s *Store[T]) With(ctx context.Context, fn func(*T) error) error { return s.withLocked(ctx, func() error { return s.ReadRaw(fn) }) } -// Update acquires the lock, loads, mutates via fn, and atomically writes back. +// Update runs fn read-modify-write under the store lock. func (s *Store[T]) Update(ctx context.Context, fn func(*T) error) error { return s.withLocked(ctx, func() error { return s.WriteRaw(fn) }) } -// TryLock attempts to acquire the store lock without blocking. func (s *Store[T]) TryLock(ctx context.Context) (bool, error) { return s.locker.TryLock(ctx) } -// Unlock releases the store lock. func (s *Store[T]) Unlock(ctx context.Context) error { return s.locker.Unlock(ctx) } From b9ea174fc23a2bfa9169fe9bf5f68bbb5c518ca0 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 14:18:05 +0800 Subject: [PATCH 12/13] =?UTF-8?q?fix(metering):=20emit=20compute.stop=20on?= =?UTF-8?q?=20Error=E2=86=92Stopped=20(recovery=20stop)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UpdateStates(Stopped) now gates emit on hasOpenComputeInterval(r) rather than oldState==Running. After MarkError leaves an open interval, a later vm stop --force / rm --force succeeds → UpdateStates writes the StoppedAt sentinel and emits the missing compute.stop(stop-user). Idempotent Stopped→Stopped no longer overwrites StoppedAt (preserves first stop). Regression: TestStopAfterMarkErrorEmitsComputeStop covers Running→MarkError→Stopped. --- hypervisor/state.go | 9 +++------ hypervisor/state_test.go | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/hypervisor/state.go b/hypervisor/state.go index 84a209df..d98003b5 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -88,7 +88,7 @@ func (b *Backend) WithPausedVM(ctx context.Context, rec *VMRecord, pause, resume }) } -// UpdateStates batch-updates State + StartedAt/StoppedAt; only Running→Stopped emits compute.stop (Error paths can't prove the process is dead). +// UpdateStates batch-updates State; transitions to Stopped close the compute interval when one is open (covers Error→Stopped from rm --force or recovery stop). Error transitions leave StoppedAt nil because many MarkError paths can't prove the process is dead. func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VMState) error { if len(ids) == 0 { return nil @@ -101,19 +101,16 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM if r == nil { continue } - oldState := r.State r.State = state r.UpdatedAt = now switch state { case types.VMStateRunning: r.StartedAt = &now case types.VMStateStopped: - r.StoppedAt = &now - if oldState == types.VMStateRunning { + if hasOpenComputeInterval(r) { + r.StoppedAt = &now stopped = append(stopped, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopUser, shapeFromConfig(r.Config), now)) } - case types.VMStateError: - // Don't write StoppedAt — many MarkError paths can't prove the process is dead, so the compute interval stays open in the ledger until a confirmed-dead path (closeStaleComputeInterval / DeleteAll) closes it. } } return nil diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index 8082e33b..e77fca7a 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -189,6 +189,27 @@ func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { } } +func TestStopAfterMarkErrorEmitsComputeStop(t *testing.T) { + // Running→Error→Stopped: MarkError leaves the interval open; the recovery stop confirms the process is dead and must close it. + b, cap := newMeteringTestBackend(t) + ctx := t.Context() + seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) + b.MarkError(ctx, "vm1") + cap.Reset() + + if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { + t.Fatalf("UpdateStates(stopped): %v", err) + } + entries := cap.Entries() + if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopUser { + t.Fatalf("got %+v, want one compute.stop reason=user", entries) + } + loaded, _ := b.LoadRecord(ctx, "vm1") + if loaded.StoppedAt == nil { + t.Error("StoppedAt nil after Stopped transition") + } +} + func TestDeleteForceClosesIntervalAfterMarkError(t *testing.T) { // rm --force on an Error VM with a still-open interval must emit compute.stop, not just storage.stop. b, cap := newMeteringTestBackend(t) From 9f2f21ca3891ce9f668ed4bafdcb79f024c99650 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 20 May 2026 14:34:54 +0800 Subject: [PATCH 13/13] fix(metering): clock-safe sentinel + fail-closed restore-emit + cap rename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three concerns from the final audit round: 1. hasOpenComputeInterval was StartedAt > StoppedAt — NTP backward step could flip the comparison and miss-emit. Switch to the cleaner sentinel: StoppedAt == nil. Every transition into Running (UpdateStates, BatchMarkStarted, FinalizeRestore) now clears StoppedAt explicitly. 2. emitRestoreComputeStop wrote a ledger entry even when DB.Update failed or the record had vanished mid-flight. Now fail-closed on DB error, skip emit when the closure didn't actually close, and route through the new makeSourceEntry helper. closeStaleComputeInterval gets the same didClose gate. 3. cap (the builtin) was shadowed 20× across state_test.go, localfile_test.go, gc_test.go. Renamed to rec. Layout: hypervisor/backend.go now keeps Backend's Type() method adjacent to NewBackend instead of after the 7 Spec types. --- hypervisor/backend.go | 4 +- hypervisor/metering.go | 24 ++++---- hypervisor/restore.go | 29 +++++----- hypervisor/state.go | 17 +++--- hypervisor/state_test.go | 84 ++++++++++++++-------------- snapshot/localfile/gc_test.go | 12 ++-- snapshot/localfile/localfile_test.go | 20 +++---- 7 files changed, 96 insertions(+), 94 deletions(-) diff --git a/hypervisor/backend.go b/hypervisor/backend.go index c91d809c..f8573152 100644 --- a/hypervisor/backend.go +++ b/hypervisor/backend.go @@ -88,6 +88,8 @@ func NewBackend(typ string, conf BackendConfig, rec metering.Recorder) (*Backend }, nil } +func (b *Backend) Type() string { return b.Typ } + // LaunchSpec is the per-call input to Backend.LaunchVMProcess. type LaunchSpec struct { Cmd *exec.Cmd @@ -152,5 +154,3 @@ type SnapshotSpec struct { AfterCapture func(rec *VMRecord, tmpDir string) error BuildMeta func(rec *VMRecord, tmpDir string) (*SnapshotMeta, error) } - -func (b *Backend) Type() string { return b.Typ } diff --git a/hypervisor/metering.go b/hypervisor/metering.go index 80a32171..49ee0ed3 100644 --- a/hypervisor/metering.go +++ b/hypervisor/metering.go @@ -15,6 +15,13 @@ func (b *Backend) makeEntry(kind metering.Kind, vmID string, reason metering.Rea } } +// makeSourceEntry stamps an entry carrying SourceSnapshotID (clone/restore lineage). +func (b *Backend) makeSourceEntry(kind metering.Kind, vmID, sourceSnapshotID string, reason metering.Reason, shape metering.Shape, now time.Time) metering.Entry { + e := b.makeEntry(kind, vmID, reason, shape, now) + e.SourceSnapshotID = sourceSnapshotID + return e +} + func (b *Backend) emitAll(ctx context.Context, entries []metering.Entry) { for _, e := range entries { b.Metering.Emit(ctx, e) @@ -23,30 +30,19 @@ func (b *Backend) emitAll(ctx context.Context, entries []metering.Entry) { // emitOpenInterval fires the storage.start + compute.start pair; caller-provided now keeps adjacent stop/start timestamps aligned. func (b *Backend) emitOpenInterval(ctx context.Context, vm *types.VM, reason metering.Reason, sourceSnapshotID string, now time.Time) { - rec := b.Metering shape := shapeFromConfig(vm.Config) for _, kind := range []metering.Kind{metering.KindVMStorageStart, metering.KindVMComputeStart} { - rec.Emit(ctx, metering.Entry{ - Kind: kind, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, - Reason: reason, Hypervisor: b.Typ, Shape: shape, EmittedAt: now, - }) + b.Metering.Emit(ctx, b.makeSourceEntry(kind, vm.ID, sourceSnapshotID, reason, shape, now)) } } // emitDeleteClose fires storage.stop unconditionally; compute.stop only when an interval was open. func (b *Backend) emitDeleteClose(ctx context.Context, vmID string, shape metering.Shape, computeReason metering.Reason, hadRunningInterval bool) { now := time.Now() - rec := b.Metering if hadRunningInterval { - rec.Emit(ctx, metering.Entry{ - Kind: metering.KindVMComputeStop, VMID: vmID, Reason: computeReason, - Hypervisor: b.Typ, Shape: shape, EmittedAt: now, - }) + b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, vmID, computeReason, shape, now)) } - rec.Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStop, VMID: vmID, Reason: metering.ReasonVMRemove, - Hypervisor: b.Typ, Shape: shape, EmittedAt: now, - }) + b.Metering.Emit(ctx, b.makeEntry(metering.KindVMStorageStop, vmID, metering.ReasonVMRemove, shape, now)) } func shapeFromConfig(c types.VMConfig) metering.Shape { diff --git a/hypervisor/restore.go b/hypervisor/restore.go index 6c463a63..3645c226 100644 --- a/hypervisor/restore.go +++ b/hypervisor/restore.go @@ -50,6 +50,7 @@ func (b *Backend) FinalizeRestore(ctx context.Context, vmID string, vmCfg *types r.Config = *vmCfg r.State = types.VMStateRunning r.StartedAt = &now + r.StoppedAt = nil r.UpdatedAt = now return nil }); err != nil { @@ -157,32 +158,34 @@ func (b *Backend) DirectRestoreSequence(ctx context.Context, vmRef string, spec return result, nil } -// emitRestoreComputeStop closes the compute interval and flips State→Stopped so a later MarkError won't re-emit; storage stays open until vm rm. +// emitRestoreComputeStop closes the compute interval after a confirmed kill; fail-closed on DB error and skip on vanished record so the ledger never gets a phantom entry. func (b *Backend) emitRestoreComputeStop(ctx context.Context, vmID string, oldShape metering.Shape, sourceSnapshotID string) { now := time.Now() + closed := false if err := b.DB.Update(ctx, func(idx *VMIndex) error { - if r := idx.VMs[vmID]; r != nil { - r.State = types.VMStateStopped - r.StoppedAt = &now - r.UpdatedAt = now + r := idx.VMs[vmID] + if r == nil || !hasOpenComputeInterval(r) { + return nil } + r.State = types.VMStateStopped + r.StoppedAt = &now + r.UpdatedAt = now + closed = true return nil }); err != nil { log.WithFunc(b.Typ+".emitRestoreComputeStop").Warnf(ctx, "mark stopped after kill %s: %v", vmID, err) + return + } + if !closed { + return } - b.Metering.Emit(ctx, metering.Entry{ - Kind: metering.KindVMComputeStop, VMID: vmID, SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, - }) + b.Metering.Emit(ctx, b.makeSourceEntry(metering.KindVMComputeStop, vmID, sourceSnapshotID, metering.ReasonRestore, oldShape, now)) } // emitRestoreSuccess closes old storage and opens fresh storage+compute. func (b *Backend) emitRestoreSuccess(ctx context.Context, vm *types.VM, oldShape metering.Shape, sourceSnapshotID string) { now := time.Now() - b.Metering.Emit(ctx, metering.Entry{ - Kind: metering.KindVMStorageStop, VMID: vm.ID, SourceSnapshotID: sourceSnapshotID, - Reason: metering.ReasonRestore, Hypervisor: b.Typ, Shape: oldShape, EmittedAt: now, - }) + b.Metering.Emit(ctx, b.makeSourceEntry(metering.KindVMStorageStop, vm.ID, sourceSnapshotID, metering.ReasonRestore, oldShape, now)) b.emitOpenInterval(ctx, vm, metering.ReasonRestore, sourceSnapshotID, now) } diff --git a/hypervisor/state.go b/hypervisor/state.go index d98003b5..cbaf67c8 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -106,6 +106,7 @@ func (b *Backend) UpdateStates(ctx context.Context, ids []string, state types.VM switch state { case types.VMStateRunning: r.StartedAt = &now + r.StoppedAt = nil case types.VMStateStopped: if hasOpenComputeInterval(r) { r.StoppedAt = &now @@ -144,7 +145,6 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { shape := shapeFromConfig(r.Config) if hasOpenComputeInterval(r) { emits = append(emits, b.makeEntry(metering.KindVMComputeStop, id, metering.ReasonStopCrash, shape, now)) - r.StoppedAt = &now } reason := metering.ReasonBoot if r.FirstBooted { @@ -153,6 +153,7 @@ func (b *Backend) BatchMarkStarted(ctx context.Context, ids []string) error { emits = append(emits, b.makeEntry(metering.KindVMComputeStart, id, reason, shape, now)) r.State = types.VMStateRunning r.StartedAt = &now + r.StoppedAt = nil r.UpdatedAt = now r.FirstBooted = true } @@ -181,9 +182,10 @@ func (b *Backend) CleanStalePlaceholders(_ context.Context, ids []string) error }) } -// closeStaleComputeInterval emits stop-crash and writes StoppedAt; precondition: caller confirmed the process is dead. +// closeStaleComputeInterval emits stop-crash and writes StoppedAt; precondition: caller confirmed the process is dead. Self-healing if the record vanishes (concurrent rm) or was already closed: skip emit. func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord) { now := time.Now() + closed := false if err := b.DB.Update(ctx, func(idx *VMIndex) error { r := idx.VMs[rec.ID] if r == nil || !hasOpenComputeInterval(r) { @@ -194,18 +196,19 @@ func (b *Backend) closeStaleComputeInterval(ctx context.Context, rec *VMRecord) } r.StoppedAt = &now r.UpdatedAt = now + closed = true return nil }); err != nil { log.WithFunc(b.Typ+".closeStaleComputeInterval").Warnf(ctx, "close interval for %s: %v", rec.ID, err) return } + if !closed { + return + } b.Metering.Emit(ctx, b.makeEntry(metering.KindVMComputeStop, rec.ID, metering.ReasonStopCrash, shapeFromConfig(rec.Config), now)) } -// hasOpenComputeInterval reports whether the VM still has an unclosed compute interval in the ledger. +// hasOpenComputeInterval reports whether the VM's record shows an unmatched compute.start (StoppedAt is the ledger-close sentinel; transitions to Running clear it). func hasOpenComputeInterval(r *VMRecord) bool { - if r == nil || r.StartedAt == nil { - return false - } - return r.StoppedAt == nil || r.StartedAt.After(*r.StoppedAt) + return r != nil && r.StartedAt != nil && r.StoppedAt == nil } diff --git a/hypervisor/state_test.go b/hypervisor/state_test.go index e77fca7a..08c88e9e 100644 --- a/hypervisor/state_test.go +++ b/hypervisor/state_test.go @@ -46,14 +46,14 @@ func newMeteringTestBackend(t *testing.T) (*Backend, *metering.CaptureRecorder) dir := t.TempDir() locker := flock.New(filepath.Join(dir, "index.lock")) store := storejson.New[VMIndex](filepath.Join(dir, "index.json"), locker) - cap := &metering.CaptureRecorder{} + rec := &metering.CaptureRecorder{} return &Backend{ Typ: "test-hv", Conf: stubBackendConfig{}, DB: store, Locker: locker, - Metering: cap, - }, cap + Metering: rec, + }, rec } func seedVMRecord(t *testing.T, b *Backend, id string, cpu int, mem, storage int64, firstBooted bool) { @@ -74,14 +74,14 @@ func seedVMRecord(t *testing.T, b *Backend, id string, cpu int, mem, storage int } func TestBatchMarkStartedEmitsComputeStart(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 2, 4<<30, 10<<30, false) if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { t.Fatalf("BatchMarkStarted: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 { t.Fatalf("got %d entries, want 1", len(entries)) } @@ -101,42 +101,42 @@ func TestBatchMarkStartedEmitsComputeStart(t *testing.T) { } func TestBatchMarkStartedReasonRestartWhenAlreadyBooted(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) if err := b.BatchMarkStarted(ctx, []string{"vm1"}); err != nil { t.Fatalf("BatchMarkStarted: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Reason != metering.ReasonRestart { t.Errorf("got %+v, want one entry with reason restart", entries) } } func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 1, 1<<30, 10<<30, true) if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped from created): %v", err) } - if got := cap.Entries(); len(got) != 0 { + if got := rec.Entries(); len(got) != 0 { t.Errorf("Created→Stopped emitted %d; want 0", len(got)) } if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { t.Fatalf("UpdateStates(running): %v", err) } - if got := cap.Entries(); len(got) != 0 { + if got := rec.Entries(); len(got) != 0 { t.Errorf("Stopped→Running emitted %d; want 0", len(got)) } if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped): %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopUser { t.Fatalf("Running→Stopped: got %+v, want one compute.stop reason=user", entries) } @@ -144,18 +144,18 @@ func TestUpdateStatesEmitsOnlyOnRunningToStopped(t *testing.T) { if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { t.Fatalf("UpdateStates(running again): %v", err) } - cap.Reset() + rec.Reset() if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { t.Fatalf("UpdateStates(error): %v", err) } - if got := cap.Entries(); len(got) != 0 { + if got := rec.Entries(); len(got) != 0 { t.Errorf("Running→Error must not emit; got %d entries", len(got)) } } func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { // Running→Error must leave the interval open (UpdateStates(Error) doesn't write StoppedAt). The next PrepareStart confirms the process is dead and closes the interval. - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) dir := t.TempDir() @@ -168,7 +168,7 @@ func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { } b.MarkError(ctx, "vm1") - if got := cap.Entries(); len(got) != 0 { + if got := rec.Entries(); len(got) != 0 { t.Fatalf("MarkError emitted %d; want 0", len(got)) } loaded, _ := b.LoadRecord(ctx, "vm1") @@ -176,14 +176,14 @@ func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { t.Errorf("MarkError must not write StoppedAt; got %v", loaded.StoppedAt) } - rec, err := b.PrepareStart(ctx, "vm1", nil) + prep, err := b.PrepareStart(ctx, "vm1", nil) if err != nil { t.Fatalf("PrepareStart: %v", err) } - if rec == nil { + if prep == nil { t.Fatal("PrepareStart returned nil") } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopCrash { t.Fatalf("got %+v, want one compute.stop reason=stop-crash", entries) } @@ -191,16 +191,16 @@ func TestPrepareStartClosesIntervalAfterMarkError(t *testing.T) { func TestStopAfterMarkErrorEmitsComputeStop(t *testing.T) { // Running→Error→Stopped: MarkError leaves the interval open; the recovery stop confirms the process is dead and must close it. - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) b.MarkError(ctx, "vm1") - cap.Reset() + rec.Reset() if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateStopped); err != nil { t.Fatalf("UpdateStates(stopped): %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopUser { t.Fatalf("got %+v, want one compute.stop reason=user", entries) } @@ -212,11 +212,11 @@ func TestStopAfterMarkErrorEmitsComputeStop(t *testing.T) { func TestDeleteForceClosesIntervalAfterMarkError(t *testing.T) { // rm --force on an Error VM with a still-open interval must emit compute.stop, not just storage.stop. - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) b.MarkError(ctx, "vm1") - cap.Reset() + rec.Reset() loaded, _ := b.LoadRecord(ctx, "vm1") if !hasOpenComputeInterval(&loaded) { @@ -224,14 +224,14 @@ func TestDeleteForceClosesIntervalAfterMarkError(t *testing.T) { } b.emitDeleteClose(ctx, "vm1", shapeFromConfig(loaded.Config), metering.ReasonStopCrash, hasOpenComputeInterval(&loaded)) - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 2 { t.Fatalf("got %d entries, want 2 (compute.stop + storage.stop)", len(entries)) } } func TestPrepareStartClosesStaleInterval(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) dir := t.TempDir() @@ -242,16 +242,16 @@ func TestPrepareStartClosesStaleInterval(t *testing.T) { }); err != nil { t.Fatalf("set dirs: %v", err) } - cap.Reset() + rec.Reset() - rec, err := b.PrepareStart(ctx, "vm1", nil) + prep, err := b.PrepareStart(ctx, "vm1", nil) if err != nil { t.Fatalf("PrepareStart: %v", err) } - if rec == nil { + if prep == nil { t.Fatal("PrepareStart returned nil (treated as already-running)") } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Kind != metering.KindVMComputeStop || entries[0].Reason != metering.ReasonStopCrash { t.Fatalf("got %+v, want one compute.stop reason=stop-crash", entries) } @@ -268,7 +268,7 @@ func TestPrepareStartClosesStaleInterval(t *testing.T) { } func TestFinalizeCloneEmitsCloneEntries(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) @@ -281,7 +281,7 @@ func TestFinalizeCloneEmitsCloneEntries(t *testing.T) { if err := b.FinalizeClone(ctx, "vm1", info, nil, nil, "snap-source"); err != nil { t.Fatalf("FinalizeClone: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 2 { t.Fatalf("got %d entries, want 2 (storage.start + compute.start)", len(entries)) } @@ -312,7 +312,7 @@ func seedRunningVM(t *testing.T, b *Backend, id string, cpu int, mem, storage in } func TestDirectRestoreSequenceEmitsComputeStopThenTransition(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) @@ -332,7 +332,7 @@ func TestDirectRestoreSequenceEmitsComputeStopThenTransition(t *testing.T) { t.Fatalf("DirectRestoreSequence: %v", err) } - entries := cap.Entries() + entries := rec.Entries() // compute.stop on kill; storage.stop + storage.start + compute.start on success. if len(entries) != 4 { t.Fatalf("got %d entries, want 4", len(entries)) @@ -368,7 +368,7 @@ func TestDirectRestoreSequenceEmitsComputeStopThenTransition(t *testing.T) { func TestDirectRestoreSequenceEmitsOnlyComputeStopOnPopulateFailure(t *testing.T) { // Storage must stay open when restore fails after kill — the on-disk files // are still the old shape and vm rm will close it later with reason vm-rm. - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedRunningVM(t, b, "vm1", 2, 2<<30, 20<<30) @@ -387,7 +387,7 @@ func TestDirectRestoreSequenceEmitsOnlyComputeStopOnPopulateFailure(t *testing.T if _, err := b.DirectRestoreSequence(ctx, "vm1", spec); err == nil { t.Fatal("expected error from populate failure") } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 { t.Fatalf("got %d entries, want 1 (compute.stop only; storage stays open)", len(entries)) } @@ -404,7 +404,7 @@ func TestStartAllOnlyEmitsForActuallyLaunched(t *testing.T) { // - vm-stale: DB Running, process dead, relaunched → launched=true → emit // The bug being locked down: an earlier impl had BatchMarkStarted skip // anything with r.State==Running, which silently dropped vm-stale. - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm-stopped", 1, 1<<30, 10<<30, false) seedRunningVM(t, b, "vm-running", 1, 1<<30, 10<<30) @@ -428,7 +428,7 @@ func TestStartAllOnlyEmitsForActuallyLaunched(t *testing.T) { t.Errorf("succeeded %v, want 3", succeeded) } - entries := cap.Entries() + entries := rec.Entries() // vm-stopped → 1 entry (compute.start) // vm-running → 0 entries (no-op) // vm-stale → 2 entries (compute.stop reason=stop-crash + compute.start reason=restart) @@ -458,7 +458,7 @@ func TestStartAllOnlyEmitsForActuallyLaunched(t *testing.T) { } func TestFinalizeCreateEmitsStorageStart(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() // FinalizeCreate requires an existing placeholder. seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, false) @@ -471,7 +471,7 @@ func TestFinalizeCreateEmitsStorageStart(t *testing.T) { if err := b.FinalizeCreate(ctx, "vm1", info, nil, nil); err != nil { t.Fatalf("FinalizeCreate: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 { t.Fatalf("got %d entries, want 1", len(entries)) } @@ -485,7 +485,7 @@ func TestFinalizeCreateEmitsStorageStart(t *testing.T) { } func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) { - b, cap := newMeteringTestBackend(t) + b, rec := newMeteringTestBackend(t) ctx := t.Context() seedVMRecord(t, b, "vm1", 2, 2<<30, 20<<30, true) if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateRunning); err != nil { @@ -494,10 +494,10 @@ func TestDeleteAfterErrorEmitsOnlyStorageStop(t *testing.T) { if err := b.UpdateStates(ctx, []string{"vm1"}, types.VMStateError); err != nil { t.Fatalf("UpdateStates(error): %v", err) } - cap.Reset() + rec.Reset() b.emitDeleteClose(ctx, "vm1", metering.Shape{CPU: 2, MemBytes: 2 << 30, StorageBytes: 20 << 30}, metering.ReasonStopCrash, false) - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 || entries[0].Kind != metering.KindVMStorageStop { t.Fatalf("post-Error delete: got %+v, want one storage.stop", entries) } diff --git a/snapshot/localfile/gc_test.go b/snapshot/localfile/gc_test.go index 7c9c3378..f81da45e 100644 --- a/snapshot/localfile/gc_test.go +++ b/snapshot/localfile/gc_test.go @@ -367,8 +367,8 @@ func TestGCModule_EvictRealRecordEmitsSnapStorageStop(t *testing.T) { } } - cap := &metering.CaptureRecorder{} - mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}, cap) + rec := &metering.CaptureRecorder{} + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{Enabled: true}, rec) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatal(err) @@ -381,7 +381,7 @@ func TestGCModule_EvictRealRecordEmitsSnapStorageStop(t *testing.T) { t.Fatal(err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 2 { t.Fatalf("got %d entries, want 2 (one stop per evicted record)", len(entries)) } @@ -424,8 +424,8 @@ func TestGCModule_OrphanAndStalePendingDoNotEmit(t *testing.T) { t.Fatal(err) } - cap := &metering.CaptureRecorder{} - mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}, cap) + rec := &metering.CaptureRecorder{} + mod := gcModule(lf.conf, lf.store, lf.locker, EvictionPolicy{}, rec) snap, err := mod.ReadDB(ctx) if err != nil { t.Fatal(err) @@ -437,7 +437,7 @@ func TestGCModule_OrphanAndStalePendingDoNotEmit(t *testing.T) { if err := mod.Collect(ctx, ids, snap); err != nil { t.Fatal(err) } - if got := cap.Entries(); len(got) != 0 { + if got := rec.Entries(); len(got) != 0 { t.Errorf("got %d entries; orphan/stale-pending must not emit stop", len(got)) } } diff --git a/snapshot/localfile/localfile_test.go b/snapshot/localfile/localfile_test.go index 2e072198..f2ea4c58 100644 --- a/snapshot/localfile/localfile_test.go +++ b/snapshot/localfile/localfile_test.go @@ -90,8 +90,8 @@ func TestNew_NilConfig(t *testing.T) { // Create func TestCreateAndDeleteEmitMetering(t *testing.T) { - cap := &metering.CaptureRecorder{} - lf := newTestLFWithRecorder(t, cap) + rec := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, rec) ctx := t.Context() id, err := lf.Create(ctx, &types.SnapshotConfig{ @@ -103,7 +103,7 @@ func TestCreateAndDeleteEmitMetering(t *testing.T) { t.Fatalf("Create: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 { t.Fatalf("after Create: got %d entries, want 1", len(entries)) } @@ -115,7 +115,7 @@ func TestCreateAndDeleteEmitMetering(t *testing.T) { if _, err := lf.Delete(ctx, []string{id}); err != nil { t.Fatalf("Delete: %v", err) } - entries = cap.Entries() + entries = rec.Entries() if len(entries) != 2 { t.Fatalf("after Delete: got %d entries, want 2", len(entries)) } @@ -132,8 +132,8 @@ func TestDeleteOneIdempotentDoesNotEmitTwice(t *testing.T) { // phantom snap.storage.stop with an empty Hypervisor field. We exercise this // by calling deleteOne twice on the same id (idempotent), simulating the // loser running its loop body after the winner already committed. - cap := &metering.CaptureRecorder{} - lf := newTestLFWithRecorder(t, cap) + rec := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, rec) ctx := t.Context() id, err := lf.Create(ctx, &types.SnapshotConfig{ @@ -152,7 +152,7 @@ func TestDeleteOneIdempotentDoesNotEmitTwice(t *testing.T) { // Ledger should hold exactly 2 entries: Create's start and the FIRST // deleteOne's stop. The second call must not contribute a phantom event. - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 2 { t.Fatalf("got %d entries, want 2 (start + 1× stop); kinds = %v", len(entries), kinds(entries)) } @@ -176,8 +176,8 @@ func kinds(entries []metering.Entry) []metering.Kind { } func TestImportEmitsSnapStorageStart(t *testing.T) { - cap := &metering.CaptureRecorder{} - lf := newTestLFWithRecorder(t, cap) + rec := &metering.CaptureRecorder{} + lf := newTestLFWithRecorder(t, rec) ctx := t.Context() envelope, err := snapshot.MarshalEnvelope(types.SnapshotConfig{ @@ -198,7 +198,7 @@ func TestImportEmitsSnapStorageStart(t *testing.T) { t.Fatalf("Import: %v", err) } - entries := cap.Entries() + entries := rec.Entries() if len(entries) != 1 { t.Fatalf("got %d entries, want 1 (snap.storage.start)", len(entries)) }