From c45516252ffa5be085f6f0ddd1f72b76b5fb567e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 20:23:57 +0000 Subject: [PATCH] feat(node): periodic Reaper to GC orphaned VM resources An unclean VM death (host OOM-kill, `:erlang.halt`, a crash before the FireVMM relaunch-time cleanup runs) can strand a firecracker cgroup leaf and a `hyper-rw-*` thin volume whose vm_id never reboots, so nothing ever clears them. Add `Hyper.Node.Reaper`, a periodic liveness-aware GC started last under `Hyper.Node`. Each tick it lists the cgroup leaves under the parent cgroup and the `hyper-rw-*` dm volumes, subtracts the VMs this node still runs (via the VM supervisor and the routing CRDT), and -- only after a leaf shows up orphaned on two consecutive ticks (`Plan.confirm/2`, so a VM mid-boot is never reaped) -- removes the chroot/cgroup via the helper and the dm volume. Entirely best-effort: every failure is logged and the next tick retries. The decision logic is the pure `Hyper.Node.Reaper.Plan` (rw_ids/orphans/confirm), covered by example + property tests; the GenServer is a thin shell around it. The tick interval is a fixed constant -- the two-strike confirmation already makes the exact value non-load-bearing, so nothing here is configurable. Also corrects `Jailer.cgroup_dir/1` to `/` (no `` level): the jailer (cgroup v2, `--parent-cgroup`) places firecracker directly under the parent cgroup, confirmed via `/proc//cgroup`. This also fixes the existing FireVMM teardown (`daemon.ex`), which removes the same leaf, and is what lets the Reaper enumerate leaves by listing the parent dir. Extracted from the VM-boot branch (chore/get-a-vm-running). --- lib/hyper/node.ex | 10 +- lib/hyper/node/fire_vmm/jailer.ex | 22 ++- lib/hyper/node/reaper.ex | 164 ++++++++++++++++++ lib/hyper/node/reaper/plan.ex | 32 ++++ .../node/reaper/plan_properties_test.exs | 78 +++++++++ test/hyper/node/reaper/plan_test.exs | 65 +++++++ 6 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 lib/hyper/node/reaper.ex create mode 100644 lib/hyper/node/reaper/plan.ex create mode 100644 test/hyper/node/reaper/plan_properties_test.exs create mode 100644 test/hyper/node/reaper/plan_test.exs diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 58b7c1e5..49c12cd7 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -26,6 +26,11 @@ defmodule Hyper.Node do real-time monitors backing the soft budget (`Hyper.Node.Budget.Soft`). Lives here, not at the application root, because both are per-node and only meaningful while this node hosts VMs. + + * `Hyper.Node.Reaper` - a periodic, liveness-aware GC for per-VM host + resources (orphaned firecracker cgroups and `hyper-rw-*` dm volumes) stranded + by an unclean death whose vm_id never reboots. Started last so the VM + supervisor it consults for liveness is already up. """ use Supervisor @@ -61,7 +66,10 @@ defmodule Hyper.Node do Hyper.Node.Layer, Hyper.Node.Budget.Supervisor, {DynamicSupervisor, name: @vm_sup, strategy: :one_for_one}, - Hyper.Node.Img + Hyper.Node.Img, + # Last child: :one_for_one starts in order, so the VM supervisor and Img are + # up before the reaper's first tick can read their liveness. + Hyper.Node.Reaper ] Supervisor.init(children, strategy: :one_for_one) diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex index ace7a553..c9df6247 100644 --- a/lib/hyper/node/fire_vmm/jailer.ex +++ b/lib/hyper/node/fire_vmm/jailer.ex @@ -137,13 +137,29 @@ defmodule Hyper.Node.FireVMM.Jailer do end @doc """ - Host path of the VM's cgroup leaf (`/sys/fs/cgroup///`), the - cgroup the jailer creates for firecracker. Reconstructed (the jailer owns its + Host path of the cgroup dir holding every VM's leaf (`/sys/fs/cgroup/`). + Its immediate subdir names are vm_ids - the leaves the jailer creates for + firecracker - so listing it (directories only; the dir also holds cgroup control + files) enumerates the cgroup leaves on this host. + + Note the cgroup hierarchy has NO `` level: the jailer (cgroup v2, + `--parent-cgroup `) places firecracker directly at `/`, + unlike the chroot (`//`). Confirmed via + `/proc//cgroup` = `0:://`. + """ + @spec cgroup_parent_dir() :: Path.t() + def cgroup_parent_dir do + Path.join("/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup()) + end + + @doc """ + Host path of the VM's cgroup leaf (`/sys/fs/cgroup//`), the cgroup + the jailer creates for firecracker. Reconstructed (the jailer owns its placement) so a relaunch can clear the stale leaf left by a prior incarnation. """ @spec cgroup_dir(Hyper.Vm.Id.t()) :: Path.t() def cgroup_dir(id) do - Path.join(["/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup(), exec_name(), id]) + Path.join(cgroup_parent_dir(), id) end @doc """ diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex new file mode 100644 index 00000000..4970f426 --- /dev/null +++ b/lib/hyper/node/reaper.ex @@ -0,0 +1,164 @@ +defmodule Hyper.Node.Reaper do + @moduledoc """ + Per-node periodic, liveness-aware garbage collector for per-VM host resources + that an unclean BEAM death can strand: a firecracker cgroup leaf and a + `hyper-rw-` dm volume whose owning processes' `terminate/2` never ran and + whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and + the relaunch-time cleanup in the FireVMM path, never get a chance to clear it). + + Liveness is the whole point. The reaper consults two independent sources of + truth for "this vm is alive" (`Plan.orphans/3` removes their union from the + candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup + leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A + candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it + is reaped, so a VM caught mid-boot (resources present, not yet registered) is + given a grace tick rather than destroyed. + + The decision logic lives in the pure `Hyper.Node.Reaper.Plan`; this module is a + thin I/O adapter that gathers the inputs, calls the plan, and executes the + best-effort, idempotent removals. + """ + + use GenServer + use OpenTelemetryDecorator + require Logger + + alias Hyper.Cluster.Routing + alias Hyper.Node.FireVMM.Jailer + alias Hyper.Node.Img + alias Hyper.Node.Reaper.Plan + alias Hyper.SuidHelper.{ChrootJail, Dmsetup} + + @vm_sup Hyper.Node.VMSupervisor + + # Rest between reap ticks. Deliberately not configurable: the two-strike + # confirmation (`Plan.confirm/2`) already means an orphan is reaped at most one + # interval after it is first seen, so the exact value is not load-bearing. + @interval Unit.Time.s(60) + + defstruct last_orphans: MapSet.new() + + @type t :: %__MODULE__{last_orphans: MapSet.t(String.t())} + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts, name: __MODULE__) + end + + @impl true + def init(_opts) do + schedule() + {:ok, %__MODULE__{}} + end + + @impl true + def handle_info(:tick, state) do + state = sweep(state) + schedule() + {:noreply, state} + end + + # Ignore any unexpected message (port noise, stale timers) rather than crashing. + def handle_info(_msg, state), do: {:noreply, state} + + @spec schedule() :: reference() + defp schedule do + Process.send_after(self(), :tick, Unit.Time.as_ms(@interval)) + end + + @spec sweep(t()) :: t() + @decorate with_span("Hyper.Node.Reaper.sweep") + defp sweep(%__MODULE__{} = state) do + live = gather_live() + leaves = list_cgroup_leaves() + rw = Plan.rw_ids(list_rw_dm()) + + current = Plan.orphans(live, leaves, rw) + {to_reap, next} = Plan.confirm(current, state.last_orphans) + + Enum.each(to_reap, &reap_one/1) + %{state | last_orphans: next} + end + + # Over-counting "live" only defers a reap (safe); under-counting destroys a live + # VM (catastrophic). So union two independent liveness sources: the local VM + # supervisor's children and the cluster routing table's view of this node. + @spec gather_live() :: MapSet.t(String.t()) + defp gather_live, do: MapSet.union(local_live(), routed_live()) + + @spec local_live() :: MapSet.t(String.t()) + defp local_live do + case Process.whereis(@vm_sup) do + nil -> + MapSet.new() + + _pid -> + @vm_sup + |> DynamicSupervisor.which_children() + |> Enum.map(fn {_, child, _, _} -> child end) + |> Enum.filter(&is_pid/1) + |> Enum.map(&Routing.id_for/1) + |> Enum.reject(&is_nil/1) + |> MapSet.new() + end + end + + @spec routed_live() :: MapSet.t(String.t()) + defp routed_live do + for {id, node} <- Routing.all(), node == node(), into: MapSet.new(), do: id + end + + @spec list_cgroup_leaves() :: [String.t()] + defp list_cgroup_leaves do + parent = Jailer.cgroup_parent_dir() + + case File.ls(parent) do + {:ok, names} -> + # The parent holds the per-VM leaf directories alongside cgroup control + # files (`cgroup.procs`, `cgroup.controllers`, ...); only the directories + # are vm_id leaves. + Enum.filter(names, &File.dir?(Path.join(parent, &1))) + + {:error, :enoent} -> + [] + + {:error, reason} -> + Logger.warning("reaper: could not list cgroup leaves: #{inspect(reason)}") + [] + end + end + + @spec list_rw_dm() :: [String.t()] + defp list_rw_dm do + case Dmsetup.list() do + {:ok, names} -> + names + + {:error, reason} -> + Logger.warning("reaper: could not list dm devices: #{inspect(reason)}") + [] + end + end + + @spec reap_one(String.t()) :: :ok + @decorate with_span("Hyper.Node.Reaper.reap_one", include: [:id]) + defp reap_one(id) do + Logger.warning("reaper: reaping orphan vm #{id}") + + log_result( + "chroot/cgroup", + id, + ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) + ) + + log_result("dm volume", id, Dmsetup.remove(Img.Mutable.dm_name(id))) + :ok + end + + @spec log_result(String.t(), String.t(), :ok | {:error, term()}) :: :ok + defp log_result(_what, _id, :ok), do: :ok + + defp log_result(what, id, {:error, reason}) do + Logger.warning("reaper: removing #{what} for #{id} failed: #{inspect(reason)}") + end +end diff --git a/lib/hyper/node/reaper/plan.ex b/lib/hyper/node/reaper/plan.ex new file mode 100644 index 00000000..17b6dde9 --- /dev/null +++ b/lib/hyper/node/reaper/plan.ex @@ -0,0 +1,32 @@ +defmodule Hyper.Node.Reaper.Plan do + @moduledoc """ + Pure reap-decision core for `Hyper.Node.Reaper`. No I/O. Every safety invariant + is a property of these functions: a live vm_id is never a candidate, only an + orphan seen on two consecutive ticks is reaped, and only `hyper-rw-*` dm names + yield candidates (so `hyper-thinpool` / `hyper-img-*` can never be reaped). + """ + + @rw_prefix "hyper-rw-" + + @doc "vm_ids of orphan rw-volumes from raw `dmsetup ls` names (only `hyper-rw-*`)." + @spec rw_ids([String.t()]) :: [String.t()] + def rw_ids(dm_names) do + for name <- dm_names, + String.starts_with?(name, @rw_prefix), + do: String.replace_prefix(name, @rw_prefix, "") + end + + @doc "Candidate orphans this tick: (cgroup leaves ∪ rw vm_ids) minus the live set." + @spec orphans(MapSet.t(String.t()), [String.t()], [String.t()]) :: MapSet.t(String.t()) + def orphans(live, cgroup_leaves, rw_ids) do + cgroup_leaves + |> MapSet.new() + |> MapSet.union(MapSet.new(rw_ids)) + |> MapSet.difference(live) + end + + @doc "Two-strike grace: reap only ids that were also orphans last tick. Returns {to_reap, next_last}." + @spec confirm(MapSet.t(String.t()), MapSet.t(String.t())) :: + {MapSet.t(String.t()), MapSet.t(String.t())} + def confirm(current, last), do: {MapSet.intersection(current, last), current} +end diff --git a/test/hyper/node/reaper/plan_properties_test.exs b/test/hyper/node/reaper/plan_properties_test.exs new file mode 100644 index 00000000..6720a608 --- /dev/null +++ b/test/hyper/node/reaper/plan_properties_test.exs @@ -0,0 +1,78 @@ +defmodule Hyper.Node.Reaper.PlanPropertiesTest do + @moduledoc """ + Safety laws every `Hyper.Node.Reaper.Plan` decision must obey, generated over a + small shared id alphabet so live / cgroup-leaf / rw sets actually overlap: + + * a live vm_id is NEVER a reap candidate (the union of liveness sources is + removed from the orphan set); + * only an orphan seen on two consecutive ticks is reaped (`confirm/2` reaps a + subset of both current and last, and carries `current` forward); + * `hyper-thinpool` / `hyper-img-*` / any non-`hyper-rw-*` name never yields a + candidate, and `Mutable.dm_name(id)` round-trips back to exactly `id` (so a + future id-scheme change that breaks the strip fails loudly here). + """ + use ExUnit.Case, async: true + use ExUnitProperties + + alias Hyper.Node.Img.Mutable + alias Hyper.Node.Reaper.Plan + + # A deliberately tiny alphabet so generated live/leaf/rw id sets collide often, + # exercising the difference and intersection rather than always being disjoint. + defp id, do: member_of(~w(a b c d e)) + + defp id_set, do: map(list_of(id()), &MapSet.new/1) + + defp id_list, do: list_of(id()) + + property "a live vm_id is never a reap candidate" do + check all( + live <- id_set(), + leaves <- id_list(), + rw <- id_list() + ) do + orphans = Plan.orphans(live, leaves, rw) + assert MapSet.disjoint?(orphans, live) + end + end + + property "only twice-seen orphans are reaped; current is carried forward" do + check all( + current <- id_set(), + last <- id_set() + ) do + {reap, next} = Plan.confirm(current, last) + + assert MapSet.subset?(reap, current) + assert MapSet.subset?(reap, last) + assert next == current + end + end + + property "rw_ids excludes thinpool, img, and non-rw junk" do + safe_dm = + member_of([ + "hyper-thinpool", + "hyper-img-abc-0", + "hyper-img-deadbeef-3", + "unrelated-device", + "cryptroot" + ]) + + check all(names <- list_of(safe_dm)) do + assert Plan.rw_ids(names) == [] + end + end + + property "Mutable.dm_name/1 round-trips through rw_ids for a real vm_id" do + check all( + id <- + map( + binary(min_length: 1, max_length: 16), + &("v" <> Base.encode32(&1, padding: false, case: :lower)) + ) + ) do + assert Plan.rw_ids([Mutable.dm_name(id)]) == [id] + end + end +end diff --git a/test/hyper/node/reaper/plan_test.exs b/test/hyper/node/reaper/plan_test.exs new file mode 100644 index 00000000..0021db98 --- /dev/null +++ b/test/hyper/node/reaper/plan_test.exs @@ -0,0 +1,65 @@ +defmodule Hyper.Node.Reaper.PlanTest do + use ExUnit.Case, async: true + + alias Hyper.Node.Reaper.Plan + + defp set(ids), do: MapSet.new(ids) + + describe "orphans/3" do + test "a cgroup-leaf-only orphan is a candidate" do + assert Plan.orphans(set([]), ["dead"], []) == set(["dead"]) + end + + test "a dm-only orphan is a candidate" do + assert Plan.orphans(set([]), [], ["dead"]) == set(["dead"]) + end + + test "an id seen in both sources is a single candidate" do + assert Plan.orphans(set([]), ["dead"], ["dead"]) == set(["dead"]) + end + + test "an id present in live is never a candidate, even if it also has resources" do + assert Plan.orphans(set(["alive"]), ["alive"], ["alive"]) == set([]) + end + + test "only the non-live ids survive as candidates" do + assert Plan.orphans(set(["alive"]), ["alive", "dead"], ["alive", "gone"]) == + set(["dead", "gone"]) + end + end + + describe "confirm/2 two-strike grace" do + test "first tick reaps nothing (last is empty) but remembers the candidates" do + current = set(["x", "y"]) + {reap, next} = Plan.confirm(current, set([])) + + assert reap == set([]) + assert next == current + end + + test "second tick reaps the still-orphan ids" do + {_, last} = Plan.confirm(set(["x", "y"]), set([])) + {reap, _next} = Plan.confirm(set(["x", "y"]), last) + + assert reap == set(["x", "y"]) + end + + test "an id orphaned tick1 but live/absent tick2 is not reaped" do + {_, last} = Plan.confirm(set(["x"]), set([])) + {reap, _next} = Plan.confirm(set([]), last) + + assert reap == set([]) + end + + test "an id new in tick2 is not reaped (only one strike)" do + {_, last} = Plan.confirm(set(["x"]), set([])) + {reap, _next} = Plan.confirm(set(["x", "fresh"]), last) + + assert reap == set(["x"]) + end + end + + test "rw_ids/1 strips the hyper-rw- prefix and ignores thinpool/img names" do + assert Plan.rw_ids(["hyper-thinpool", "hyper-img-abc-0", "hyper-rw-vabc"]) == ["vabc"] + end +end