From a3c7757a4a130ffaa94b5805d7d94970ec0ebad0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 22:37:18 +0000 Subject: [PATCH 1/6] fix(img): refcounted idle-reaping servers must be restart: :temporary A :permanent DynamicSupervisor child is restarted on its intentional idle {:stop, :normal}; init/1 then re-creates the dm resource terminate/2 just destroyed, leaking and resurrecting hyper-rw- volumes and image/layer dm chains. Make Img.Mutable, Img.Server and Layer.Server :temporary so idle-teardown is final. --- lib/hyper/node/img/mutable.ex | 2 +- lib/hyper/node/img/server.ex | 2 +- lib/hyper/node/layer/server.ex | 2 +- test/hyper/node/refcounted_restart_test.exs | 23 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 test/hyper/node/refcounted_restart_test.exs diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index c1c8445..43244ff 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -16,7 +16,7 @@ defmodule Hyper.Node.Img.Mutable do down the RO chain in turn). """ - use GenServer + use GenServer, restart: :temporary alias Hyper.Node.Img alias Hyper.Node.Img.{Server, ThinPool} diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex index 9975a7d..8d4f5aa 100644 --- a/lib/hyper/node/img/server.ex +++ b/lib/hyper/node/img/server.ex @@ -12,7 +12,7 @@ defmodule Hyper.Node.Img.Server do and releasing its layers (which then unmount once nothing else holds them). """ - use GenServer + use GenServer, restart: :temporary require Logger alias Hyper.Img.Db diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex index 05fece6..0ff7c1b 100644 --- a/lib/hyper/node/layer/server.ex +++ b/lib/hyper/node/layer/server.ex @@ -9,7 +9,7 @@ defmodule Hyper.Node.Layer.Server do grace period keeps bursty acquire/release cycles from thrashing the mount. """ - use GenServer + use GenServer, restart: :temporary require Logger alias Hyper.Node.Layer diff --git a/test/hyper/node/refcounted_restart_test.exs b/test/hyper/node/refcounted_restart_test.exs new file mode 100644 index 0000000..6072a06 --- /dev/null +++ b/test/hyper/node/refcounted_restart_test.exs @@ -0,0 +1,23 @@ +defmodule Hyper.Node.RefcountedRestartTest do + use ExUnit.Case, async: true + + # These three servers are monitor-refcounted and idle-reap: when their last + # holder goes away they self-terminate with `{:stop, :normal}`, destroying an + # external device-mapper resource in `terminate/2`. Under a DynamicSupervisor a + # `:permanent` child is RESTARTED on that intentional `:normal` exit, and its + # `init/1` re-creates the resource it just tore down -- an endless resurrection + # loop that leaks dm devices. They MUST be `:temporary` so idle-teardown is + # final. This pins that invariant for every server in the refcount tier and for + # any new one added later. + @idle_reaping_servers [ + Hyper.Node.Img.Mutable, + Hyper.Node.Img.Server, + Hyper.Node.Layer.Server + ] + + for mod <- @idle_reaping_servers do + test "#{inspect(mod)} is restart: :temporary so its idle :stop is not undone" do + assert unquote(mod).child_spec([]).restart == :temporary + end + end +end From 19dd8eaac9e3439ca82761d272b3855feac4384d Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 22:49:01 +0000 Subject: [PATCH 2/6] feat(img): register mutable layers by vm_id; expose active_vm_ids/0 Adds a per-node Registry (Hyper.Node.Img.MutableRegistry) keyed by vm_id and names each Img.Mutable through it, giving a one-mutable-per-vm invariant and a cheap, non-blocking list of live mutable vm_ids for the Reaper to consult. --- lib/hyper/node/img.ex | 5 +++ lib/hyper/node/img/mutable.ex | 11 ++++- test/hyper/node/img/mutable_test.exs | 66 ++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 test/hyper/node/img/mutable_test.exs diff --git a/lib/hyper/node/img.ex b/lib/hyper/node/img.ex index 53bf2ae..694719a 100644 --- a/lib/hyper/node/img.ex +++ b/lib/hyper/node/img.ex @@ -20,6 +20,7 @@ defmodule Hyper.Node.Img do alias Hyper.Node.Img.ThinPool @registry Hyper.Node.Img.Registry + @mutable_registry Hyper.Node.Img.MutableRegistry @server_sup Hyper.Node.Img.Supervisor @mutable_sup Hyper.Node.Img.MutableSupervisor @@ -31,6 +32,7 @@ defmodule Hyper.Node.Img do def init(_opts) do children = [ {Registry, keys: :unique, name: @registry}, + {Registry, keys: :unique, name: @mutable_registry}, ThinPool, {DynamicSupervisor, strategy: :one_for_one, name: @server_sup}, {DynamicSupervisor, strategy: :one_for_one, name: @mutable_sup} @@ -42,6 +44,9 @@ defmodule Hyper.Node.Img do @doc false def registry, do: @registry + @doc false + def mutable_registry, do: @mutable_registry + @doc "Activate `img_id` on this node: start (or reuse) its image server." @spec activate(Hyper.Img.id()) :: {:ok, pid()} | {:error, term()} def activate(img_id) do diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index 43244ff..15d4a16 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -47,7 +47,8 @@ defmodule Hyper.Node.Img.Mutable do end @spec start_link(Opts.t()) :: GenServer.on_start() - def start_link(%Opts{} = opts), do: GenServer.start_link(__MODULE__, opts) + def start_link(%Opts{vm_id: vm_id} = opts), + do: GenServer.start_link(__MODULE__, opts, name: via(vm_id)) @spec blk_path(GenServer.server()) :: Path.t() def blk_path(server), do: GenServer.call(server, :blk_path) @@ -146,6 +147,14 @@ defmodule Hyper.Node.Img.Mutable do @spec dm_name(Hyper.Vm.Id.t()) :: String.t() def dm_name(vm_id), do: "hyper-rw-#{sanitize(vm_id)}" + @doc "vm_ids of every live mutable layer on this node." + @spec active_vm_ids() :: [Hyper.Vm.Id.t()] + def active_vm_ids do + Registry.select(Img.mutable_registry(), [{{:"$1", :_, :_}, [], [:"$1"]}]) + end + + defp via(vm_id), do: {:via, Registry, {Img.mutable_registry(), vm_id}} + defp sanitize(id), do: String.replace(id, ~r/[^A-Za-z0-9._-]/, "_") @spec drop(State.t(), pid()) :: State.t() diff --git a/test/hyper/node/img/mutable_test.exs b/test/hyper/node/img/mutable_test.exs new file mode 100644 index 0000000..d7ec4ea --- /dev/null +++ b/test/hyper/node/img/mutable_test.exs @@ -0,0 +1,66 @@ +defmodule Hyper.Node.Img.MutableTest do + use ExUnit.Case, async: false + + alias Hyper.Node.Img + alias Hyper.Node.Img.Mutable + + setup do + # The full app does not start in the test env, so stand up just the registry + # the mutable layers register into. Reuse the real name so we exercise the + # exact lookup `active_vm_ids/0` performs. + name = Img.mutable_registry() + + unless Process.whereis(name) do + start_supervised!({Registry, keys: :unique, name: name}) + end + + :ok + end + + defp register_live(vm_id) do + test = self() + + {:ok, pid} = + Task.start_link(fn -> + {:ok, _} = Registry.register(Img.mutable_registry(), vm_id, nil) + send(test, {:registered, vm_id}) + Process.sleep(:infinity) + end) + + assert_receive {:registered, ^vm_id} + pid + end + + test "active_vm_ids lists the vm_ids of every live mutable layer" do + register_live("vm-a") + register_live("vm-b") + + assert Enum.sort(Mutable.active_vm_ids()) == ["vm-a", "vm-b"] + end + + test "active_vm_ids drops a vm_id once its mutable layer dies" do + pid = register_live("vm-gone") + ref = Process.monitor(pid) + Process.unlink(pid) + Process.exit(pid, :kill) + assert_receive {:DOWN, ^ref, :process, ^pid, _} + + # Registry removes the entry on the registered process's :DOWN. Poll briefly + # so the test is robust to that async unregister without a fixed sleep. + assert eventually(fn -> Mutable.active_vm_ids() == [] end) + end + + defp eventually(fun, attempts \\ 50) do + cond do + fun.() -> + true + + attempts == 0 -> + false + + true -> + Process.sleep(2) + eventually(fun, attempts - 1) + end + end +end From b26f66d81b03858221ca8722d5fb7a3af0b79d4a Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 22:54:48 +0000 Subject: [PATCH 3/6] fix(img): add @spec to mutable_registry/0 --- lib/hyper/node/img.ex | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/hyper/node/img.ex b/lib/hyper/node/img.ex index 694719a..db19e99 100644 --- a/lib/hyper/node/img.ex +++ b/lib/hyper/node/img.ex @@ -45,6 +45,7 @@ defmodule Hyper.Node.Img do def registry, do: @registry @doc false + @spec mutable_registry() :: atom() def mutable_registry, do: @mutable_registry @doc "Activate `img_id` on this node: start (or reuse) its image server." From 2a13f3051c6e59ad94d3b4c418fb9d841899bac7 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 22:57:41 +0000 Subject: [PATCH 4/6] feat(reaper): treat live mutable layers as live when reaping orphans The reaper's liveness set was VMSupervisor children union cluster Routing, which omits a vm whose mutable layer is alive but whose VM is mid-boot (not yet routed) or in its post-stop idle grace. Union Img.Mutable.active_vm_ids/0 so the reaper never removes a hyper-rw volume that a live owner still holds. --- lib/hyper/node/reaper.ex | 15 +++++++-- test/hyper/node/reaper/liveness_test.exs | 40 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 test/hyper/node/reaper/liveness_test.exs diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex index 4970f42..75ae021 100644 --- a/lib/hyper/node/reaper.ex +++ b/lib/hyper/node/reaper.ex @@ -81,10 +81,19 @@ defmodule Hyper.Node.Reaper do end # Over-counting "live" only defers a reap (safe); under-counting destroys a live - # VM (catastrophic). So union two independent liveness sources: the local VM - # supervisor's children and the cluster routing table's view of this node. + # VM (catastrophic). Union three independent liveness sources: the local VM + # supervisor's children, the cluster routing table's view of this node, and the + # live mutable layers (which own the hyper-rw volumes and exist during the + # mid-boot and idle-grace windows when a vm is in neither of the other two). @spec gather_live() :: MapSet.t(String.t()) - defp gather_live, do: MapSet.union(local_live(), routed_live()) + defp gather_live do + local_live() + |> MapSet.union(routed_live()) + |> MapSet.union(mutable_live()) + end + + @spec mutable_live() :: MapSet.t(String.t()) + defp mutable_live, do: MapSet.new(Img.Mutable.active_vm_ids()) @spec local_live() :: MapSet.t(String.t()) defp local_live do diff --git a/test/hyper/node/reaper/liveness_test.exs b/test/hyper/node/reaper/liveness_test.exs new file mode 100644 index 0000000..7033e0e --- /dev/null +++ b/test/hyper/node/reaper/liveness_test.exs @@ -0,0 +1,40 @@ +defmodule Hyper.Node.Reaper.LivenessTest do + use ExUnit.Case, async: false + + alias Hyper.Node.Img + alias Hyper.Node.Img.Mutable + alias Hyper.Node.Reaper.Plan + + setup do + name = Img.mutable_registry() + + unless Process.whereis(name) do + start_supervised!({Registry, keys: :unique, name: name}) + end + + :ok + end + + defp register_live(vm_id) do + test = self() + + {:ok, _pid} = + Task.start_link(fn -> + {:ok, _} = Registry.register(Img.mutable_registry(), vm_id, nil) + send(test, {:registered, vm_id}) + Process.sleep(:infinity) + end) + + assert_receive {:registered, ^vm_id} + end + + test "a vm with a live mutable layer is never an orphan, even with a leftover rw device" do + register_live("vm-live") + + live = MapSet.new(Mutable.active_vm_ids()) + + # The reaper would see hyper-rw-vm-live in `dmsetup ls` (rw candidate) and no + # cgroup leaf, yet the live mutable owner must protect it from reaping. + assert Plan.orphans(live, [], ["vm-live"]) == MapSet.new([]) + end +end From c432ea0cc96b159adace9866a656c2e25440c423 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 23:05:51 +0000 Subject: [PATCH 5/6] docs(reaper): correct liveness-source count; note intentional create_mutable/2 refusal and liveness test scope --- lib/hyper/node/img.ex | 3 +++ lib/hyper/node/reaper.ex | 10 ++++++---- test/hyper/node/reaper/liveness_test.exs | 3 +++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/hyper/node/img.ex b/lib/hyper/node/img.ex index db19e99..df242ea 100644 --- a/lib/hyper/node/img.ex +++ b/lib/hyper/node/img.ex @@ -61,6 +61,9 @@ defmodule Hyper.Node.Img do @doc "Create a per-VM mutable layer for `vm_id` over `img_id`." @spec create_mutable(Hyper.Img.id(), Hyper.Vm.Id.t()) :: {:ok, pid()} | {:error, term()} def create_mutable(img_id, vm_id) do + # Unlike activate/1, we intentionally do NOT map {:already_started, pid} -> {:ok, pid}: + # vm_ids are unique per VM, so a duplicate vm_id is a bug, not a shared-server reuse. + # Surfacing {:error, {:already_started, pid}} enforces the one-mutable-per-vm invariant. case DynamicSupervisor.start_child( @mutable_sup, {Mutable, %Mutable.Opts{img_id: img_id, vm_id: vm_id}} diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex index 75ae021..9dfd87f 100644 --- a/lib/hyper/node/reaper.ex +++ b/lib/hyper/node/reaper.ex @@ -6,10 +6,12 @@ defmodule Hyper.Node.Reaper do whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and the relaunch-time cleanup in the FireVMM path, never get a chance to clear it). - Liveness is the whole point. The reaper consults two independent sources of - truth for "this vm is alive" (`Plan.orphans/3` removes their union from the - candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup - leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A + Liveness is the whole point. The reaper consults three independent sources of + truth for "this vm is alive" — the local VM supervisor's children, the cluster + routing table, and the node's live mutable layers (`Img.Mutable.active_vm_ids/0`) + — (`Plan.orphans/3` removes their union from the candidate set) and only ever + touches `hyper-rw-*` dm names and per-VM cgroup leaves - never `hyper-thinpool`, + `hyper-img-*`, or a live VM's resources. A candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it is reaped, so a VM caught mid-boot (resources present, not yet registered) is given a grace tick rather than destroyed. diff --git a/test/hyper/node/reaper/liveness_test.exs b/test/hyper/node/reaper/liveness_test.exs index 7033e0e..e9068dc 100644 --- a/test/hyper/node/reaper/liveness_test.exs +++ b/test/hyper/node/reaper/liveness_test.exs @@ -1,4 +1,7 @@ defmodule Hyper.Node.Reaper.LivenessTest do + # Guards the Img.Mutable.active_vm_ids/0 + Plan.orphans/3 contract: a live mutable + # layer protects its vm_id from reaping. The gather_live/0 union of this source into + # the full live set is verified by manual live-node testing (Task 4), not CI. use ExUnit.Case, async: false alias Hyper.Node.Img From c2ef242de0704dec0069663c16a9f39b2559f87c Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Wed, 1 Jul 2026 04:09:04 +0000 Subject: [PATCH 6/6] docs(node): note resource-lifetime/reconciliation architecture The restart: :temporary requirement on the refcounted layer servers is a symptom of coupling external-resource lifetime to BEAM-process lifetime via terminate/2. Add a TODO(arch) at the reaper sweep (the reconciler-in-waiting) proposing reconciliation as the primary cleanup mechanism, plus a why-note at each :temporary site pointing to it. --- lib/hyper/node/img/mutable.ex | 4 ++++ lib/hyper/node/img/server.ex | 4 ++++ lib/hyper/node/layer/server.ex | 4 ++++ lib/hyper/node/reaper.ex | 10 ++++++++++ 4 files changed, 22 insertions(+) diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index 15d4a16..05cce3d 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -16,6 +16,10 @@ defmodule Hyper.Node.Img.Mutable do down the RO chain in turn). """ + # `:temporary` is load-bearing: on idle this server destroys its per-VM thin + # volume in `terminate/2`, so a `:permanent` restart would resurrect the dm + # device it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper` + # for why coupling resource lifetime to process lifetime is a smell. use GenServer, restart: :temporary alias Hyper.Node.Img diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex index 8d4f5aa..8fd7f0b 100644 --- a/lib/hyper/node/img/server.ex +++ b/lib/hyper/node/img/server.ex @@ -12,6 +12,10 @@ defmodule Hyper.Node.Img.Server do and releasing its layers (which then unmount once nothing else holds them). """ + # `:temporary` is load-bearing: on idle this server removes its shared image dm + # chain in `terminate/2`, so a `:permanent` restart would resurrect the chain + # it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper` for why + # coupling resource lifetime to process lifetime is a smell. use GenServer, restart: :temporary require Logger diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex index 0ff7c1b..90a5ff4 100644 --- a/lib/hyper/node/layer/server.ex +++ b/lib/hyper/node/layer/server.ex @@ -9,6 +9,10 @@ defmodule Hyper.Node.Layer.Server do grace period keeps bursty acquire/release cycles from thrashing the mount. """ + # `:temporary` is load-bearing: on idle this server removes its layer's dm + # mount in `terminate/2`, so a `:permanent` restart would resurrect the device + # it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper` for why + # coupling resource lifetime to process lifetime is a smell. use GenServer, restart: :temporary require Logger diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex index 9dfd87f..c6effa8 100644 --- a/lib/hyper/node/reaper.ex +++ b/lib/hyper/node/reaper.ex @@ -68,6 +68,16 @@ defmodule Hyper.Node.Reaper do Process.send_after(self(), :tick, Unit.Time.as_ms(@interval)) end + # TODO(arch): this sweep is a periodic reconciler (desired VMs vs actual + # dm/cgroup state) bolted on as a *backstop* to per-process `terminate/2` + # cleanup. That split is the root fragility behind the `restart: :temporary` + # requirement in `Img.Mutable`/`Img.Server`/`Layer.Server`: resource lifetime is + # coupled to BEAM-process lifetime, so restart strategy is load-bearing and this + # reaper must independently reconstruct liveness (which then has to stay in sync + # with the real owners -- the exact drift that stranded/reaped live volumes). + # A sturdier design promotes this reconciliation to the *primary* cleanup + # mechanism, demoting `terminate/2` to an optimization rather than a correctness + # requirement -- at which point restart strategy stops being load-bearing. @spec sweep(t()) :: t() @decorate with_span("Hyper.Node.Reaper.sweep") defp sweep(%__MODULE__{} = state) do