Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lib/hyper/node/img.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ defmodule Hyper.Node.Img do
alias Hyper.Node.Img.ThinPool

@registry Hyper.Node.Img.Registry
@mutable_registry Hyper.Node.Img.MutableRegistry
@server_sup Hyper.Node.Img.Supervisor
@mutable_sup Hyper.Node.Img.MutableSupervisor

Expand All @@ -31,6 +32,7 @@ defmodule Hyper.Node.Img do
def init(_opts) do
children = [
{Registry, keys: :unique, name: @registry},
{Registry, keys: :unique, name: @mutable_registry},
ThinPool,
{DynamicSupervisor, strategy: :one_for_one, name: @server_sup},
{DynamicSupervisor, strategy: :one_for_one, name: @mutable_sup}
Expand All @@ -42,6 +44,10 @@ defmodule Hyper.Node.Img do
@doc false
def registry, do: @registry

@doc false
@spec mutable_registry() :: atom()
def mutable_registry, do: @mutable_registry

@doc "Activate `img_id` on this node: start (or reuse) its image server."
@spec activate(Hyper.Img.id()) :: {:ok, pid()} | {:error, term()}
def activate(img_id) do
Expand All @@ -55,6 +61,9 @@ defmodule Hyper.Node.Img do
@doc "Create a per-VM mutable layer for `vm_id` over `img_id`."
@spec create_mutable(Hyper.Img.id(), Hyper.Vm.Id.t()) :: {:ok, pid()} | {:error, term()}
def create_mutable(img_id, vm_id) do
# Unlike activate/1, we intentionally do NOT map {:already_started, pid} -> {:ok, pid}:
# vm_ids are unique per VM, so a duplicate vm_id is a bug, not a shared-server reuse.
# Surfacing {:error, {:already_started, pid}} enforces the one-mutable-per-vm invariant.
case DynamicSupervisor.start_child(
@mutable_sup,
{Mutable, %Mutable.Opts{img_id: img_id, vm_id: vm_id}}
Expand Down
17 changes: 15 additions & 2 deletions lib/hyper/node/img/mutable.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ defmodule Hyper.Node.Img.Mutable do
down the RO chain in turn).
"""

use GenServer
# `:temporary` is load-bearing: on idle this server destroys its per-VM thin
# volume in `terminate/2`, so a `:permanent` restart would resurrect the dm
# device it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper`
# for why coupling resource lifetime to process lifetime is a smell.
use GenServer, restart: :temporary

alias Hyper.Node.Img
alias Hyper.Node.Img.{Server, ThinPool}
Expand Down Expand Up @@ -47,7 +51,8 @@ defmodule Hyper.Node.Img.Mutable do
end

@spec start_link(Opts.t()) :: GenServer.on_start()
def start_link(%Opts{} = opts), do: GenServer.start_link(__MODULE__, opts)
def start_link(%Opts{vm_id: vm_id} = opts),
do: GenServer.start_link(__MODULE__, opts, name: via(vm_id))

@spec blk_path(GenServer.server()) :: Path.t()
def blk_path(server), do: GenServer.call(server, :blk_path)
Expand Down Expand Up @@ -146,6 +151,14 @@ defmodule Hyper.Node.Img.Mutable do
@spec dm_name(Hyper.Vm.Id.t()) :: String.t()
def dm_name(vm_id), do: "hyper-rw-#{sanitize(vm_id)}"

@doc "vm_ids of every live mutable layer on this node."
@spec active_vm_ids() :: [Hyper.Vm.Id.t()]
def active_vm_ids do
Registry.select(Img.mutable_registry(), [{{:"$1", :_, :_}, [], [:"$1"]}])
end

defp via(vm_id), do: {:via, Registry, {Img.mutable_registry(), vm_id}}

defp sanitize(id), do: String.replace(id, ~r/[^A-Za-z0-9._-]/, "_")

@spec drop(State.t(), pid()) :: State.t()
Expand Down
6 changes: 5 additions & 1 deletion lib/hyper/node/img/server.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ defmodule Hyper.Node.Img.Server do
and releasing its layers (which then unmount once nothing else holds them).
"""

use GenServer
# `:temporary` is load-bearing: on idle this server removes its shared image dm
# chain in `terminate/2`, so a `:permanent` restart would resurrect the chain
# it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper` for why
# coupling resource lifetime to process lifetime is a smell.
use GenServer, restart: :temporary
require Logger

alias Hyper.Img.Db
Expand Down
6 changes: 5 additions & 1 deletion lib/hyper/node/layer/server.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ defmodule Hyper.Node.Layer.Server do
grace period keeps bursty acquire/release cycles from thrashing the mount.
"""

use GenServer
# `:temporary` is load-bearing: on idle this server removes its layer's dm
# mount in `terminate/2`, so a `:permanent` restart would resurrect the device
# it just tore down. See the reconciliation TODO in `Hyper.Node.Reaper` for why
# coupling resource lifetime to process lifetime is a smell.
use GenServer, restart: :temporary
require Logger

alias Hyper.Node.Layer
Expand Down
35 changes: 28 additions & 7 deletions lib/hyper/node/reaper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ defmodule Hyper.Node.Reaper do
whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and
the relaunch-time cleanup in the FireVMM path, never get a chance to clear it).

Liveness is the whole point. The reaper consults two independent sources of
truth for "this vm is alive" (`Plan.orphans/3` removes their union from the
candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup
leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A
Liveness is the whole point. The reaper consults three independent sources of
truth for "this vm is alive" — the local VM supervisor's children, the cluster
routing table, and the node's live mutable layers (`Img.Mutable.active_vm_ids/0`)
— (`Plan.orphans/3` removes their union from the candidate set) and only ever
touches `hyper-rw-*` dm names and per-VM cgroup leaves - never `hyper-thinpool`,
`hyper-img-*`, or a live VM's resources. A
candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it
is reaped, so a VM caught mid-boot (resources present, not yet registered) is
given a grace tick rather than destroyed.
Expand Down Expand Up @@ -66,6 +68,16 @@ defmodule Hyper.Node.Reaper do
Process.send_after(self(), :tick, Unit.Time.as_ms(@interval))
end

# TODO(arch): this sweep is a periodic reconciler (desired VMs vs actual
# dm/cgroup state) bolted on as a *backstop* to per-process `terminate/2`
# cleanup. That split is the root fragility behind the `restart: :temporary`
# requirement in `Img.Mutable`/`Img.Server`/`Layer.Server`: resource lifetime is
# coupled to BEAM-process lifetime, so restart strategy is load-bearing and this
# reaper must independently reconstruct liveness (which then has to stay in sync
# with the real owners -- the exact drift that stranded/reaped live volumes).
# A sturdier design promotes this reconciliation to the *primary* cleanup
# mechanism, demoting `terminate/2` to an optimization rather than a correctness
# requirement -- at which point restart strategy stops being load-bearing.
@spec sweep(t()) :: t()
@decorate with_span("Hyper.Node.Reaper.sweep")
defp sweep(%__MODULE__{} = state) do
Expand All @@ -81,10 +93,19 @@ defmodule Hyper.Node.Reaper do
end

# Over-counting "live" only defers a reap (safe); under-counting destroys a live
# VM (catastrophic). So union two independent liveness sources: the local VM
# supervisor's children and the cluster routing table's view of this node.
# VM (catastrophic). Union three independent liveness sources: the local VM
# supervisor's children, the cluster routing table's view of this node, and the
# live mutable layers (which own the hyper-rw volumes and exist during the
# mid-boot and idle-grace windows when a vm is in neither of the other two).
@spec gather_live() :: MapSet.t(String.t())
defp gather_live, do: MapSet.union(local_live(), routed_live())
defp gather_live do
local_live()
|> MapSet.union(routed_live())
|> MapSet.union(mutable_live())
end

@spec mutable_live() :: MapSet.t(String.t())
defp mutable_live, do: MapSet.new(Img.Mutable.active_vm_ids())

@spec local_live() :: MapSet.t(String.t())
defp local_live do
Expand Down
66 changes: 66 additions & 0 deletions test/hyper/node/img/mutable_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
defmodule Hyper.Node.Img.MutableTest do
use ExUnit.Case, async: false

alias Hyper.Node.Img
alias Hyper.Node.Img.Mutable

setup do
# The full app does not start in the test env, so stand up just the registry
# the mutable layers register into. Reuse the real name so we exercise the
# exact lookup `active_vm_ids/0` performs.
name = Img.mutable_registry()

unless Process.whereis(name) do
start_supervised!({Registry, keys: :unique, name: name})
end

:ok
end

defp register_live(vm_id) do
test = self()

{:ok, pid} =
Task.start_link(fn ->
{:ok, _} = Registry.register(Img.mutable_registry(), vm_id, nil)
send(test, {:registered, vm_id})
Process.sleep(:infinity)
end)

assert_receive {:registered, ^vm_id}
pid
end

test "active_vm_ids lists the vm_ids of every live mutable layer" do
register_live("vm-a")
register_live("vm-b")

assert Enum.sort(Mutable.active_vm_ids()) == ["vm-a", "vm-b"]
end

test "active_vm_ids drops a vm_id once its mutable layer dies" do
pid = register_live("vm-gone")
ref = Process.monitor(pid)
Process.unlink(pid)
Process.exit(pid, :kill)
assert_receive {:DOWN, ^ref, :process, ^pid, _}

# Registry removes the entry on the registered process's :DOWN. Poll briefly
# so the test is robust to that async unregister without a fixed sleep.
assert eventually(fn -> Mutable.active_vm_ids() == [] end)
end

defp eventually(fun, attempts \\ 50) do
cond do
fun.() ->
true

attempts == 0 ->
false

true ->
Process.sleep(2)
eventually(fun, attempts - 1)
end
end
end
43 changes: 43 additions & 0 deletions test/hyper/node/reaper/liveness_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
defmodule Hyper.Node.Reaper.LivenessTest do
# Guards the Img.Mutable.active_vm_ids/0 + Plan.orphans/3 contract: a live mutable
# layer protects its vm_id from reaping. The gather_live/0 union of this source into
# the full live set is verified by manual live-node testing (Task 4), not CI.
use ExUnit.Case, async: false

alias Hyper.Node.Img
alias Hyper.Node.Img.Mutable
alias Hyper.Node.Reaper.Plan

setup do
name = Img.mutable_registry()

unless Process.whereis(name) do
start_supervised!({Registry, keys: :unique, name: name})
end

:ok
end

defp register_live(vm_id) do
test = self()

{:ok, _pid} =
Task.start_link(fn ->
{:ok, _} = Registry.register(Img.mutable_registry(), vm_id, nil)
send(test, {:registered, vm_id})
Process.sleep(:infinity)
end)

assert_receive {:registered, ^vm_id}
end

test "a vm with a live mutable layer is never an orphan, even with a leftover rw device" do
register_live("vm-live")

live = MapSet.new(Mutable.active_vm_ids())

# The reaper would see hyper-rw-vm-live in `dmsetup ls` (rw candidate) and no
# cgroup leaf, yet the live mutable owner must protect it from reaping.
assert Plan.orphans(live, [], ["vm-live"]) == MapSet.new([])
end
end
23 changes: 23 additions & 0 deletions test/hyper/node/refcounted_restart_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
defmodule Hyper.Node.RefcountedRestartTest do
use ExUnit.Case, async: true

# These three servers are monitor-refcounted and idle-reap: when their last
# holder goes away they self-terminate with `{:stop, :normal}`, destroying an
# external device-mapper resource in `terminate/2`. Under a DynamicSupervisor a
# `:permanent` child is RESTARTED on that intentional `:normal` exit, and its
# `init/1` re-creates the resource it just tore down -- an endless resurrection
# loop that leaks dm devices. They MUST be `:temporary` so idle-teardown is
# final. This pins that invariant for every server in the refcount tier and for
# any new one added later.
@idle_reaping_servers [
Hyper.Node.Img.Mutable,
Hyper.Node.Img.Server,
Hyper.Node.Layer.Server
]

for mod <- @idle_reaping_servers do
test "#{inspect(mod)} is restart: :temporary so its idle :stop is not undone" do
assert unquote(mod).child_spec([]).restart == :temporary
end
end
end
Loading