Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion lib/hyper/node.ex
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ defmodule Hyper.Node do
real-time monitors backing the soft budget (`Hyper.Node.Budget.Soft`).
Lives here, not at the application root, because both are per-node and only
meaningful while this node hosts VMs.

* `Hyper.Node.Reaper` - a periodic, liveness-aware GC for per-VM host
resources (orphaned firecracker cgroups and `hyper-rw-*` dm volumes) stranded
by an unclean death whose vm_id never reboots. Started last so the VM
supervisor it consults for liveness is already up.
"""

use Supervisor
Expand Down Expand Up @@ -61,7 +66,10 @@ defmodule Hyper.Node do
Hyper.Node.Layer,
Hyper.Node.Budget.Supervisor,
{DynamicSupervisor, name: @vm_sup, strategy: :one_for_one},
Hyper.Node.Img
Hyper.Node.Img,
# Last child: :one_for_one starts in order, so the VM supervisor and Img are
# up before the reaper's first tick can read their liveness.
Hyper.Node.Reaper
]

Supervisor.init(children, strategy: :one_for_one)
Expand Down
22 changes: 19 additions & 3 deletions lib/hyper/node/fire_vmm/jailer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,29 @@ defmodule Hyper.Node.FireVMM.Jailer do
end

@doc """
Host path of the VM's cgroup leaf (`/sys/fs/cgroup/<parent>/<exec>/<id>`), the
cgroup the jailer creates for firecracker. Reconstructed (the jailer owns its
Host path of the cgroup dir holding every VM's leaf (`/sys/fs/cgroup/<parent>`).
Its immediate subdir names are vm_ids - the leaves the jailer creates for
firecracker - so listing it (directories only; the dir also holds cgroup control
files) enumerates the cgroup leaves on this host.

Note the cgroup hierarchy has NO `<exec>` level: the jailer (cgroup v2,
`--parent-cgroup <parent>`) places firecracker directly at `<parent>/<id>`,
unlike the chroot (`<chroot_base>/<exec>/<id>`). Confirmed via
`/proc/<pid>/cgroup` = `0::/<parent>/<id>`.
"""
@spec cgroup_parent_dir() :: Path.t()
def cgroup_parent_dir do
Path.join("/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup())
end

@doc """
Host path of the VM's cgroup leaf (`/sys/fs/cgroup/<parent>/<id>`), the cgroup
the jailer creates for firecracker. Reconstructed (the jailer owns its
placement) so a relaunch can clear the stale leaf left by a prior incarnation.
"""
@spec cgroup_dir(Hyper.Vm.Id.t()) :: Path.t()
def cgroup_dir(id) do
Path.join(["/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup(), exec_name(), id])
Path.join(cgroup_parent_dir(), id)
end

@doc """
Expand Down
164 changes: 164 additions & 0 deletions lib/hyper/node/reaper.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
defmodule Hyper.Node.Reaper do
@moduledoc """
Per-node periodic, liveness-aware garbage collector for per-VM host resources
that an unclean BEAM death can strand: a firecracker cgroup leaf and a
`hyper-rw-<id>` dm volume whose owning processes' `terminate/2` never ran and
whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and
the relaunch-time cleanup in the FireVMM path, never get a chance to clear it).

Liveness is the whole point. The reaper consults two independent sources of
truth for "this vm is alive" (`Plan.orphans/3` removes their union from the
candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup
leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A
candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it
is reaped, so a VM caught mid-boot (resources present, not yet registered) is
given a grace tick rather than destroyed.

The decision logic lives in the pure `Hyper.Node.Reaper.Plan`; this module is a
thin I/O adapter that gathers the inputs, calls the plan, and executes the
best-effort, idempotent removals.
"""

use GenServer
use OpenTelemetryDecorator
require Logger

alias Hyper.Cluster.Routing
alias Hyper.Node.FireVMM.Jailer
alias Hyper.Node.Img
alias Hyper.Node.Reaper.Plan
alias Hyper.SuidHelper.{ChrootJail, Dmsetup}

@vm_sup Hyper.Node.VMSupervisor

# Rest between reap ticks. Deliberately not configurable: the two-strike
# confirmation (`Plan.confirm/2`) already means an orphan is reaped at most one
# interval after it is first seen, so the exact value is not load-bearing.
@interval Unit.Time.s(60)

defstruct last_orphans: MapSet.new()

@type t :: %__MODULE__{last_orphans: MapSet.t(String.t())}

@spec start_link(keyword()) :: GenServer.on_start()
def start_link(opts \\ []) do
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end

@impl true
def init(_opts) do
schedule()
{:ok, %__MODULE__{}}
end

@impl true
def handle_info(:tick, state) do
state = sweep(state)
schedule()
{:noreply, state}
end

# Ignore any unexpected message (port noise, stale timers) rather than crashing.
def handle_info(_msg, state), do: {:noreply, state}

@spec schedule() :: reference()
defp schedule do
Process.send_after(self(), :tick, Unit.Time.as_ms(@interval))
end

@spec sweep(t()) :: t()
@decorate with_span("Hyper.Node.Reaper.sweep")
defp sweep(%__MODULE__{} = state) do
live = gather_live()
leaves = list_cgroup_leaves()
rw = Plan.rw_ids(list_rw_dm())

current = Plan.orphans(live, leaves, rw)
{to_reap, next} = Plan.confirm(current, state.last_orphans)

Enum.each(to_reap, &reap_one/1)
%{state | last_orphans: next}
end

# Over-counting "live" only defers a reap (safe); under-counting destroys a live
# VM (catastrophic). So union two independent liveness sources: the local VM
# supervisor's children and the cluster routing table's view of this node.
@spec gather_live() :: MapSet.t(String.t())
defp gather_live, do: MapSet.union(local_live(), routed_live())

@spec local_live() :: MapSet.t(String.t())
defp local_live do
case Process.whereis(@vm_sup) do
nil ->
MapSet.new()

_pid ->
@vm_sup
|> DynamicSupervisor.which_children()
|> Enum.map(fn {_, child, _, _} -> child end)
|> Enum.filter(&is_pid/1)
|> Enum.map(&Routing.id_for/1)
|> Enum.reject(&is_nil/1)
|> MapSet.new()
end
end

@spec routed_live() :: MapSet.t(String.t())
defp routed_live do
for {id, node} <- Routing.all(), node == node(), into: MapSet.new(), do: id
end

@spec list_cgroup_leaves() :: [String.t()]
defp list_cgroup_leaves do
parent = Jailer.cgroup_parent_dir()

case File.ls(parent) do
{:ok, names} ->
# The parent holds the per-VM leaf directories alongside cgroup control
# files (`cgroup.procs`, `cgroup.controllers`, ...); only the directories
# are vm_id leaves.
Enum.filter(names, &File.dir?(Path.join(parent, &1)))

{:error, :enoent} ->
[]

{:error, reason} ->
Logger.warning("reaper: could not list cgroup leaves: #{inspect(reason)}")
[]
end
end

@spec list_rw_dm() :: [String.t()]
defp list_rw_dm do
case Dmsetup.list() do
{:ok, names} ->
names

{:error, reason} ->
Logger.warning("reaper: could not list dm devices: #{inspect(reason)}")
[]
end
end

@spec reap_one(String.t()) :: :ok
@decorate with_span("Hyper.Node.Reaper.reap_one", include: [:id])
defp reap_one(id) do
Logger.warning("reaper: reaping orphan vm #{id}")

log_result(
"chroot/cgroup",
id,
ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id))
)

log_result("dm volume", id, Dmsetup.remove(Img.Mutable.dm_name(id)))
:ok
end

@spec log_result(String.t(), String.t(), :ok | {:error, term()}) :: :ok
defp log_result(_what, _id, :ok), do: :ok

defp log_result(what, id, {:error, reason}) do
Logger.warning("reaper: removing #{what} for #{id} failed: #{inspect(reason)}")
end
end
32 changes: 32 additions & 0 deletions lib/hyper/node/reaper/plan.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
defmodule Hyper.Node.Reaper.Plan do
@moduledoc """
Pure reap-decision core for `Hyper.Node.Reaper`. No I/O. Every safety invariant
is a property of these functions: a live vm_id is never a candidate, only an
orphan seen on two consecutive ticks is reaped, and only `hyper-rw-*` dm names
yield candidates (so `hyper-thinpool` / `hyper-img-*` can never be reaped).
"""

@rw_prefix "hyper-rw-"

@doc "vm_ids of orphan rw-volumes from raw `dmsetup ls` names (only `hyper-rw-*`)."
@spec rw_ids([String.t()]) :: [String.t()]
def rw_ids(dm_names) do
for name <- dm_names,
String.starts_with?(name, @rw_prefix),
do: String.replace_prefix(name, @rw_prefix, "")
end

@doc "Candidate orphans this tick: (cgroup leaves ∪ rw vm_ids) minus the live set."
@spec orphans(MapSet.t(String.t()), [String.t()], [String.t()]) :: MapSet.t(String.t())
def orphans(live, cgroup_leaves, rw_ids) do
cgroup_leaves
|> MapSet.new()
|> MapSet.union(MapSet.new(rw_ids))
|> MapSet.difference(live)
end

@doc "Two-strike grace: reap only ids that were also orphans last tick. Returns {to_reap, next_last}."
@spec confirm(MapSet.t(String.t()), MapSet.t(String.t())) ::
{MapSet.t(String.t()), MapSet.t(String.t())}
def confirm(current, last), do: {MapSet.intersection(current, last), current}
end
78 changes: 78 additions & 0 deletions test/hyper/node/reaper/plan_properties_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
defmodule Hyper.Node.Reaper.PlanPropertiesTest do
@moduledoc """
Safety laws every `Hyper.Node.Reaper.Plan` decision must obey, generated over a
small shared id alphabet so live / cgroup-leaf / rw sets actually overlap:

* a live vm_id is NEVER a reap candidate (the union of liveness sources is
removed from the orphan set);
* only an orphan seen on two consecutive ticks is reaped (`confirm/2` reaps a
subset of both current and last, and carries `current` forward);
* `hyper-thinpool` / `hyper-img-*` / any non-`hyper-rw-*` name never yields a
candidate, and `Mutable.dm_name(id)` round-trips back to exactly `id` (so a
future id-scheme change that breaks the strip fails loudly here).
"""
use ExUnit.Case, async: true
use ExUnitProperties

alias Hyper.Node.Img.Mutable
alias Hyper.Node.Reaper.Plan

# A deliberately tiny alphabet so generated live/leaf/rw id sets collide often,
# exercising the difference and intersection rather than always being disjoint.
defp id, do: member_of(~w(a b c d e))

defp id_set, do: map(list_of(id()), &MapSet.new/1)

defp id_list, do: list_of(id())

property "a live vm_id is never a reap candidate" do
check all(
live <- id_set(),
leaves <- id_list(),
rw <- id_list()
) do
orphans = Plan.orphans(live, leaves, rw)
assert MapSet.disjoint?(orphans, live)
end
end

property "only twice-seen orphans are reaped; current is carried forward" do
check all(
current <- id_set(),
last <- id_set()
) do
{reap, next} = Plan.confirm(current, last)

assert MapSet.subset?(reap, current)
assert MapSet.subset?(reap, last)
assert next == current
end
end

property "rw_ids excludes thinpool, img, and non-rw junk" do
safe_dm =
member_of([
"hyper-thinpool",
"hyper-img-abc-0",
"hyper-img-deadbeef-3",
"unrelated-device",
"cryptroot"
])

check all(names <- list_of(safe_dm)) do
assert Plan.rw_ids(names) == []
end
end

property "Mutable.dm_name/1 round-trips through rw_ids for a real vm_id" do
check all(
id <-
map(
binary(min_length: 1, max_length: 16),
&("v" <> Base.encode32(&1, padding: false, case: :lower))
)
) do
assert Plan.rw_ids([Mutable.dm_name(id)]) == [id]
end
end
end
65 changes: 65 additions & 0 deletions test/hyper/node/reaper/plan_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
defmodule Hyper.Node.Reaper.PlanTest do
use ExUnit.Case, async: true

alias Hyper.Node.Reaper.Plan

defp set(ids), do: MapSet.new(ids)

describe "orphans/3" do
test "a cgroup-leaf-only orphan is a candidate" do
assert Plan.orphans(set([]), ["dead"], []) == set(["dead"])
end

test "a dm-only orphan is a candidate" do
assert Plan.orphans(set([]), [], ["dead"]) == set(["dead"])
end

test "an id seen in both sources is a single candidate" do
assert Plan.orphans(set([]), ["dead"], ["dead"]) == set(["dead"])
end

test "an id present in live is never a candidate, even if it also has resources" do
assert Plan.orphans(set(["alive"]), ["alive"], ["alive"]) == set([])
end

test "only the non-live ids survive as candidates" do
assert Plan.orphans(set(["alive"]), ["alive", "dead"], ["alive", "gone"]) ==
set(["dead", "gone"])
end
end

describe "confirm/2 two-strike grace" do
test "first tick reaps nothing (last is empty) but remembers the candidates" do
current = set(["x", "y"])
{reap, next} = Plan.confirm(current, set([]))

assert reap == set([])
assert next == current
end

test "second tick reaps the still-orphan ids" do
{_, last} = Plan.confirm(set(["x", "y"]), set([]))
{reap, _next} = Plan.confirm(set(["x", "y"]), last)

assert reap == set(["x", "y"])
end

test "an id orphaned tick1 but live/absent tick2 is not reaped" do
{_, last} = Plan.confirm(set(["x"]), set([]))
{reap, _next} = Plan.confirm(set([]), last)

assert reap == set([])
end

test "an id new in tick2 is not reaped (only one strike)" do
{_, last} = Plan.confirm(set(["x"]), set([]))
{reap, _next} = Plan.confirm(set(["x", "fresh"]), last)

assert reap == set(["x"])
end
end

test "rw_ids/1 strips the hyper-rw- prefix and ignores thinpool/img names" do
assert Plan.rw_ids(["hyper-thinpool", "hyper-img-abc-0", "hyper-rw-vabc"]) == ["vabc"]
end
end
Loading