diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex index 1f60f8f7..9491a132 100644 --- a/lib/hyper/cluster/routing.ex +++ b/lib/hyper/cluster/routing.ex @@ -28,6 +28,26 @@ defmodule Hyper.Cluster.Routing do @spec via(term()) :: {:via, module(), {atom(), term()}} def via(key), do: {:via, Horde.Registry, {@name, key}} + @doc """ + Register the calling process under `key` from inside its own `init`. + + Prefer this over starting a process with a `{:via, Horde.Registry, _}` name. + OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name` + immediately after the synchronous `register`, but Horde materialises the name + into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under + registry churn that read loses the race and OTP aborts startup with + `{:process_not_registered_via, Horde.Registry}`. Registering from within + `init` carries no such self-check, while leaving the name cluster-resolvable + through `via/1` once the diff propagates (callers already tolerate that lag). + """ + @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}} + def register_self(key) do + case Horde.Registry.register(@name, key, nil) do + {:ok, _pid} -> :ok + {:error, {:already_registered, _pid}} = err -> err + end + end + @doc "Which node currently runs `vm_id`? `nil` if unknown." @spec whereis(Hyper.Vm.Id.t()) :: node() | nil @decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id]) diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 49c12cd7..4f172e88 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -161,7 +161,7 @@ defmodule Hyper.Node do @spec test_system :: :ok | {:error, term()} def test_system do with {:ok, _} <- Hyper.Cfg.Budget.load(), - :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(), + :ok <- check_firecracker_bins(), :ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(), :ok <- Hyper.Node.Vmlinux.test_system(), :ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(), @@ -175,6 +175,24 @@ defmodule Hyper.Node do end end + @spec check_firecracker_bins :: + :ok + | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + | {:error, :firecracker_not_configured | :jailer_not_configured} + defp check_firecracker_bins do + with {:fc, {:ok, fc}} <- {:fc, Hyper.Cfg.Tools.firecracker_configured()}, + {:jail, {:ok, jail}} <- {:jail, Hyper.Cfg.Tools.jailer_configured()} do + cond do + not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} + not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} + true -> :ok + end + else + {:fc, :error} -> {:error, :firecracker_not_configured} + {:jail, :error} -> {:error, :jailer_not_configured} + end + end + @spec check_helper_base(Path.t()) :: :ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}} defp check_helper_base(base) do diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex index 12f8236b..22b069ac 100644 --- a/lib/hyper/node/fire_vmm.ex +++ b/lib/hyper/node/fire_vmm.ex @@ -47,14 +47,15 @@ defmodule Hyper.Node.FireVMM do @spec start_link(Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id)) + Supervisor.start_link(__MODULE__, opts) end + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(opts) do # Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by # the node-level DynamicSupervisor. %{ - vm_id: {__MODULE__, opts.vm_id}, + id: {__MODULE__, opts.vm_id}, start: {__MODULE__, :start_link, [opts]}, type: :supervisor, restart: :transient @@ -63,18 +64,26 @@ defmodule Hyper.Node.FireVMM do @impl true def init(opts) do - children = [ - # Client must be registered before Core: Core starts the State machine, - # which calls Client.run while waiting for the daemon's API. Client depends - # only on vm_id (an independent peer), so it has no reverse dependency. - {Client, %Client.Opts{vm_id: opts.vm_id}}, - {Core, opts} - ] + # Self-register the cluster routing entry here rather than via a start name; + # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never + # collides, so `:already_registered` only happens against a stale dead + # incarnation - decline the start and let the supervisor retry clean. + case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do + :ok -> + children = [ + # Client must be registered before Core: Core starts the State machine, + # which calls Client.run while waiting for the daemon's API. Client + # depends only on vm_id (an independent peer), so no reverse dependency. + {Client, %Client.Opts{vm_id: opts.vm_id}}, + {Core, opts} + ] - Supervisor.init(children, strategy: :one_for_one) - end + Supervisor.init(children, strategy: :one_for_one) - defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor}) + {:error, _} -> + :ignore + end + end @doc "Test whether the system can run firecracker VMMs." @spec test_system() :: :ok | {:error, term()} diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex index f3911bcf..c41b5f1d 100644 --- a/lib/hyper/node/fire_vmm/client.ex +++ b/lib/hyper/node/fire_vmm/client.ex @@ -55,15 +55,12 @@ defmodule Hyper.Node.FireVMM.Client do @type t :: %__MODULE__{socket_path: Path.t()} end + # Prod path (vm_id, no explicit name) starts unnamed and self-registers in + # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override + # (test stand-ins) is honoured as a plain local name and skips registration. @spec start_link(Opts.t()) :: GenServer.on_start() def start_link(%Opts{} = opts) do - name = - case opts.name do - nil when not is_nil(opts.vm_id) -> via(opts.vm_id) - other -> other - end - - GenServer.start_link(__MODULE__, opts, gen_opts(name)) + GenServer.start_link(__MODULE__, opts, gen_opts(opts.name)) end @spec via(Hyper.Vm.Id.t()) :: GenServer.name() @@ -79,12 +76,26 @@ defmodule Hyper.Node.FireVMM.Client do end @impl true - @spec init(Opts.t()) :: {:ok, State.t()} + @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}} def init(%Opts{} = opts) do - socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) - {:ok, %State{socket_path: socket_path}} + with :ok <- register(opts) do + socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) + {:ok, %State{socket_path: socket_path}} + end + end + + # Register cluster-wide under {vm_id, :client} on the prod path. With an + # explicit name (test stand-in), the name is the local registration, so skip. + @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}} + defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do + case Hyper.Cluster.Routing.register_self({vm_id, :client}) do + :ok -> :ok + {:error, reason} -> {:stop, reason} + end end + defp register(%Opts{}), do: :ok + @impl true def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do {:reply, op_fun.(socket_path: socket_path), state} diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex index 8ccf6180..34830b54 100644 --- a/lib/hyper/node/fire_vmm/core.ex +++ b/lib/hyper/node/fire_vmm/core.ex @@ -18,8 +18,9 @@ defmodule Hyper.Node.FireVMM.Core do * firecracker crash -> the `Daemon` child exits; both restart; `Daemon` resets the stale jail and relaunches, and the fresh controller cold-boots. - `MuonTrap` kills the OS process when its port closes (teardown or BEAM death), - so no firecracker process outlives the supervisor. + `Daemon` guarantees firecracker is dead on teardown via the helper's + `cgroup.kill` (MuonTrap's port-close kill misses the setsid'd firecracker), so + no firecracker process outlives a graceful supervisor shutdown. """ use Supervisor @@ -28,9 +29,12 @@ defmodule Hyper.Node.FireVMM.Core do alias Hyper.Node.FireVMM.Daemon alias Hyper.Node.FireVMM.State + # Started unnamed: nothing resolves the core by name (it is addressed as a + # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a + # needless racy Horde registration at startup. @spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core})) + Supervisor.start_link(__MODULE__, opts) end @impl true diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex index 7cbc159c..882a6cca 100644 --- a/lib/hyper/node/fire_vmm/daemon.ex +++ b/lib/hyper/node/fire_vmm/daemon.ex @@ -3,26 +3,37 @@ defmodule Hyper.Node.FireVMM.Daemon do The jailed firecracker OS process for one microVM, as a static child of `Hyper.Node.FireVMM.Core`. - Lifecycle is supervisor-owned. On every (re)start it first resets any stale - jail left by a prior incarnation - the firecracker jailer refuses to reuse an - existing chroot - then builds the jailer command and runs it under - `MuonTrap.Daemon`, which kills the OS process when its port closes (controller - crash, container teardown, or BEAM death). So no firecracker process outlives - the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a - firecracker crash) cleanly cold-boots against a fresh jail. - - The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the - reset, then delegates and returns that pid. + A `trap_exit` GenServer that owns firecracker's lifetime end to end: + + * on every (re)start it resets any stale jail left by a prior incarnation — + the firecracker jailer refuses to reuse an existing chroot — then launches + the jailer under a linked `MuonTrap.Daemon`. The supervised process is + `hyper-suidhelper jailer ...`, which `execve`s into the jailer (same pid). + * if firecracker exits, the linked `MuonTrap.Daemon` exits and this server + stops with that reason, so `Core`'s `:one_for_all` cold-boots the pair. + * on teardown it **guarantees firecracker is dead**: `MuonTrap`'s port-close + kills by process group, but the jailer `setsid`s firecracker into its own + session, so it escapes that kill and would leak (holding the cgroup, the + rootfs dm device, loop devices). `terminate/2` therefore runs the helper's + `cgroup.kill` teardown (`ChrootJail.remove`), which SIGKILLs the whole leaf + cgroup regardless of session. The same call on (re)start cleans up after a + prior incarnation the BEAM could not (a SIGKILL'd node leaves no + `terminate/2`); the periodic `Hyper.Node.Reaper` is the final backstop. """ + use GenServer + use OpenTelemetryDecorator + alias Hyper.Node.FireVMM.{Jailer, Opts} alias Hyper.SuidHelper alias Unit.Time - use OpenTelemetryDecorator + require Logger @shutdown_timeout Time.s(5) + defstruct [:opts, :muontrap] + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(%Opts{} = opts) do %{ @@ -34,20 +45,82 @@ defmodule Hyper.Node.FireVMM.Daemon do } end - @doc """ - Reset the VM's stale jail, then launch the jailer under `MuonTrap.Daemon` and - return its pid. Fails (so the supervisor retries) if the reset cannot run. - """ - @spec start_link(Opts.t()) :: {:ok, pid()} | {:error, term()} - @decorate with_span("Hyper.Node.FireVMM.Daemon.start_link", include: [:id]) - def start_link(%Opts{vm_id: id} = opts) do - with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do - cmd = Jailer.command(opts) - - case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do - {:ok, pid} -> {:ok, pid} - {:error, _} = err -> err - end + @spec start_link(Opts.t()) :: GenServer.on_start() + def start_link(%Opts{} = opts) do + GenServer.start_link(__MODULE__, opts) + end + + @impl true + @decorate with_span("Hyper.Node.FireVMM.Daemon.init", include: [:id]) + def init(%Opts{vm_id: id} = opts) do + # Trap exits so the linked MuonTrap's exit reaches `handle_info` (not a silent + # link kill) and so `terminate/2` runs on supervisor shutdown. + Process.flag(:trap_exit, true) + + with :ok <- reset_stale_jail(id), + {:ok, muontrap} <- launch(opts) do + {:ok, %__MODULE__{opts: opts, muontrap: muontrap}} + else + {:error, reason} -> {:stop, reason} + end + end + + # firecracker (the linked MuonTrap.Daemon) exited: stop with its reason so + # `Core`'s `:one_for_all` discards the controller too and cold-boots the pair. + @impl true + def handle_info({:EXIT, muontrap, reason}, %__MODULE__{muontrap: muontrap} = state) do + {:stop, reason, state} + end + + def handle_info(_msg, state), do: {:noreply, state} + + # Guarantee firecracker is dead and its jail cleared. MuonTrap cannot kill the + # setsid'd firecracker; the helper's `cgroup.kill` can. Best-effort: a failure + # here is logged, and the `Reaper` will retry, but it must not crash teardown. + @impl true + @decorate with_span("Hyper.Node.FireVMM.Daemon.terminate", include: [:id]) + def terminate(_reason, %__MODULE__{opts: %Opts{vm_id: id}}) do + case clear_jail(id) do + :ok -> + :ok + + {:error, reason} -> + Logger.error("vm #{id}: teardown failed to clear jail: #{inspect(reason)}") + end + end + + @spec reset_stale_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()} + defp reset_stale_jail(id), do: clear_jail(id) + + @spec clear_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()} + defp clear_jail(id) do + SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) + end + + @spec launch(Opts.t()) :: {:ok, pid()} | {:error, term()} + defp launch(%Opts{vm_id: id} = opts) do + cmd = Jailer.command(opts) + + # Surface what the jailed process actually does: `log_output` routes the + # helper/jailer/firecracker stdout+stderr (guest serial console included) + # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque + # `:error_exit_status` into `{:firecracker_exited, status}` so a crash + # report names the real exit code instead of hiding it. + daemon_opts = [ + log_output: :info, + log_prefix: "vm #{id} firecracker: ", + stderr_to_stdout: true, + exit_status_to_reason: &{:firecracker_exited, &1} + ] + + case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do + {:ok, pid} -> + Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})") + {:ok, pid} + + {:error, reason} = err -> + Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}") + err end end end diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex deleted file mode 100644 index 0730c588..00000000 --- a/lib/hyper/node/fire_vmm/provider.ex +++ /dev/null @@ -1,92 +0,0 @@ -defmodule Hyper.Node.FireVMM.Provider do - @moduledoc """ - Installs the firecracker release for the current architecture into - `Hyper.Cfg.Dirs.firecracker_install_dir/0` (`/redist/firecracker`). - - `ensure_installed/0` is idempotent: if the binaries are already present and - executable it returns `:ok` without touching the network. Otherwise it fetches - the official firecracker release tarball for the detected architecture via - `Redist.Targz` (download, SHA-256 verify, extract). The archive is - extracted as-is - the binaries live under `release-v-/` exactly as - firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those - paths. - """ - - alias Redist.Targz - - @downloads %{ - x86_64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz", - sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", - firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64", - jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64" - }, - aarch64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz", - sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", - firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64", - jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64" - } - } - - @doc "Absolute path to the installed firecracker binary." - @spec firecracker_bin() :: Path.t() - def firecracker_bin, do: bin_path(:firecracker) - - @doc "Absolute path to the installed jailer binary." - @spec jailer_bin() :: Path.t() - def jailer_bin, do: bin_path(:jailer) - - @doc "Ensure the firecracker release is installed for this node." - @spec ensure_installed() :: :ok | {:error, term()} - def ensure_installed do - with {:ok, arch} <- Sys.Arch.current() do - dl = Map.fetch!(@downloads, arch) - - case check_install(dl) do - :ok -> :ok - {:error, :not_installed} -> install(dl) - {:error, :bad_install} -> reinstall(dl) - end - end - end - - # `:ok` if `dl`'s version-specific binaries are present and executable; - # `{:error, :not_installed}` if the install dir is empty/absent; otherwise - # `{:error, :bad_install}` - something is there but it's the wrong version, - # partial, or corrupt, which we cannot fix in place because `Targz` keeps - # existing files. The remedy is to wipe and reinstall. - @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install} - defp check_install(dl) do - fc = Path.join(install_dir(), dl.firecracker) - jail = Path.join(install_dir(), dl.jailer) - - cond do - Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) -> - :ok - - File.dir?(install_dir()) and File.ls!(install_dir()) != [] -> - {:error, :bad_install} - - true -> - {:error, :not_installed} - end - end - - defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir()) - - defp reinstall(dl) do - _ = File.rm_rf!(install_dir()) - install(dl) - end - - defp bin_path(key) do - {:ok, arch} = Sys.Arch.current() - dl = Map.fetch!(@downloads, arch) - Path.join(install_dir(), Map.fetch!(dl, key)) - end - - defp install_dir, do: Hyper.Cfg.Dirs.firecracker_install_dir() -end diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index 2a37c796..eab53db5 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do } @enforce_keys [:opts] - defstruct [:opts, :spec, :boot_deadline] + defstruct [:opts, :spec, :boot_deadline, api_granted: false] @type t :: %State{ opts: Opts.t(), spec: BootSpec.Cold.t() | nil, - boot_deadline: integer() | nil + boot_deadline: integer() | nil, + api_granted: boolean() } # How long to wait for the daemon's API to come up before failing the boot. @@ -54,8 +55,11 @@ defmodule Hyper.Node.FireVMM.State do %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} end - def start_link(%Opts{vm_id: id} = opts) do - :gen_statem.start_link(via(id), __MODULE__, opts, []) + # Started unnamed; the controller self-registers under `{id, :state}` from + # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves + # it cluster-wide through `via/1`. + def start_link(%Opts{} = opts) do + :gen_statem.start_link(__MODULE__, opts, []) end @spec stop(Hyper.Vm.Id.t()) :: :ok @@ -72,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do # The daemon is already (being) started by `Core` as our sibling. Read the root # device off the per-VM mutable layer, resolve the boot spec, set the readiness # deadline, and start probing the API. - def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do - spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) - deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) - data = %State{opts: opts, spec: spec, boot_deadline: deadline} - - {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + def init( + %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = + opts + ) do + case Hyper.Cluster.Routing.register_self({id, :state}) do + :ok -> + spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) + deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) + data = %State{opts: opts, spec: spec, boot_deadline: deadline} + + {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + + {:error, reason} -> + {:stop, reason} + end end # Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel + @@ -110,31 +123,81 @@ defmodule Hyper.Node.FireVMM.State do @moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`." alias Hyper.Firecracker.Api.{InstanceInfo, Operations} - alias Hyper.Node.FireVMM.{Client, Opts} + alias Hyper.Node.FireVMM.{Client, Jailer, Opts} + alias Hyper.SuidHelper.ChrootJail alias Unit.Time + require Logger + # How often to probe the daemon's API while waiting for it. @probe_interval Time.ms(50) - # Poll the daemon's API until it answers, then configure. Give up if the - # readiness deadline passes first. + # Hand the jailed API socket to the node user, then poll the daemon's API + # until it answers and advance to `:configuring`. Give up if the readiness + # deadline passes first. The grant must happen before the probe: firecracker + # creates the socket owned by the per-VM uid, so the unprivileged controller + # gets EACCES on connect until the helper chowns it to us. def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do - case Client.run(Client.via(id), &Operations.describe_instance/1) do - {:ok, %InstanceInfo{}} -> - {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} - - {:error, _reason} -> - if System.monotonic_time(:millisecond) >= data.boot_deadline do - {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data} - else - {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + case ensure_api_granted(id, data) do + {:cont, data} -> + case Client.run(Client.via(id), &Operations.describe_instance/1) do + {:ok, %InstanceInfo{}} -> + {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} + + {:error, reason} -> + keep_probing(id, data, reason) end + + {:wait, data, reason} -> + keep_probing(id, data, reason) end end def handle({:call, from}, :stop, data) do {:next_state, :stopping, data, [{:reply, from, :ok}]} end + + # Ensure the jailed API socket has been handed to the node user. Idempotent + # once granted (we record it in `data` so we ask the helper only once). + # `:socket_pending` means firecracker has not created the socket yet, so we + # keep waiting; a hard error is logged but also tolerated until the deadline + # (the probe that follows would fail with EACCES anyway and drive the stop). + @spec ensure_api_granted(Hyper.Vm.Id.t(), State.t()) :: + {:cont, State.t()} | {:wait, State.t(), term()} + defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data} + + defp ensure_api_granted(id, data) do + case ChrootJail.grant_api(Jailer.host_socket(id)) do + :ok -> + {:cont, %{data | api_granted: true}} + + {:error, :socket_pending} -> + {:wait, data, :socket_pending} + + {:error, reason} -> + Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}") + {:wait, data, {:grant_api, reason}} + end + end + + # Keep waiting for readiness, re-arming the probe timer, unless the deadline + # has lapsed - then fail the boot, surfacing `reason` rather than swallowing + # it into a bare `:daemon_unready`. A persistent failure here points at the + # host->jail socket (path or, more often, the grant/permission step above). + @spec keep_probing(Hyper.Vm.Id.t(), State.t(), term()) :: + {:keep_state, State.t(), list()} | {:stop, term(), State.t()} + defp keep_probing(id, data, reason) do + if System.monotonic_time(:millisecond) >= data.boot_deadline do + Logger.warning( + "vm #{id}: firecracker API not reachable before deadline; " <> + "last probe error: #{inspect(reason)}" + ) + + {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} + else + {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + end + end end defmodule Configuring do @@ -145,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations} alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts} + require Logger + # Stage boot artifacts into the chroot, then issue the pre-boot config and - # start the guest. + # start the guest. Both failure paths end the boot via a supervisor restart, + # so log the reason here - otherwise it vanishes into the `:one_for_all` + # cycle and the VM just appears to relaunch for no visible cause. def handle( :state_timeout, :configure, @@ -155,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do case ChrootJail.stage(id, uid, gid, spec) do {:ok, jailed_spec} -> case apply_spec(id, jailed_spec) do - :ok -> {:next_state, :running, data} - {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data} + :ok -> + Logger.info("vm #{id}: configured, guest starting") + {:next_state, :running, data} + + {:error, reason} -> + Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}") + {:stop, {:shutdown, {:boot_failed, reason}}, data} end {:error, reason} -> + Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}") {:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data} end end diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex index 7e358758..1be0bbfe 100644 --- a/lib/hyper/suid_helper/dmsetup.ex +++ b/lib/hyper/suid_helper/dmsetup.ex @@ -65,32 +65,6 @@ defmodule Hyper.SuidHelper.Dmsetup do end end - @doc "Names of the existing dm devices." - @spec list() :: {:ok, [String.t()]} | {:error, err()} - @decorate with_span("Hyper.SuidHelper.Dmsetup.list") - def list do - case SuidHelper.exec(["dmsetup", "ls"]) do - {:ok, %{"output" => out}} -> {:ok, parse_names(out)} - {:error, _} = err -> err - end - end - - @doc false - @spec parse_names(String.t()) :: [String.t()] - def parse_names(out) do - case String.trim(out) do - # `dmsetup ls` prints this sentinel (not a device row) when there are none. - "No devices found" -> - [] - - _ -> - out - |> String.split("\n", trim: true) - |> Enum.map(&(&1 |> String.split() |> List.first())) - |> Enum.reject(&is_nil/1) - end - end - @doc "Send a thin-pool `message` to dm device `name`." @spec message(String.t(), String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message]) @@ -133,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do |> MapSet.new() end + @doc "Names of every device-mapper device currently present on this host." + @spec list() :: {:ok, [String.t()]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Dmsetup.list") + def list do + case SuidHelper.exec(["dmsetup", "ls"]) do + {:ok, %{"output" => out}} -> {:ok, parse_names(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_names(String.t()) :: [String.t()] + def parse_names(out) do + case String.trim(out) do + # `dmsetup ls` prints this sentinel (not a device row) when there are none. + "No devices found" -> + [] + + _ -> + out + |> String.split("\n", trim: true) + |> Enum.map(&(&1 |> String.split() |> List.first())) + |> Enum.reject(&is_nil/1) + end + end + @doc false @spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t() def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do diff --git a/native/suidhelper/src/tools/dmsetup/mod.rs b/native/suidhelper/src/tools/dmsetup/mod.rs index cdbc737e..d77674df 100644 --- a/native/suidhelper/src/tools/dmsetup/mod.rs +++ b/native/suidhelper/src/tools/dmsetup/mod.rs @@ -52,11 +52,11 @@ enum DmOp { #[arg(long)] message: ThinMessage, }, - /// List the names of existing dm devices (for stale-device reclaim). - Ls, /// List the target types the kernel device-mapper exposes. Read-only, but /// still needs root: it opens `/dev/mapper/control`. Targets, + /// List the names of existing dm devices (for stale-device reclaim). + Ls, } #[derive(Serialize)] @@ -65,8 +65,8 @@ pub enum DmsetupOut { Created { device: PathBuf }, Removed, Messaged, - Listed { output: String }, Targets { output: String }, + Listed { output: String }, } pub struct Dmsetup { @@ -112,12 +112,12 @@ impl IsTool for Dmsetup { .arg("0") .arg(message.to_string()); } - DmOp::Ls => { - cmd.arg("ls"); - } DmOp::Targets => { cmd.arg("targets"); } + DmOp::Ls => { + cmd.arg("ls"); + } } cmd.env_clear().output() @@ -137,10 +137,10 @@ impl IsTool for Dmsetup { }, DmOp::Remove { .. } => DmsetupOut::Removed, DmOp::Message { .. } => DmsetupOut::Messaged, - DmOp::Ls => DmsetupOut::Listed { + DmOp::Targets => DmsetupOut::Targets { output: String::from_utf8_lossy(&out.stdout).into_owned(), }, - DmOp::Targets => DmsetupOut::Targets { + DmOp::Ls => DmsetupOut::Listed { output: String::from_utf8_lossy(&out.stdout).into_owned(), }, })