From 13c75c5f3a0523efe24737a11d8dc103974e799e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 21:00:22 +0000 Subject: [PATCH 1/2] feat(suidhelper): run the firecracker jailer through the setuid helper The BEAM no longer invokes the jailer binary directly. It calls the helper with a new `jailer` subcommand passing only untrusted-origin values (--id, --uid, --gid, repeated --cgroup KEY=VALUE, --api-sock); the helper reads the firecracker/jailer binary paths, chroot base, parent cgroup and accepted uid/gid band from its trusted /etc/hyper/config.toml, validates the caller's args, re-acquires permanent root, and execve's the jailer in place (same pid, so MuonTrap.Daemon keeps supervising it). Config: the helper's tool paths move into a `[tools]` table (joining the existing dmsetup/losetup/blockdev with new firecracker/jailer keys) and jail placement into `[jails]` (cgroup, uid_gid_range) -- matching the namespaces the Elixir node already reads. firecracker/jailer have no default and surface BinError::Unconfigured when absent; a present uid_gid_range with min==0 or min>max is fatal at load. Adds setuid_privileged::become_root_permanently (seal real+effective+saved uid/gid to 0, reset supplementary groups) for the execve handoff -- no Drop guard because execve replaces the image and the jailer drops its own privs. jailer.rs validates every caller value with refusal-first proptests (vm_id, id range, cgroup KEY=VALUE, api-sock shape). Elixir Jailer.command/2 emits the trimmed argv against Cfg.Tools.suidhelper(). --- lib/hyper/node/fire_vmm/jailer.ex | 54 +-- native/suidhelper/Cargo.toml | 14 +- native/suidhelper/src/config.rs | 179 ++++++++- native/suidhelper/src/main.rs | 15 +- native/suidhelper/src/tools/jailer.rs | 358 ++++++++++++++++++ native/suidhelper/src/tools/mod.rs | 1 + .../suidhelper/src/util/setuid_privileged.rs | 39 +- .../suidhelper/tests/config_uid_gid_range.rs | 59 +++ native/suidhelper/tests/e2e/argv.rs | 68 +++- native/suidhelper/tests/e2e/config.rs | 94 +++++ native/suidhelper/tests/e2e/jailer.rs | 209 ++++++++++ native/suidhelper/tests/tools/jailer.rs | 162 ++++++++ test/hyper/node/fire_vmm/jailer_test.exs | 83 ++++ 13 files changed, 1278 insertions(+), 57 deletions(-) create mode 100644 native/suidhelper/src/tools/jailer.rs create mode 100644 native/suidhelper/tests/config_uid_gid_range.rs create mode 100644 native/suidhelper/tests/e2e/jailer.rs create mode 100644 native/suidhelper/tests/tools/jailer.rs create mode 100644 test/hyper/node/fire_vmm/jailer_test.exs diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex index c9df6247..3b6deb23 100644 --- a/lib/hyper/node/fire_vmm/jailer.ex +++ b/lib/hyper/node/fire_vmm/jailer.ex @@ -1,27 +1,26 @@ defmodule Hyper.Node.FireVMM.Jailer do @moduledoc """ - Builds the firecracker - [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md) - command for one VM. + Builds the `hyper-suidhelper jailer` command for one VM. - The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance` - flags) and drops privileges, then exec's firecracker. We run the jailer (not - firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS - process, the jailer owns isolation. + The BEAM does not invoke the jailer directly. Instead it calls the setuid helper + with the `jailer` subcommand; the helper reads the firecracker binary path, chroot + base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`, + re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps + supervising it). + + This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`, + repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates + everything else; it also inserts the `--` separator between its own flags and + firecracker's flags. Because firecracker is chrooted to `///root`, the API - socket it opens at `/api.socket` lives at `host_socket` on the host - that's the + socket it opens at `/api.socket` lives at `host_socket` on the host — that's the path the controller connects to. - - Host config: paths are derived from `config :hyper, work_dir: ...`. The - firecracker + jailer binaries are installed under `/redist/firecracker` - by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`. """ use OpenTelemetryDecorator alias Hyper.Node.FireVMM - alias Hyper.Node.FireVMM.Provider alias Hyper.Vm.Instance # firecracker's API socket path *inside* the chroot. @@ -36,6 +35,8 @@ defmodule Hyper.Node.FireVMM.Jailer do first failure. """ + alias Hyper.Cfg.{Dirs, Jails} + @doc "Run every pre-requisite check, halting at the first failure." @spec run() :: :ok | {:error, term()} def run do @@ -61,7 +62,7 @@ defmodule Hyper.Node.FireVMM.Jailer do end defp parent_cgroup_present do - if Sys.Linux.Cgroup.V2.named_exists?(Hyper.Cfg.Jails.cgroup()), + if Sys.Linux.Cgroup.V2.named_exists?(Jails.cgroup()), do: :ok, else: {:error, :missing_parent_cgroup} end @@ -77,7 +78,7 @@ defmodule Hyper.Node.FireVMM.Jailer do end defp chroot_writable do - case Sys.Posix.ensure_writable_dir(Hyper.Cfg.Dirs.chroot_base()) do + case Sys.Posix.ensure_writable_dir(Dirs.chroot_base()) do {:ok} -> :ok {:error, reason} -> {:error, {:chroot_base_unavailable, reason}} end @@ -92,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do @spec command(FireVMM.Opts.t()) :: t() def command(opts) do args = - [ - "--id", - opts.vm_id, - "--exec-file", - Provider.firecracker_bin(), - "--uid", - to_string(opts.uid), - "--gid", - to_string(opts.gid), - "--chroot-base-dir", - Hyper.Cfg.Dirs.chroot_base(), - "--cgroup-version", - "2", - "--parent-cgroup", - Hyper.Cfg.Jails.cgroup() - ] ++ + ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++ cgroup_flags(opts.type) ++ - ["--", "--api-sock", "/" <> @jail_socket] + ["--api-sock", "/" <> @jail_socket] - %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)} + %{binary: Hyper.Cfg.Tools.suidhelper(), args: args, host_socket: host_socket(opts.vm_id)} end # Find the appropriate jailer cgroup flags for the given instance type. @@ -180,5 +166,5 @@ defmodule Hyper.Node.FireVMM.Jailer do ]) end - defp exec_name, do: Path.basename(Provider.firecracker_bin()) + defp exec_name, do: Path.basename(Hyper.Cfg.Tools.firecracker()) end diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index fbb4524a..be6a449c 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs" name = "tools_dmsetup_parsers" path = "tests/tools/dmsetup_parsers.rs" +[[test]] +name = "tools_jailer" +path = "tests/tools/jailer.rs" + [[test]] name = "util_confinement" path = "tests/util/confinement.rs" @@ -72,14 +76,22 @@ path = "tests/e2e/config.rs" name = "e2e_argv" path = "tests/e2e/argv.rs" +[[test]] +name = "e2e_jailer" +path = "tests/e2e/jailer.rs" + [[test]] name = "e2e_chroot_jail" path = "tests/e2e/chroot_jail.rs" +[[test]] +name = "config_uid_gid_range" +path = "tests/config_uid_gid_range.rs" + [dependencies] clap = { version = "4", features = ["derive"] } hyper-suidhelper-meta = { path = "meta" } -nix = { version = "0.29", features = ["user", "fs", "dir"] } +nix = { version = "0.29", features = ["user", "fs", "dir", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "1" diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 2c94ba03..41b00a75 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -1,5 +1,12 @@ // SPDX-License-Identifier: AGPL-3.0-only //! Runtime host configuration, read from a single root-owned TOML file. +//! +//! ## UID/GID range divergence +//! +//! Elixir keeps a default `{900_000, 999_999}` that governs which UIDs the node +//! hands *out*; this helper reads `[jails] uid_gid_range` from config.toml to +//! decide which UIDs it *accepts* (default `{900_000, 999_999}` when the key is +//! absent). Operators narrowing the range must set **both**. use crate::util::safe_bin::{self, SafeBin}; use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile}; @@ -22,11 +29,56 @@ pub enum LoadingError { Malformed(PathBuf), #[error("work_dir in {0:?} must be an absolute path")] Relative(PathBuf), + #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")] + BadUidGidRange { min: u32, max: u32 }, +} + +/// Error returned by config accessors for tool binaries derived from config. +#[derive(Debug, Error)] +pub enum BinError { + #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")] + Unconfigured(&'static str), + #[error(transparent)] + Bin(#[from] safe_bin::Error), } const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml"; const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH"; +/// UID/GID allocation band, read from `[jails] uid_gid_range` in config.toml as +/// a two-element `[min, max]` array. Controls which UIDs the helper *accepts* +/// from the BEAM — see module docs. +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(from = "[u32; 2]")] +pub struct UidGidRange { + pub min: u32, + pub max: u32, +} + +impl From<[u32; 2]> for UidGidRange { + fn from([min, max]: [u32; 2]) -> Self { + Self { min, max } + } +} + +// Band defaults match Elixir's `compile_env` allocation defaults so that an +// unconfigured helper and an unconfigured node agree out of the box. +const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999); + +/// Validate a uid_gid_range value. A present range where min==0 or min>max is +/// treated as a config trust violation — fatal at load, consistent with the +/// "present but untrusted" model. Exposed so tests can verify the contract +/// without touching the file system. +pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> { + if r.min == 0 || r.min > r.max { + return Err(LoadingError::BadUidGidRange { + min: r.min, + max: r.max, + }); + } + Ok(()) +} + /// The config file path. In production this is the fixed `/etc/hyper/config.toml`. /// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the /// secure arm is always the hardcoded path, so a release build cannot be steered. @@ -42,20 +94,65 @@ fn config_path() -> PathBuf { } /// Hyper's /etc/hyper/config.toml file format. -/// -/// The device-tool paths are read from here (never from the unprivileged -/// caller, which is why there is no `--bin` argument): the helper alone decides -/// which binary it escalates to run. Each defaults to its usual location and is -/// validated as a [`SafeBin`] before use. #[derive(Debug, Clone, Deserialize)] pub struct Config { work_dir: PathBuf, - #[serde(default = "default_dmsetup")] + #[serde(default)] + tools: Tools, + #[serde(default)] + jails: Jails, +} + +/// The `[jails]` table: how the helper places and confines each VM jail. +/// +/// `cgroup` is the parent cgroup the jailer nests every VM beneath (default +/// `"hyper"`). `uid_gid_range` is the `[min, max]` band of UIDs/GIDs the helper +/// accepts from the BEAM; absent means the built-in default. A missing `[jails]` +/// table, or any missing key within it, falls back to these defaults. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Jails { + cgroup: String, + uid_gid_range: Option, +} + +impl Default for Jails { + fn default() -> Self { + Self { + cgroup: default_parent_cgroup(), + uid_gid_range: None, + } + } +} + +/// Paths to the external binaries the helper runs, the `[tools]` table. +/// +/// The device tools (`dmsetup`, `losetup`, `blockdev`) carry built-in defaults; +/// `firecracker` and `jailer` have none and must be configured before any VM can +/// launch — their absence surfaces as [`BinError::Unconfigured`] at use time, not +/// at load. Every path is validated as a root-owned, correctly-named [`SafeBin`] +/// when accessed, never at parse time (the file is read unprivileged). A missing +/// `[tools]` table, or any missing key within it, falls back to these defaults. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Tools { dmsetup: PathBuf, - #[serde(default = "default_losetup")] losetup: PathBuf, - #[serde(default = "default_blockdev")] blockdev: PathBuf, + firecracker: Option, + jailer: Option, +} + +impl Default for Tools { + fn default() -> Self { + Self { + dmsetup: default_dmsetup(), + losetup: default_losetup(), + blockdev: default_blockdev(), + firecracker: None, + jailer: None, + } + } } // The default data root. Must match the Elixir node's `@dev_work_dir`, which it @@ -77,13 +174,17 @@ fn default_blockdev() -> PathBuf { PathBuf::from("/usr/sbin/blockdev") } +fn default_parent_cgroup() -> String { + // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync. + "hyper".into() +} + impl Default for Config { fn default() -> Self { Self { work_dir: default_work_dir(), - dmsetup: default_dmsetup(), - losetup: default_losetup(), - blockdev: default_blockdev(), + tools: Tools::default(), + jails: Jails::default(), } } } @@ -91,7 +192,7 @@ impl Default for Config { impl Config { /// The process-wide config, loaded once (and forced unprivileged by /// [`Config::init`]). An absent file yields the built-in defaults; a - /// *present but untrusted* file (wrong owner/mode, malformed) is fatal - + /// *present but untrusted* file (wrong owner/mode, malformed) is fatal — /// the helper prints the error and exits rather than trusting it. pub fn get() -> &'static Config { LazyLock::force(&CONFIG) @@ -100,7 +201,7 @@ impl Config { /// Force the config to load now. Call this once at the very start of `main`, /// after privileges have already been dropped (the `.preinit_array` entry in /// `setuid_privileged` runs before `main`), so the file is never first read - /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read + /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read /// as the real uid, not as root. pub fn init() { let _ = Self::get(); @@ -118,17 +219,56 @@ impl Config { /// The validated `dmsetup` binary the helper will run. pub fn dmsetup(&self) -> Result, safe_bin::Error> { - SafeBin::from_path(&self.dmsetup) + SafeBin::from_path(&self.tools.dmsetup) } /// The validated `losetup` binary the helper will run. pub fn losetup(&self) -> Result, safe_bin::Error> { - SafeBin::from_path(&self.losetup) + SafeBin::from_path(&self.tools.losetup) } /// The validated `blockdev` binary the helper will run. pub fn blockdev(&self) -> Result, safe_bin::Error> { - SafeBin::from_path(&self.blockdev) + SafeBin::from_path(&self.tools.blockdev) + } + + /// The Firecracker VMM binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set `[tools] firecracker` before any VM can be launched. + pub fn firecracker(&self) -> Result, BinError> { + self.tools + .firecracker + .as_deref() + .ok_or(BinError::Unconfigured("firecracker")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The Firecracker jailer binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set `[tools] jailer` before any VM can be launched. + pub fn jailer(&self) -> Result, BinError> { + self.tools + .jailer + .as_deref() + .ok_or(BinError::Unconfigured("jailer")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The jailer `--parent-cgroup` value, from `[jails] cgroup`. Defaults to + /// `"hyper"`, matching the Elixir node's default. + pub fn parent_cgroup(&self) -> &str { + &self.jails.cgroup + } + + /// The UID/GID band the helper accepts from the BEAM. Defaults to + /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults). + /// A present range with min==0 or min>max is rejected at load time by + /// [`Config::safe_load`], so this accessor is always total. + pub fn uid_gid_range(&self) -> (u32, u32) { + self.jails + .uid_gid_range + .map(|r| (r.min, r.max)) + .unwrap_or(DEFAULT_UID_GID) } /// Read, ownership-check, parse, and validate the config file. See the module @@ -143,7 +283,7 @@ impl Config { Ok(file) => file, // A genuinely-absent file means "use the built-in defaults": those // are compiled into this root-owned binary, so they are trusted. Any - // OTHER failure - a present but wrong-owner/mode file, an I/O error - + // OTHER failure — a present but wrong-owner/mode file, an I/O error — // stays fatal, because it is a signal (someone put an untrusted file // there), not an absence. Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => { @@ -160,6 +300,11 @@ impl Config { if !config.work_dir.is_absolute() { return Err(LoadingError::Relative(path)); } + + if let Some(r) = &config.jails.uid_gid_range { + validate_uid_gid_range(r)?; + } + Ok(config) } } diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs index 617fd6d5..11b08319 100644 --- a/native/suidhelper/src/main.rs +++ b/native/suidhelper/src/main.rs @@ -15,6 +15,7 @@ use clap::{Parser, Subcommand}; use hyper_suidhelper::config; +use hyper_suidhelper::tools::jailer::{self, JailerArgs}; use hyper_suidhelper::tools::Tool; use hyper_suidhelper::util::setuid_privileged::{self, Privileged}; use serde::Serialize; @@ -37,6 +38,9 @@ enum Command { Tool(Tool), /// Check the helper is correctly installed (can promote to root). SysTest, + /// Validate the caller's args, become root, and `execve` the firecracker + /// jailer in our place. Prints nothing on success (the image is replaced). + Jailer(JailerArgs), /// Print the build version and BLAKE3 checksum of this binary. Version, } @@ -101,12 +105,21 @@ fn main() { config::Config::init(); // Each command yields a serializable value (errors stringified to unify); we - // render the final JSON line here. + // render the final JSON line here. The jailer is the exception: it `execve`s + // in place, so on success it never returns and never emits JSON, and on + // failure it reports to stderr and exits before reaching the JSON pipeline. let output = match command { Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()), Command::SysTest => SysTest::perform() .map(Output::SysTest) .map_err(|e| e.to_string()), + Command::Jailer(args) => { + let e = jailer::run(args).expect_err("jailer::run only returns on error"); + eprintln!("{e}"); + // _exit bypasses atexit handlers; safe because we are permanently root + // at this point and must not run any cleanup registered before escalation. + unsafe { nix::libc::_exit(2) } + } Command::Version => unreachable!("handled above"), }; diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs new file mode 100644 index 00000000..ec29ac8a --- /dev/null +++ b/native/suidhelper/src/tools/jailer.rs @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! The `jailer` subcommand: validate the BEAM's arguments, re-acquire root +//! permanently, and `execve` the firecracker jailer in our place. +//! +//! Unlike the device tools this is **not** an [`crate::tools::IsTool`]: it does +//! not run a child and parse JSON, it *becomes* the jailer via `execve`, so the +//! unprivileged BEAM's MuonTrap port keeps supervising the resulting process +//! across the image replacement. There is no output and no return on success. +//! +//! Threat model: the BEAM is untrusted. It supplies only `--id`, `--uid`, +//! `--gid`, repeated `--cgroup KEY=VALUE`, and `--api-sock`. Every privileged +//! path (the jailer binary, the firecracker `--exec-file`, the chroot base, the +//! parent cgroup) comes from the root-owned config, never the caller. The +//! refusal contracts on the newtypes below are the security core: a compromised +//! BEAM must not be able to name a privileged path, request uid 0, traverse out +//! of the chroot base, inject a flag, or smuggle an environment/fd into root. +//! +//! ## Validator laws (property-tested in `tests/tools/jailer.rs`) +//! - [`validate_id_number`] accepts iff `n != 0 && lo <= n <= hi`; 0 is rejected +//! for *every* range (uid 0 makes the jailer skip its privilege drop). +//! - [`VmId`] round-trips exactly the allowed charset/length and rejects any +//! separator, dot, NUL, whitespace, leading dash, empty, or over-long input. +//! - [`CgroupSetting`] re-renders a valid pair to its canonical `key=value` and +//! rejects unknown keys and values outside the per-key grammar. +//! - [`JailSock`] accepts exactly `/` + one filename and rejects multi-component, +//! relative, `..`, and NUL/whitespace inputs. + +use crate::config::{BinError, Config}; +use crate::util::setuid_privileged; +use clap::Args; +use nix::errno::Errno; +use std::ffi::CString; +use std::fmt; +use std::os::unix::ffi::OsStrExt; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use thiserror::Error as ThisError; + +/// The jailer protects at most a handful of controllers; an unbounded `--cgroup` +/// list is a caller trying something. Cap it well above any legitimate need. +const MAX_CGROUPS: usize = 16; + +/// `sun_path` in `sockaddr_un` is 108 bytes on Linux; a socket path longer than +/// that can never be bound, so reject it up front. +const MAX_SOCK_LEN: usize = 108; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("invalid --id {0:?}: must be 1..=64 chars of [A-Za-z0-9_-], not starting with '-'")] + VmId(String), + #[error("invalid --cgroup {0:?}: unknown key or value outside its grammar")] + Cgroup(String), + #[error("invalid --api-sock {0:?}: must be / with name in [A-Za-z0-9_.-]")] + Sock(String), + #[error("--uid/--gid {value} is zero or outside the configured range [{lo}, {hi}]")] + IdNumber { value: u32, lo: u32, hi: u32 }, + #[error("too many --cgroup settings: {0} (max {MAX_CGROUPS})")] + TooManyCgroups(usize), + #[error(transparent)] + Bin(#[from] BinError), + #[error(transparent)] + Privilege(#[from] setuid_privileged::Error), + #[error("argument contains an interior NUL byte")] + NulArgument, + #[error("execve {path:?} failed: {errno}")] + Exec { path: PathBuf, errno: Errno }, +} + +/// `n != 0 && lo <= n <= hi`. uid/gid 0 is rejected unconditionally: a jailer run +/// with uid 0 skips its privilege drop and leaves firecracker running as root. +pub fn validate_id_number(n: u32, range: (u32, u32)) -> Result { + let (lo, hi) = range; + if n != 0 && lo <= n && n <= hi { + Ok(n) + } else { + Err(Error::IdNumber { value: n, lo, hi }) + } +} + +/// A VM id used as a chroot subdirectory name: `[A-Za-z0-9_-]`, length `1..=64`, +/// first character not `-` (so it can never be read as a flag). No `/`, `.`, NUL, +/// or whitespace can appear, so it can never traverse out of the chroot base. +#[derive(Debug, Clone)] +pub struct VmId(String); + +impl FromStr for VmId { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::VmId(s.to_string()); + let bytes = s.as_bytes(); + if !(1..=64).contains(&bytes.len()) { + return Err(reject()); + } + if bytes[0] == b'-' { + return Err(reject()); + } + if !bytes + .iter() + .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for VmId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +/// `1..=20` ASCII digits — bounds a numeric cgroup limit without pulling in a +/// regex engine. The upper bound comfortably exceeds `u64::MAX`'s 20 digits. +fn is_digits_1_20(s: &str) -> bool { + !s.is_empty() && s.len() <= 20 && s.bytes().all(|b| b.is_ascii_digit()) +} + +/// A single `KEY=VALUE` cgroup setting from an allowlist. The helper re-emits +/// `key=value` itself from the canonical key, so the jailer never sees the +/// caller's raw bytes. Per-key value grammar: +/// - `memory.max` : `[0-9]{1,20}` or the literal `max` +/// - `cpu.max` : `[0-9]{1,20} [0-9]{1,20}` or `max [0-9]{1,20}` +#[derive(Debug, Clone)] +pub struct CgroupSetting { + key: &'static str, + value: String, +} + +impl FromStr for CgroupSetting { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Cgroup(s.to_string()); + // Split on the FIRST `=`. None of the value grammars contains a `=`, so a + // second `=` lands in `value` and is rejected by the grammar check below. + let (raw_key, value) = s.split_once('=').ok_or_else(reject)?; + + let key: &'static str = match raw_key { + "memory.max" => "memory.max", + "cpu.max" => "cpu.max", + _ => return Err(reject()), + }; + + let valid = match key { + "memory.max" => value == "max" || is_digits_1_20(value), + "cpu.max" => match value.split_once(' ') { + Some((quota, period)) => { + (quota == "max" || is_digits_1_20(quota)) && is_digits_1_20(period) + } + None => false, + }, + _ => false, + }; + + if valid { + Ok(Self { + key, + value: value.to_string(), + }) + } else { + Err(reject()) + } + } +} + +impl fmt::Display for CgroupSetting { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// The firecracker API socket path: an absolute path that is exactly `/` plus one +/// filename in `[A-Za-z0-9_.-]`. The charset excludes `/`, so the value is always +/// a direct child of `/` with no extra components and no traversal; `.`/`..` as +/// the whole filename are rejected explicitly. +#[derive(Debug, Clone)] +pub struct JailSock(String); + +impl FromStr for JailSock { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Sock(s.to_string()); + if s.len() > MAX_SOCK_LEN { + return Err(reject()); + } + let name = s.strip_prefix('/').ok_or_else(reject)?; + if name.is_empty() || name == "." || name == ".." { + return Err(reject()); + } + if !name + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for JailSock { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +#[derive(Args)] +pub struct JailerArgs { + /// Microvm id; becomes the chroot subdirectory name. + #[arg(long)] + id: VmId, + /// Unprivileged uid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + uid: u32, + /// Unprivileged gid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + gid: u32, + /// Repeatable `KEY=VALUE` cgroup setting from the allowlist. + #[arg(long = "cgroup")] + cgroup: Vec, + /// Absolute firecracker API socket path (single filename under `/`). + #[arg(long = "api-sock")] + api_sock: JailSock, +} + +fn cstr_bytes(bytes: &[u8]) -> Result { + CString::new(bytes).map_err(|_| Error::NulArgument) +} + +fn cstr_str(s: &str) -> Result { + cstr_bytes(s.as_bytes()) +} + +fn cstr_path(p: &Path) -> Result { + cstr_bytes(p.as_os_str().as_bytes()) +} + +/// Build the exact argv handed to the jailer. argv[0] is the jailer path. The +/// caller never names the jailer, the `--exec-file`, the chroot base, the cgroup +/// version, or the parent cgroup — those are derived from trusted config here. +#[allow(clippy::too_many_arguments)] +fn build_argv( + jailer: &Path, + id: &VmId, + firecracker: &Path, + uid: u32, + gid: u32, + jail_base: &Path, + parent_cgroup: &str, + cgroups: &[CgroupSetting], + api_sock: &JailSock, +) -> Result, Error> { + let mut argv = vec![ + cstr_path(jailer)?, + cstr_str("--id")?, + cstr_str(&id.to_string())?, + cstr_str("--exec-file")?, + cstr_path(firecracker)?, + cstr_str("--uid")?, + cstr_str(&uid.to_string())?, + cstr_str("--gid")?, + cstr_str(&gid.to_string())?, + cstr_str("--chroot-base-dir")?, + cstr_path(jail_base)?, + cstr_str("--cgroup-version")?, + cstr_str("2")?, + cstr_str("--parent-cgroup")?, + cstr_str(parent_cgroup)?, + ]; + + for cg in cgroups { + argv.push(cstr_str("--cgroup")?); + argv.push(cstr_str(&cg.to_string())?); + } + + argv.push(cstr_str("--")?); + argv.push(cstr_str("--api-sock")?); + argv.push(cstr_str(&api_sock.to_string())?); + + Ok(argv) +} + +/// Close every inherited fd above stdio so a compromised BEAM cannot smuggle an +/// open fd into the root jailer. Keep 0/1/2: MuonTrap supervises the jailer +/// through stdio, and stderr carries our exec-failure message. `close_range(2)` +/// needs Linux 5.9+; on any failure (ENOSYS or otherwise) we fall back to +/// closing each fd individually — fail closed before handing root to the jailer. +fn close_inherited_fds() { + const FIRST: u32 = 3; + // SAFETY: raw syscall with no memory operands; closing fds has no UB. + let rc = unsafe { nix::libc::close_range(FIRST, u32::MAX, 0) }; + if rc == 0 { + return; + } + + // SAFETY: sysconf is a pure query of a system limit. + let max = unsafe { nix::libc::sysconf(nix::libc::_SC_OPEN_MAX) }; + let max = if max < 0 { 4096 } else { max as i32 }; + for fd in (FIRST as i32)..max { + // SAFETY: closing an arbitrary fd is safe; EBADF for unused fds is ignored. + unsafe { + nix::libc::close(fd); + } + } +} + +/// Validate the caller's args, then permanently become root and `execve` the +/// jailer. On success this never returns (the process image is replaced); the +/// `Ok` arm is therefore [`std::convert::Infallible`]. Every failure is returned +/// as [`Error`] for the caller to print and exit non-zero. +pub fn run(args: JailerArgs) -> Result { + let config = Config::get(); + + // Resolve everything that can fail as the REAL uid, before any privilege is + // raised: config accessors, binary validation, range, and arg validation. + let jailer: PathBuf = config.jailer()?.into(); + let firecracker: PathBuf = config.firecracker()?.into(); + let jail_base = config.jail_base(); + let parent_cgroup = config.parent_cgroup(); + let range = config.uid_gid_range(); + + let uid = validate_id_number(args.uid, range)?; + let gid = validate_id_number(args.gid, range)?; + + if args.cgroup.len() > MAX_CGROUPS { + return Err(Error::TooManyCgroups(args.cgroup.len())); + } + + let argv = build_argv( + &jailer, + &args.id, + &firecracker, + uid, + gid, + &jail_base, + parent_cgroup, + &args.cgroup, + &args.api_sock, + )?; + let jailer_cstr = cstr_path(&jailer)?; + + // Point of no return: from here we are permanently root and must execve. + setuid_privileged::become_root_permanently()?; + close_inherited_fds(); + + // Empty envp: once ruid==0 the kernel clears AT_SECURE, so a smuggled + // LD_PRELOAD/LD_LIBRARY_PATH would be honored by the dynamic loader and + // hijack the root jailer. We pass nothing and let the jailer build its own. + let empty_env: [CString; 0] = []; + let errno = nix::unistd::execve(&jailer_cstr, &argv, &empty_env) + .expect_err("execve only returns on failure"); + Err(Error::Exec { + path: jailer, + errno, + }) +} diff --git a/native/suidhelper/src/tools/mod.rs b/native/suidhelper/src/tools/mod.rs index 24a2be42..9f9fedae 100644 --- a/native/suidhelper/src/tools/mod.rs +++ b/native/suidhelper/src/tools/mod.rs @@ -7,6 +7,7 @@ mod blockdev; pub mod chroot_jail; mod dmsetup; +pub mod jailer; mod losetup; pub use blockdev::{Blockdev, BlockdevArgs}; diff --git a/native/suidhelper/src/util/setuid_privileged.rs b/native/suidhelper/src/util/setuid_privileged.rs index 3c58bd39..e334b115 100644 --- a/native/suidhelper/src/util/setuid_privileged.rs +++ b/native/suidhelper/src/util/setuid_privileged.rs @@ -12,17 +12,20 @@ //! constructor nor a Drop impl can report an error, and silently staying root //! would defeat the point. -use nix::unistd::{getuid, seteuid, Uid}; +use nix::unistd::{getuid, seteuid, setgroups, setresgid, setresuid, Gid, Uid}; use thiserror::Error as ThisError; -/// Failures of the privilege transitions. Both are fatal: if we can't raise we -/// aren't installed setuid root, and if we can't lower we refuse to keep running. +/// Failures of the privilege transitions. All fatal: if we can't raise we aren't +/// installed setuid root, if we can't lower we refuse to keep running, and if we +/// can't seal permanent root we refuse to hand off to the execve target. #[derive(Debug, ThisError)] pub enum Error { #[error("not installed setuid root")] NotSetuidRoot, #[error("failed to drop privileges")] DropPrivileges, + #[error("failed to acquire permanent root for execve handoff")] + PermanentRoot, } /// `.preinit_array` runs before `.init_array` and before any shared-library @@ -76,6 +79,36 @@ impl Privileged { } } +/// Re-acquire full root **permanently** for an `execve` handoff and return — +/// there is deliberately no [`Privileged`] Drop guard, because `execve` replaces +/// the entire process image: nothing of this process survives to run a destructor, +/// and the new image (the firecracker jailer) is responsible for dropping its own +/// privileges. We must hand it a *genuine* root process (all of real, effective +/// and saved uid == 0) so the jailer's own privilege-drop is the real thing. +/// +/// Order matters and each step needs the privilege the previous one preserves: +/// 1. `seteuid(0)` — regain effective root; without it the rest are EPERM. +/// 2. `setresgid(0,0,0)` — set every gid to root *before* touching uids, while +/// we still hold euid 0 (gid changes require privilege). +/// 3. `setgroups([0])` — `drop_to_real` only lowered euid; it never touched the +/// caller's supplementary groups, so the jailer would otherwise inherit them. +/// Reset to just {0} now, while still privileged. +/// 4. `setresuid(0,0,0)` LAST — this seals real+effective+saved uid to root. It +/// goes last because once the saved uid is 0 there is no escape hatch left +/// (which is the point: permanent), and because it must follow the gid/group +/// changes that needed our euid-0 to be permitted. +pub fn become_root_permanently() -> Result<(), Error> { + let root_uid = Uid::from_raw(0); + let root_gid = Gid::from_raw(0); + + seteuid(root_uid).map_err(|_| Error::NotSetuidRoot)?; + setresgid(root_gid, root_gid, root_gid).map_err(|_| Error::PermanentRoot)?; + setgroups(&[root_gid]).map_err(|_| Error::PermanentRoot)?; + setresuid(root_uid, root_uid, root_uid).map_err(|_| Error::PermanentRoot)?; + + Ok(()) +} + impl Drop for Privileged { fn drop(&mut self) { if lower().is_err() { diff --git a/native/suidhelper/tests/config_uid_gid_range.rs b/native/suidhelper/tests/config_uid_gid_range.rs new file mode 100644 index 00000000..1a6f4647 --- /dev/null +++ b/native/suidhelper/tests/config_uid_gid_range.rs @@ -0,0 +1,59 @@ +//! Properties of the `uid_gid_range` configuration field. +//! +//! Contracts under test: +//! - A valid range (min >= 1, min <= max) is always accepted. +//! - Absent range yields the built-in default (900_000, 999_999). +//! - A valid range round-trips through TOML deserialization + uid_gid_range(). +//! - min == 0 is always rejected (uid 0 means root; the jailer must never +//! receive it — it skips its privilege drop when uid == 0). +//! - min > max is always rejected (incoherent range; likely a config typo). + +use hyper_suidhelper::config::{validate_uid_gid_range, Config, LoadingError, UidGidRange}; +use proptest::prelude::*; + +#[test] +fn absent_range_yields_default() { + assert_eq!(Config::default().uid_gid_range(), (900_000, 999_999)); +} + +proptest! { + #[test] + fn valid_range_accepted(min in 1u32.., delta in 0u32..) { + // max = min + delta, saturating so it never wraps past u32::MAX. + let max = min.saturating_add(delta); + let r = UidGidRange { min, max }; + prop_assert!(validate_uid_gid_range(&r).is_ok()); + } + + #[test] + fn valid_range_round_trips_via_toml(min in 1u32.., delta in 0u32..) { + let max = min.saturating_add(delta); + let body = format!( + "work_dir = \"/srv/hyper\"\n[jails]\nuid_gid_range = [{min}, {max}]\n" + ); + let config: Config = toml::from_str(&body).expect("valid TOML"); + prop_assert_eq!(config.uid_gid_range(), (min, max)); + } + + #[test] + fn zero_min_always_rejected(max in 0u32..) { + let r = UidGidRange { min: 0, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { min: 0, .. }) + ); + prop_assert!(rejected); + } + + #[test] + fn min_exceeds_max_always_rejected(max in 0u32..u32::MAX) { + // min = max + 1 is always strictly greater than max and always >= 1. + let min = max + 1; + let r = UidGidRange { min, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { .. }) + ); + prop_assert!(rejected); + } +} diff --git a/native/suidhelper/tests/e2e/argv.rs b/native/suidhelper/tests/e2e/argv.rs index 519d664e..25c5981a 100644 --- a/native/suidhelper/tests/e2e/argv.rs +++ b/native/suidhelper/tests/e2e/argv.rs @@ -36,7 +36,8 @@ fn install_fake(dir: &Path, basename: &str, record: &Path, stdout_line: &str) -> /// caller argument. fn write_root_config(dir: &Path, bins: &[(&str, &Path)]) -> PathBuf { let p = dir.join("config.toml"); - let mut body = String::from("work_dir = \"/srv/hyper\"\n"); + // Every key here is a tool name, so they live under the `[tools]` table. + let mut body = String::from("work_dir = \"/srv/hyper\"\n[tools]\n"); for (key, path) in bins { body.push_str(&format!("{key} = \"{}\"\n", path.display())); } @@ -174,6 +175,71 @@ fn dmsetup_targets_argv_and_parse_as_root() { assert_eq!(json["output"], "snapshot v1.16.0\n"); } +#[test] +fn dmsetup_ls_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_ls: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake(tmp.path(), "dmsetup", &rec, "hyper-thinpool\\nhyper-rw-abc"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "ls"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["ls"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!(json["output"], "hyper-thinpool\nhyper-rw-abc\n"); +} + +#[test] +fn losetup_list_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP losetup_list: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake( + tmp.path(), + "losetup", + &rec, + "/dev/loop0 /srv/hyper/scratch/thinpool.meta", + ); + let cfg = write_root_config(tmp.path(), &[("losetup", &bin)]); + + let out = run(&cfg, &["losetup", "list"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!( + recorded_argv(&rec), + vec![ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE" + ] + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!( + json["output"], + "/dev/loop0 /srv/hyper/scratch/thinpool.meta\n" + ); +} + #[test] fn dmsetup_rejects_configured_bin_with_wrong_basename_as_root() { if !is_root() { diff --git a/native/suidhelper/tests/e2e/config.rs b/native/suidhelper/tests/e2e/config.rs index 100a686a..ea36b4e9 100644 --- a/native/suidhelper/tests/e2e/config.rs +++ b/native/suidhelper/tests/e2e/config.rs @@ -3,6 +3,8 @@ //! tempfile instead of /etc/hyper, so tests never touch the real host. #![cfg(feature = "insecure_test_seams")] +use hyper_suidhelper::config::{BinError, Config}; +use hyper_suidhelper::util::safe_bin; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -126,3 +128,95 @@ fn valid_config_and_setuid_yields_sys_test_ok_as_root() { assert_eq!(json["sys_test"], "ok"); assert_eq!(json["hyper_base"], "/srv/hyper"); } + +#[test] +fn firecracker_unconfigured_when_absent() { + // Config::default() has firecracker == None; the accessor must signal this + // distinctly so callers can report a missing-configuration error rather than + // a safe_bin validation error. + let err = Config::default() + .firecracker() + .expect_err("absent firecracker must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("firecracker")), + "expected Unconfigured(\"firecracker\"), got {err:?}", + ); +} + +#[test] +fn jailer_unconfigured_when_absent() { + let err = Config::default() + .jailer() + .expect_err("absent jailer must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("jailer")), + "expected Unconfigured(\"jailer\"), got {err:?}", + ); +} + +#[test] +fn jailer_basename_mismatch_rejected() { + // The basename check in SafeBin::from_path precedes the stat, so we do not + // need a real file — any absolute path with the wrong leaf name is enough. + let body = "work_dir = \"/srv/hyper\"\n[tools]\njailer = \"/usr/local/bin/not-jailer\"\n"; + let config: Config = toml::from_str(body).unwrap(); + let err = config + .jailer() + .expect_err("wrong-basename jailer path must be rejected"); + assert!( + matches!(err, BinError::Bin(safe_bin::Error::Name { .. })), + "expected a Name error, got {err:?}", + ); +} + +#[test] +fn firecracker_and_jailer_return_ok_when_root_owned_as_root() { + if !is_root() { + eprintln!("SKIP firecracker_jailer_configured: needs root to create root-owned binaries"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let fc = tmp.path().join("firecracker"); + let jr = tmp.path().join("jailer"); + // 0o755: root-owned, not group/other-writable — satisfies SafeBin's checks. + for p in [&fc, &jr] { + fs::write(p, b"#!/bin/sh\n").unwrap(); + fs::set_permissions(p, fs::Permissions::from_mode(0o755)).unwrap(); + } + let body = format!( + "work_dir = \"/srv/hyper\"\n[tools]\nfirecracker = \"{}\"\njailer = \"{}\"\n", + fc.display(), + jr.display(), + ); + let config: Config = toml::from_str(&body).unwrap(); + assert!( + config.firecracker().is_ok(), + "root-owned firecracker with correct basename must be accepted" + ); + assert!( + config.jailer().is_ok(), + "root-owned jailer with correct basename must be accepted" + ); +} + +#[test] +fn bad_uid_gid_range_exits_2_as_root() { + if !is_root() { + eprintln!("SKIP bad_uid_gid_range: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // min = 0 is the clearest violation: uid 0 is root, which the jailer must + // never receive because it skips its privilege drop when uid == 0. + let p = write_root_config( + tmp.path(), + "work_dir = \"/srv/hyper\"\n[jails]\nuid_gid_range = [0, 100]\n", + ); + let out = run_with_config(&p, &["sys-test"]); + assert_eq!(out.status.code(), Some(2)); + let err = String::from_utf8_lossy(&out.stderr); + assert!( + err.contains("uid_gid_range"), + "expected a uid_gid_range error in stderr, got: {err}", + ); +} diff --git a/native/suidhelper/tests/e2e/jailer.rs b/native/suidhelper/tests/e2e/jailer.rs new file mode 100644 index 00000000..ba45468c --- /dev/null +++ b/native/suidhelper/tests/e2e/jailer.rs @@ -0,0 +1,209 @@ +//! L4 for the jailer handoff: prove the exact argv (and an EMPTY environment) the +//! helper hands to the jailer via `execve`, and that the jailer's exit status +//! propagates. We point the config's `jailer` at a root-owned recorder that +//! writes its argv and its `/proc/self/environ` to files, then exits with a known +//! code. Root-only: `become_root_permanently` requires we are already root. +#![cfg(feature = "insecure_test_seams")] + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::process::Command; + +const HELPER: &str = env!("CARGO_BIN_EXE_hyper-suidhelper"); +const RECORDER_EXIT: i32 = 7; + +fn is_root() -> bool { + nix::unistd::geteuid().is_root() +} + +fn cat_bin() -> &'static str { + ["/bin/cat", "/usr/bin/cat"] + .into_iter() + .find(|p| Path::new(p).exists()) + .expect("a `cat` binary for the recorder") +} + +/// Install a root-owned recorder named `jailer` that writes its argv (minus +/// argv[0]) as a JSON array to `argv_rec` and copies its `/proc/self/environ` to +/// `env_rec`, then exits `RECORDER_EXIT`. Paths are baked into the script text +/// because the recorder runs with an empty environment and absolute `cat` so it +/// needs no `PATH`. Returns the recorder's absolute path. +fn install_recorder(dir: &Path, argv_rec: &Path, env_rec: &Path) -> PathBuf { + let path = dir.join("jailer"); + let script = format!( + "#!/bin/sh\n\ + (\n printf '['\n sep=''\n for a in \"$@\"; do printf '%s\"%s\"' \"$sep\" \"$a\"; sep=','; done\n printf ']'\n) > '{argv}'\n\ + {cat} /proc/self/environ > '{env}'\n\ + exit {code}\n", + argv = argv_rec.display(), + cat = cat_bin(), + env = env_rec.display(), + code = RECORDER_EXIT, + ); + fs::write(&path, script).unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o755)).unwrap(); + path +} + +/// A root-owned plain file with basename `firecracker` — the `--exec-file`. It is +/// never executed by us (the jailer would), only validated as a `SafeBin`. +fn install_firecracker(dir: &Path) -> PathBuf { + let path = dir.join("firecracker"); + fs::write(&path, b"#!/bin/true\n").unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap(); + path +} + +fn write_root_config(dir: &Path, jailer: &Path, firecracker: &Path) -> PathBuf { + let p = dir.join("config.toml"); + let body = format!( + "work_dir = \"/srv/hyper\"\n[tools]\njailer = \"{}\"\nfirecracker = \"{}\"\n", + jailer.display(), + firecracker.display(), + ); + fs::write(&p, body).unwrap(); + fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); + p +} + +fn run(config: &Path, args: &[&str]) -> std::process::Output { + Command::new(HELPER) + .args(args) + .env_clear() + .env("HYPER_SETUIDHELPER_IS_INSECURE_MODE", "1") + .env("HYPER_SETUIDHELPER_CONFIG_PATH", config) + .output() + .expect("spawn helper") +} + +#[test] +fn execs_jailer_with_canonical_argv_and_empty_env_as_root() { + if !is_root() { + eprintln!("SKIP jailer exec: needs root to become_root_permanently + own the fakes"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "900001", + "--gid", + "900002", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--api-sock", + "/api.sock", + ], + ); + + // The jailer's own exit status must propagate through the execve handoff. + assert_eq!( + out.status.code(), + Some(RECORDER_EXIT), + "exit status did not propagate; stderr: {}", + String::from_utf8_lossy(&out.stderr), + ); + + let argv: Vec = + serde_json::from_str(&fs::read_to_string(&argv_rec).expect("recorded argv")).unwrap(); + assert_eq!( + argv, + vec![ + "--id", + "vm1", + "--exec-file", + &firecracker.to_string_lossy(), + "--uid", + "900001", + "--gid", + "900002", + "--chroot-base-dir", + "/srv/hyper/jails", + "--cgroup-version", + "2", + "--parent-cgroup", + "hyper", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--", + "--api-sock", + "/api.sock", + ], + "helper handed the jailer a non-canonical argv", + ); + + // The helper execve's the jailer with an EMPTY envp (see src/tools/jailer.rs): + // once ruid==0 a smuggled LD_PRELOAD would be honored, so nothing of the + // caller's environment may reach the root jailer. The recorder is a `/bin/sh` + // script and the shell *self-sets* `PWD` (and, under bash, `_`/`SHLVL`) on + // startup, so a literally-empty environ is impossible to observe through it. + // We instead prove no CALLER variable survives: the helper is spawned with + // `HYPER_*` config vars in its own environment (see `run`), so their absence + // here is the leak canary - had the helper passed its environment through, + // they would appear alongside `PWD`. + let environ = fs::read(&env_rec).expect("recorded environ"); + let leaked: Vec = environ + .split(|&b| b == 0) + .filter(|entry| !entry.is_empty()) + .map(|entry| String::from_utf8_lossy(entry).into_owned()) + .filter(|entry| { + let key = entry.split('=').next().unwrap_or(""); + !matches!(key, "PWD" | "_" | "SHLVL") + }) + .collect(); + assert!( + leaked.is_empty(), + "caller environment leaked to the jailer (only shell-set PWD/_/SHLVL allowed): {leaked:?}", + ); +} + +#[test] +fn refuses_uid_zero_without_exec_as_root() { + if !is_root() { + eprintln!("SKIP jailer uid 0: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "0", + "--gid", + "900002", + "--api-sock", + "/api.sock", + ], + ); + + assert_ne!(out.status.code(), Some(0), "uid 0 must be refused"); + assert_eq!(out.status.code(), Some(2), "validation failure exits 2"); + assert!( + !argv_rec.exists(), + "the jailer must never have been exec'd for uid 0", + ); +} diff --git a/native/suidhelper/tests/tools/jailer.rs b/native/suidhelper/tests/tools/jailer.rs new file mode 100644 index 00000000..9ab75447 --- /dev/null +++ b/native/suidhelper/tests/tools/jailer.rs @@ -0,0 +1,162 @@ +//! Refusal contracts for the jailer's pure validators — the security core. A +//! valid input must round-trip to its canonical form; an invalid one must +//! *always* be rejected, never silently accepted. These properties are what stop +//! a compromised BEAM from naming uid 0, a privileged path, a traversal, or a +//! flag through the jailer subcommand. + +use hyper_suidhelper::tools::jailer::{validate_id_number, CgroupSetting, JailSock, VmId}; +use proptest::prelude::*; +use std::str::FromStr; + +proptest! { + /// uid/gid 0 is rejected for EVERY range — a jailer run with uid 0 skips its + /// privilege drop and leaves firecracker running as root. + #[test] + fn id_zero_always_rejected(lo in any::(), span in any::()) { + let hi = lo.saturating_add(span); + prop_assert!(validate_id_number(0, (lo, hi)).is_err()); + } + + /// Any nonzero value inside the (nonempty) range is accepted unchanged. + #[test] + fn id_in_range_nonzero_accepted( + lo in 1u32..=1_000_000, + span in 0u32..=1_000_000, + off in 0u32..=1_000_000, + ) { + let hi = lo + span; + let n = lo + (off % (span + 1)); // off bounded into [0, span] => n in [lo, hi] + prop_assert_eq!(validate_id_number(n, (lo, hi)).ok(), Some(n)); + } + + /// Values just below `lo` (still nonzero) and just above `hi` are rejected. + #[test] + fn id_out_of_range_rejected(lo in 2u32..=1_000_000, span in 0u32..=1_000_000) { + let hi = lo + span; + prop_assert!(validate_id_number(lo - 1, (lo, hi)).is_err()); + if hi < u32::MAX { + prop_assert!(validate_id_number(hi + 1, (lo, hi)).is_err()); + } + } + + /// Every string over the allowed charset/length with a non-dash leader parses + /// and renders back to itself. + #[test] + fn vmid_valid_round_trips(s in "[A-Za-z0-9_][A-Za-z0-9_-]{0,63}") { + prop_assert_eq!(VmId::from_str(&s).unwrap().to_string(), s); + } + + /// A leading dash is always rejected (no flag injection via `--id`). + #[test] + fn vmid_leading_dash_rejected(s in "-[A-Za-z0-9_-]{0,63}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Any embedded path separator is rejected (no chroot traversal via `--id`). + #[test] + fn vmid_slash_rejected(s in "[A-Za-z0-9_]{0,10}/[A-Za-z0-9_]{0,10}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Over-length ids (>64) are rejected. + #[test] + fn vmid_too_long_rejected(s in "[A-Za-z][A-Za-z0-9_-]{64,90}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// A valid `memory.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_memory_round_trips(s in "memory[.]max=([0-9]{1,20}|max)") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A valid `cpu.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_cpu_round_trips(s in "cpu[.]max=([0-9]{1,20} [0-9]{1,20}|max [0-9]{1,20})") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A single-filename absolute socket path round-trips; `.`/`..` filenames are + /// rejected even though they are within the charset. + #[test] + fn jailsock_single_filename(name in "[A-Za-z0-9_.-]{1,40}") { + let s = format!("/{name}"); + let res = JailSock::from_str(&s); + if name == "." || name == ".." { + prop_assert!(res.is_err()); + } else { + prop_assert_eq!(res.unwrap().to_string(), s); + } + } + + /// A second path component is always rejected (the socket must be a direct + /// child of `/`). + #[test] + fn jailsock_multi_component_rejected( + a in "[A-Za-z0-9_]{1,10}", + b in "[A-Za-z0-9_]{1,10}", + ) { + let s = format!("/{a}/{b}"); + prop_assert!(JailSock::from_str(&s).is_err()); + } +} + +#[test] +fn vmid_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "-leading", // leading dash + "a/b", // separator + "a.b", // dot + "a b", // whitespace + "a\tb", // tab + "a\0b", // NUL + "naïve", // non-ascii + &"x".repeat(65), // too long + ] { + assert!(VmId::from_str(bad).is_err(), "VmId accepted {bad:?}"); + } +} + +#[test] +fn cgroup_rejects_known_bad_shapes() { + for bad in [ + "linear=10", // unknown key + "memory.high=10", // unknown key + "memory.max=", // empty value + "memory.max=12x", // non-digit + "memory.max=1=2", // second '=' + "memory.max", // no '=' + "cpu.max=100000", // missing period field + "cpu.max=100000 100000 5", // extra field + "cpu.max=x 100000", // bad quota + "cpu.max=max max", // bad period + "cpu.max=max", // missing period + ] { + assert!( + CgroupSetting::from_str(bad).is_err(), + "CgroupSetting accepted {bad:?}" + ); + } +} + +#[test] +fn jailsock_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "relative", // not absolute + "/", // no filename + "/a/b", // multi-component + "/../etc", // traversal + "/..", // traversal as whole filename + "/.", // current dir + "/a b", // whitespace + "/a\0b", // NUL + "//x", // empty leading component + ] { + assert!( + JailSock::from_str(bad).is_err(), + "JailSock accepted {bad:?}" + ); + } +} diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs new file mode 100644 index 00000000..d2fff4a2 --- /dev/null +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -0,0 +1,83 @@ +defmodule Hyper.Node.FireVMM.JailerTest do + @moduledoc """ + Examples for `Hyper.Node.FireVMM.Jailer.command/1`. + + Load-bearing invariant: the BEAM must never place a privileged binary path + (firecracker, jailer) or lifecycle flags owned by the suidhelper (`--exec-file`, + `--chroot-base-dir`, `--cgroup-version`, `--parent-cgroup`, `--`) in the args + it hands to the helper. The helper derives those from its trusted config. + """ + + use ExUnit.Case, async: false + + alias Hyper.Node.FireVMM + alias Hyper.Node.FireVMM.Jailer + + @vm_id "vmtest01" + + # Stub the cached TOML so firecracker/jailer resolve to dummy paths without + # requiring /etc/hyper/config.toml on the test host. async: false because the + # cache is global state. + setup do + Hyper.Cfg.Toml.put_cache(%{ + "tools" => %{ + "firecracker" => "/usr/local/bin/firecracker", + "jailer" => "/usr/local/bin/jailer" + } + }) + + on_exit(fn -> Hyper.Cfg.Toml.reload() end) + end + + defp micro_opts do + %FireVMM.Opts{ + vm_id: @vm_id, + uid: 900_001, + gid: 900_001, + type: :micro, + arch: :x86_64, + mutable: nil, + kernel: "/srv/hyper/redist/vmlinux/vmlinux-x86_64-6.1", + boot_args: nil + } + end + + test "binary is the suid helper" do + assert Jailer.command(micro_opts()).binary == Hyper.Cfg.Tools.suidhelper() + end + + test "args start with the jailer subcommand" do + %{args: [first | _]} = Jailer.command(micro_opts()) + assert first == "jailer" + end + + test "args contain --id, --uid, --gid with the opts values" do + %{args: args} = Jailer.command(micro_opts()) + assert "--id" in args + assert @vm_id in args + assert "--uid" in args + assert "--gid" in args + assert "900001" in args + end + + test "args end with --api-sock /api.socket" do + %{args: args} = Jailer.command(micro_opts()) + assert Enum.take(args, -2) == ["--api-sock", "/api.socket"] + end + + test "args do not contain privileged flags owned by the suidhelper" do + %{args: args} = Jailer.command(micro_opts()) + refute "--exec-file" in args + refute "--chroot-base-dir" in args + refute "--cgroup-version" in args + refute "--parent-cgroup" in args + refute "--" in args + end + + test "args include --cgroup cpu.max and memory.max for :micro type" do + %{args: args} = Jailer.command(micro_opts()) + assert "--cgroup" in args + assert Enum.any?(args, &String.starts_with?(&1, "cpu.max=")) + assert Enum.any?(args, &String.starts_with?(&1, "memory.max=")) + end +end From aca09b52fb178c254ee52510e83f114d3f563c67 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 30 Jun 2026 21:02:16 +0000 Subject: [PATCH 2/2] chore(suidhelper): mark jailer.rs as slop (TODO) --- native/suidhelper/src/tools/jailer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs index ec29ac8a..bef1dc7b 100644 --- a/native/suidhelper/src/tools/jailer.rs +++ b/native/suidhelper/src/tools/jailer.rs @@ -1,4 +1,5 @@ // SPDX-License-Identifier: AGPL-3.0-only +// TODO: slop //! The `jailer` subcommand: validate the BEAM's arguments, re-acquire root //! permanently, and `execve` the firecracker jailer in our place. //!