Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions native/suidhelper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ path = "tests/util/confinement.rs"
name = "tools_chroot_jail_remove"
path = "tests/tools/chroot_jail_remove.rs"

[[test]]
name = "tools_chroot_jail_grant_api"
path = "tests/tools/chroot_jail_grant_api.rs"

[[test]]
name = "util_chroot_jail"
path = "tests/util/chroot_jail.rs"
Expand Down
186 changes: 186 additions & 0 deletions native/suidhelper/src/tools/chroot_jail/grant_api.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// SPDX-License-Identifier: AGPL-3.0-only
//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so
//! the unprivileged controller can `connect()` to it.
//!
//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker
//! then creates its API socket at `<jail>/root/api.socket` owned by that per-VM
//! id. Connecting a unix socket needs *write* permission on the node, so the
//! node user (a different uid) gets `EACCES`. This op chowns just that one
//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the
//! privileged scope are the real (caller) ids while euid is 0 — and chmods it
//! `0660`. The node thus connects as owner, and humans added to the node's
//! group connect via the group bit.
//!
//! That alone is not enough: the jailer leaves `<id>/root` as `0700` owned by
//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so
//! the node cannot even traverse into `root` to reach the (now its own) socket.
//! So this op also opens just that one directory to the caller's group: it keeps
//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the
//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not
//! list), other none. Per-VM isolation is otherwise untouched: only this socket
//! and its immediate parent's group/mode move, nothing else in the jail, and
//! unrelated users stay locked out.
//!
//! Security: the socket path is validated as a `SafePath` and reached by an
//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect
//! the chown outside the jail, and every op is fd-relative on the pinned `root`
//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket`
//! `<exec>/<id>/root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must
//! report a *socket* — a regular file or symlink planted at that name is an
//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on
//! the path) is `Pending`, not an error: firecracker has not created it yet, so
//! the controller keeps probing.

use crate::config::Config;
use crate::tools::IsTool;
use crate::util::safe_dir::{self, SafeDir};
use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents};
use clap::Args;
use nix::errno::Errno;
use nix::sys::stat::SFlag;
use nix::unistd::{getgid, getuid};
use serde::Serialize;
use std::path::{Path, PathBuf};
use thiserror::Error as ThisError;

/// The fixed in-jail socket name firecracker opens (mirrors the Elixir
/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`).
const SOCKET_NAME: &str = "api.socket";

/// The socket sits at `<JAIL_BASE>/<exec>/<id>/root/api.socket`: three parent
/// components (`<exec>`, `<id>`, `root`) before the leaf.
const SOCKET_PARENT_DEPTH: usize = 3;

/// Mode handed to the node: owner+group read/write, no world access.
const SOCKET_MODE: u32 = 0o660;

/// Mode set on the jail `root` dir so the node's group can *traverse* it to
/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x`
/// (traverse, not list), other none.
const JAIL_ROOT_MODE: u32 = 0o710;

type LexicalPath = SafePath<IsAbsolute, StrictComponents>;

#[derive(Debug, ThisError)]
pub enum Error {
#[error("--socket path: {0}")]
SocketPath(#[source] safe_path::ValidationError),
#[error("--socket must be exactly <exec>/<id>/root/api.socket below JAIL_BASE: {0:?}")]
SocketShape(PathBuf),
#[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")]
SocketName(PathBuf),
#[error("walking to the jail root: {0}")]
Walk(#[source] safe_dir::Error),
#[error("api.socket is not a socket (or is a symlink); refusing to touch it")]
NotASocket,
#[error("statting the socket: {0}")]
Stat(#[source] safe_dir::Error),
#[error("chowning the socket to the caller: {0}")]
Chown(#[source] safe_dir::Error),
#[error("chmoding the socket: {0}")]
Chmod(#[source] safe_dir::Error),
#[error("chgrp-ing the jail root dir to the caller: {0}")]
ChgrpRoot(#[source] safe_dir::Error),
#[error("chmoding the jail root dir for traversal: {0}")]
ChmodRoot(#[source] safe_dir::Error),
}

#[derive(Args)]
pub struct GrantApiArgs {
/// Host path of the firecracker API socket, shape
/// <JAIL_BASE>/<exec>/<id>/root/api.socket.
#[arg(long)]
socket: PathBuf,
}

#[derive(Debug, Serialize)]
#[serde(tag = "result", rename_all = "snake_case")]
pub enum GrantOut {
/// The socket was handed to the caller (chowned + chmoded).
Granted,
/// The socket does not exist yet; the caller should keep waiting.
Pending,
}

/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`).
pub fn run(args: GrantApiArgs) -> Result<serde_json::Value, crate::tools::Error> {
GrantApi { args }.run()
}

struct GrantApi {
args: GrantApiArgs,
}

impl IsTool for GrantApi {
type Args = GrantApiArgs;
type Output = GrantOut;
type RunT = Result<GrantOut, Error>;

fn run_privileged(&self) -> Self::RunT {
grant_api_under(&Config::get().jail_base(), &self.args.socket)
}

fn parse(&self, res: Self::RunT) -> Result<GrantOut, Box<dyn std::error::Error>> {
Ok(res?)
}
}

/// Hand `socket` (`<jail_base>/<exec>/<id>/root/api.socket`) to the helper's
/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns
/// `Pending` if any path component or the socket itself is not yet present.
pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result<GrantOut, Error> {
let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?;
let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?;
if parents.len() != SOCKET_PARENT_DEPTH {
return Err(Error::SocketShape(socket.to_path_buf()));
}
if leaf != Path::new(SOCKET_NAME) {
return Err(Error::SocketName(socket.to_path_buf()));
}

let Some(root) = walk(jail_base.to_path_buf(), &parents)? else {
return Ok(GrantOut::Pending); // jail not fully created yet
};

let leaf = Path::new(SOCKET_NAME);
let stat = match root.stat(leaf) {
Ok(stat) => stat,
Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending),
Err(e) => return Err(Error::Stat(e)),
};
// `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails
// this check too: only a real socket is accepted, anything else is refused.
if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() {
return Err(Error::NotASocket);
}

root.chown(leaf, getuid().as_raw(), getgid().as_raw())
.map_err(Error::Chown)?;
root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?;

// Open `root` itself to the caller's group so the node can traverse into it
// to reach the socket (the jailer leaves it 0700 / per-VM uid). Owner stays
// the per-VM uid; only the group and mode move. Operate on the pinned `root`
// fd (already opened `O_NOFOLLOW`), never by name - TOCTOU-safe.
root.chgrp_self(getgid().as_raw())
.map_err(Error::ChgrpRoot)?;
root.chmod_self(JAIL_ROOT_MODE).map_err(Error::ChmodRoot)?;
Ok(GrantOut::Granted)
}

/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns
/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the
/// caller can treat a half-built jail as `Pending` rather than an error.
fn walk(base: PathBuf, parents: &[PathBuf]) -> Result<Option<SafeDir>, Error> {
let base_path: LexicalPath = base.try_into().map_err(Error::SocketPath)?;
let anchor = match SafeDir::open(&base_path) {
Ok(dir) => dir,
Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(None),
Err(e) => return Err(Error::Walk(e)),
};
match anchor.descend(parents) {
Ok(dir) => Ok(Some(dir)),
Err(e) if e.errno() == Some(Errno::ENOENT) => Ok(None),
Err(e) => Err(Error::Walk(e)),
}
}
5 changes: 5 additions & 0 deletions native/suidhelper/src/tools/chroot_jail/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
// SPDX-License-Identifier: AGPL-3.0-only
//! `chroot-jail`: per-VM chroot/jail lifecycle.

pub mod grant_api;
mod prepare;
pub mod remove;

pub use grant_api::GrantApiArgs;
pub use prepare::PrepareArgs;
pub use remove::RemoveArgs;

Expand All @@ -15,6 +17,8 @@ pub enum ChrootJailOp {
Prepare(PrepareArgs),
/// Remove a VM's stale chroot and cgroup leaf before relaunching the jailer.
Remove(RemoveArgs),
/// Hand the firecracker API socket to the node user (chown to caller, 0660).
GrantApi(GrantApiArgs),
}

impl ChrootJailOp {
Expand All @@ -25,6 +29,7 @@ impl ChrootJailOp {
match self {
ChrootJailOp::Prepare(args) => prepare::run(args),
ChrootJailOp::Remove(args) => remove::run(args),
ChrootJailOp::GrantApi(args) => grant_api::run(args),
}
}
}
59 changes: 57 additions & 2 deletions native/suidhelper/src/util/safe_dir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ use super::safe_path::SafePath;
use nix::dir::{Dir, Type};
use nix::fcntl::{openat, AtFlags, OFlag};
use nix::libc::dev_t;
use nix::sys::stat::{mknodat, Mode, SFlag};
use nix::unistd::{dup, fchownat, linkat, unlinkat, write, Gid, Uid, UnlinkatFlags};
use nix::sys::stat::{fchmod, fchmodat, fstatat, mknodat, FchmodatFlags, FileStat, Mode, SFlag};
use nix::unistd::{dup, fchown, fchownat, linkat, unlinkat, write, Gid, Uid, UnlinkatFlags};
use std::ffi::OsStr;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd};
Expand All @@ -39,6 +39,10 @@ pub enum Error {
Mknod { name: PathBuf, source: nix::Error },
#[error("fchownat {name:?}: {source}")]
Chown { name: PathBuf, source: nix::Error },
#[error("fchmodat {name:?}: {source}")]
Chmod { name: PathBuf, source: nix::Error },
#[error("fstatat {name:?}: {source}")]
Stat { name: PathBuf, source: nix::Error },
#[error("linkat -> {name:?}: {source}")]
Link { name: PathBuf, source: nix::Error },
#[error("dup: {0}")]
Expand All @@ -55,6 +59,8 @@ impl Error {
| Error::Write { source, .. }
| Error::Mknod { source, .. }
| Error::Chown { source, .. }
| Error::Chmod { source, .. }
| Error::Stat { source, .. }
| Error::Link { source, .. } => Some(*source),
Error::ReadDir(source) | Error::Dup(source) => Some(*source),
}
Expand Down Expand Up @@ -216,6 +222,55 @@ impl SafeDir {
})
}

/// `fstat` entry `name` relative to this dir's fd without following a final
/// symlink (`AT_SYMLINK_NOFOLLOW`). A symlink stats as itself (`S_IFLNK`),
/// never its target, so a caller inspecting the file type can reject one.
pub fn stat(&self, name: &Path) -> Result<FileStat, Error> {
fstatat(Some(self.0.as_raw_fd()), name, AtFlags::AT_SYMLINK_NOFOLLOW).map_err(|source| {
Error::Stat {
name: name.to_path_buf(),
source,
}
})
}

/// `chmod` entry `name` to `mode`. Linux's `fchmodat` has no working
/// no-follow mode (it returns `ENOTSUP`), so this follows a final symlink;
/// call it only after [`stat`](Self::stat) has proven `name` is not a
/// symlink, so the follow is a no-op on a real (non-link) entry.
pub fn chmod(&self, name: &Path, mode: u32) -> Result<(), Error> {
fchmodat(
Some(self.0.as_raw_fd()),
name,
Mode::from_bits_truncate(mode),
FchmodatFlags::FollowSymlink,
)
.map_err(|source| Error::Chmod {
name: name.to_path_buf(),
source,
})
}

/// `fchmod` this directory through its own held fd. Unlike [`chmod`](Self::chmod),
/// which re-resolves a *name*, this targets the fd we already opened
/// `O_NOFOLLOW`, so there is no path component to swap - TOCTOU-safe on the
/// directory itself.
pub fn chmod_self(&self, mode: u32) -> Result<(), Error> {
fchmod(self.0.as_raw_fd(), Mode::from_bits_truncate(mode)).map_err(|source| Error::Chmod {
name: PathBuf::from("."),
source,
})
}

/// `fchown` this directory's group through its own held fd, preserving its
/// owner (no uid passed). Same TOCTOU guarantee as [`chmod_self`](Self::chmod_self).
pub fn chgrp_self(&self, gid: u32) -> Result<(), Error> {
fchown(self.0.as_raw_fd(), None, Some(Gid::from_raw(gid))).map_err(|source| Error::Chown {
name: PathBuf::from("."),
source,
})
}

/// Remove the non-directory entry `name` from this directory.
pub fn unlink(&self, name: &Path) -> Result<(), Error> {
unlinkat(Some(self.0.as_raw_fd()), name, UnlinkatFlags::NoRemoveDir).map_err(|source| {
Expand Down
Loading
Loading