From 5ebc3ad09bcf284f36c09233ff3e02166edaed94 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 6 May 2026 11:34:48 -0400 Subject: [PATCH 1/6] initramfs: Fix overlay upper/work dir permissions to unblock unprivileged processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overlayfs merged view inherits its root permissions from the upperdir. When upper/ was created with 0700 (the same mode passed for work/), the merged / appeared as drwx------ to all non-root processes, causing dbus, systemd units that drop privileges, and anything using DAC to fail with EACCES immediately after switch-root. Fix: create upper/ with 0755 so the merged root is world-traversable. work/ remains 0700 — it is kernel-internal and never exposed in the merged view, so tighter permissions there are harmless. This mirrors what systemd does in volatile-root.c and nspawn-mount.c, and fixes the issue reported in composefs-rs#287. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- crates/initramfs/src/lib.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/initramfs/src/lib.rs b/crates/initramfs/src/lib.rs index ca3412243..f5f8c763f 100644 --- a/crates/initramfs/src/lib.rs +++ b/crates/initramfs/src/lib.rs @@ -233,11 +233,16 @@ fn overlay_state( base: impl AsFd, state: impl AsFd, source: &str, - mode: Option, + _mode: Option, mount_attr_flags: Option, ) -> Result<()> { - let upper = ensure_dir(state.as_fd(), "upper", mode)?; - let work = ensure_dir(state.as_fd(), "work", mode)?; + // upper must be 0755: the overlayfs merged view inherits permissions from + // upperdir, so 0700 would make / (or the mounted subdir) inaccessible to + // non-root processes (dbus, anything that drops privileges). + // work is kernel-internal and never visible in the merged view; 0700 is fine. 
+ // See: https://github.com/composefs/composefs-rs/issues/287 + let upper = ensure_dir(state.as_fd(), "upper", Some(0o755.into()))?; + let work = ensure_dir(state.as_fd(), "work", Some(0o700.into()))?; let overlayfs = FsHandle::open("overlay")?; fsconfig_set_string(overlayfs.as_fd(), "source", source)?; From c3cc37175f74bc67c7d94f73649244840f655dbe Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 6 May 2026 11:34:55 -0400 Subject: [PATCH 2/6] dracut/51bootc: Auto-install setup-root-conf.toml into initramfs Image authors who ship /usr/lib/composefs/setup-root-conf.toml to configure composefs mount behaviour (e.g. transient /etc) previously had to add explicit --include flags to every dracut invocation in their Containerfile. Teach module-setup.sh to install the file automatically when present, mirroring what the composefs-rs dracut modules do. Use '[[ -e ]] && inst_simple' rather than inst_if_exists: the latter is not always available when dracut is invoked explicitly with --force in a Containerfile RUN layer (outside of kernel-install's dracut wrapper). Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- crates/initramfs/dracut/module-setup.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/initramfs/dracut/module-setup.sh b/crates/initramfs/dracut/module-setup.sh index 5250dffd3..f23c0fd64 100755 --- a/crates/initramfs/dracut/module-setup.sh +++ b/crates/initramfs/dracut/module-setup.sh @@ -17,4 +17,13 @@ install() { mkdir -p "${initdir}${systemdsystemunitdir}/initrd-root-fs.target.wants" ln_r "${systemdsystemunitdir}/${service}" \ "${systemdsystemunitdir}/initrd-root-fs.target.wants/${service}" + + # Install the host's setup-root-conf.toml if present so that + # per-image composefs mount configuration (e.g. etc.transient) is + # embedded in the initramfs without requiring manual --include flags. 
+ # Use '[[ -e ]] && inst_simple' rather than inst_if_exists, which is + # not available in all dracut invocation contexts (e.g. explicit + # dracut --force in a Containerfile RUN layer). + [[ -e /usr/lib/composefs/setup-root-conf.toml ]] && \ + inst_simple /usr/lib/composefs/setup-root-conf.toml } From 4836343d8aaac17af48242cc429dc16c47181fca Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 7 May 2026 14:57:36 -0400 Subject: [PATCH 3/6] initramfs: Refactor overlay_transient to return detached fd overlay_transient() now returns a detached fsmount fd rather than immediately attaching it, letting the caller decide where to place the overlay. This is a correctness fix: on pre-6.15 kernels, the old code mounted the overlay then continued using the original composefs dirfd for subsequent submounts, which meant /etc and /var landed in the hidden lower layer rather than the visible merged view. The overlay source name now embeds the composefs digest as "transient:composefs=<digest>" so that composefs_booted() can extract the digest from the mount source after switch-root, the same way it does for the normal "composefs:<digest>" source. overlay_state() also loses its unused _mode parameter. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- crates/initramfs/src/lib.rs | 233 +++++++++++++++++------- crates/lib/src/bootc_composefs/state.rs | 9 +- 2 files changed, 174 insertions(+), 68 deletions(-) diff --git a/crates/initramfs/src/lib.rs b/crates/initramfs/src/lib.rs index f5f8c763f..906c1030c 100644 --- a/crates/initramfs/src/lib.rs +++ b/crates/initramfs/src/lib.rs @@ -1,7 +1,7 @@ //! 
Mount helpers for bootc-initramfs use std::{ - ffi::{CString, OsString}, + ffi::OsString, fmt::Debug, io::ErrorKind, os::fd::{AsFd, AsRawFd, OwnedFd}, @@ -9,8 +9,6 @@ use std::{ }; use anyhow::{Context, Result}; -use cap_std_ext::cap_std::fs::Dir; -use cap_std_ext::dirext::CapStdExtDirExt; use clap::Parser; use rustix::{ fs::{CWD, Mode, OFlags, major, minor, mkdirat, openat, stat, symlink}, @@ -82,27 +80,29 @@ fn set_mount_readonly(fd: impl AsFd) -> Result<()> { } /// Types of mounts supported by the configuration -#[derive(Clone, Copy, Debug, Deserialize)] +#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] #[serde(rename_all = "lowercase")] pub enum MountType { - /// No mount + /// No mount; "root" is an alias meaning this dir is part of the root mount + #[serde(alias = "root")] None, /// Bind mount Bind, /// Overlay mount Overlay, - /// Transient mount + /// Transient mount; "volatile" is an alias (Unix convention for tmpfs) + #[serde(alias = "volatile")] Transient, } -#[derive(Debug, Default, Deserialize)] +#[derive(Debug, Default, Deserialize, PartialEq)] struct RootConfig { #[serde(default)] transient: bool, } /// Configuration for mount operations -#[derive(Debug, Default, Deserialize)] +#[derive(Debug, Default, Deserialize, PartialEq)] pub struct MountConfig { /// The type of mount to use pub mount: Option, @@ -111,19 +111,16 @@ pub struct MountConfig { pub transient: bool, } -#[derive(Deserialize, Default)] +#[derive(Debug, Deserialize, Default, PartialEq)] struct Config { #[serde(default)] etc: MountConfig, #[serde(default)] - var: MountConfig, - #[serde(default)] root: RootConfig, } /// Command-line arguments #[derive(Parser, Debug)] -#[command(version)] pub struct Args { #[arg(help = "Execute this command (for testing)")] /// Execute this command (for testing) @@ -205,21 +202,15 @@ fn bind_mount(fd: impl AsFd, path: &str) -> Result { Ok(res?) } -/// Mount a tmpfs, inheriting the SELinux label from the base filesystem -/// if provided. See . 
+/// Mount a tmpfs to use as the upper layer for an overlay. +/// +/// TODO: sync these options with systemd's root mounting, there's some tweaks there for default tmpfs +/// and we may want to make this configurable anyways in the future +/// +/// See . #[context("Mounting tmpfs for overlay")] -fn mount_tmpfs_for_overlay(base: Option) -> Result { +fn mount_tmpfs_for_overlay() -> Result { let tmpfs = FsHandle::open("tmpfs")?; - - if let Some(base_fd) = base { - let base_dir = Dir::reopen_dir(&base_fd.as_fd())?; - if let Some(label) = base_dir.getxattr(".", "security.selinux")? { - if let Ok(cstr) = CString::new(label) { - fsconfig_set_string(tmpfs.as_fd(), "rootcontext", &cstr)?; - } - } - } - fsconfig_create(tmpfs.as_fd())?; Ok(fsmount( tmpfs.as_fd(), @@ -228,19 +219,18 @@ fn mount_tmpfs_for_overlay(base: Option) -> Result { )?) } -#[context("Mounting state as overlay")] -fn overlay_state( +/// Build an overlayfs fsmount fd from an existing state dir (upper+work). +/// +/// upper is 0755: the merged view inherits permissions from upperdir, so 0700 +/// would make the mountpoint inaccessible to non-root processes. work is +/// kernel-internal and never visible; 0700 is fine. +/// See: <https://github.com/composefs/composefs-rs/issues/287> +fn build_overlay_fd( base: impl AsFd, state: impl AsFd, source: &str, - _mode: Option, mount_attr_flags: Option, -) -> Result<()> { - // upper must be 0755: the overlayfs merged view inherits permissions from - // upperdir, so 0700 would make / (or the mounted subdir) inaccessible to - // non-root processes (dbus, anything that drops privileges). - // work is kernel-internal and never visible in the merged view; 0700 is fine. 
- // See: https://github.com/composefs/composefs-rs/issues/287 +) -> Result { let upper = ensure_dir(state.as_fd(), "upper", Some(0o755.into()))?; let work = ensure_dir(state.as_fd(), "work", Some(0o700.into()))?; @@ -250,33 +240,50 @@ fn overlay_state( overlayfs_set_fd(overlayfs.as_fd(), "upperdir", upper.as_fd())?; overlayfs_set_lower_and_data_fds(&overlayfs, base.as_fd(), None::)?; fsconfig_create(overlayfs.as_fd())?; - let fs = fsmount( + Ok(fsmount( overlayfs.as_fd(), FsMountFlags::FSMOUNT_CLOEXEC, mount_attr_flags.unwrap_or(MountAttrFlags::empty()), - )?; + )?) +} +/// Mount a persistent state directory as an overlay on top of `base`, +/// attaching the result immediately at `.` relative to `base`. +#[context("Mounting state as overlay")] +fn overlay_state( + base: impl AsFd, + state: impl AsFd, + source: &str, + mount_attr_flags: Option, +) -> Result<()> { + let fs = build_overlay_fd(&base, state, source, mount_attr_flags)?; mount_at_wrapper(fs, base, ".").context("Moving mount") } -/// Mounts a transient overlayfs with passed in fd as the lowerdir. +/// Creates a transient overlayfs with the passed-in fd as the lowerdir. /// -/// The tmpfs used for the overlay upper layer inherits the SELinux label -/// from the base filesystem to prevent label mismatches (see #1992). -#[context("Mounting transient overlayfs")] +/// Returns a detached (not yet attached) `OwnedFd` for the overlay mount. +/// The caller is responsible for attaching it to the filesystem tree. +/// +/// `source` is used verbatim as the overlay's `source` fsconfig option and +/// appears in `/proc/self/mountinfo`. For the composefs root, pass +/// `"transient:composefs=<digest>"` so that `composefs_booted()` can +/// recover the verity digest from the mount source after switch-root. For +/// non-root transient mounts (e.g. `/usr`, `/var`) pass `"transient"`. 
+/// +/// The SELinux label on `/` is fixed after boot by +/// `bootc-early-overlay-relabel.service`; no initramfs-side xattr write is +/// needed (kernel `fs_use_trans tmpfs` relabeling at policy-load time would +/// overwrite anything written here). +#[context("Creating transient overlayfs")] pub fn overlay_transient( base: impl AsFd, - mode: Option, + source: &str, mount_attr_flags: Option, -) -> Result<()> { - let tmpfs = mount_tmpfs_for_overlay(Some(&base))?; - overlay_state( - base, - prepare_mount(tmpfs)?, - "transient", - mode, - mount_attr_flags, - ) +) -> Result { + let tmpfs = mount_tmpfs_for_overlay()?; + let state = prepare_mount(tmpfs)?; + build_overlay_fd(base, state, source, mount_attr_flags) } #[context("Opening rootfs")] @@ -349,9 +356,14 @@ pub fn mount_subdir( open_dir(&state, subdir)?, "overlay", None, - None, ), - MountType::Transient => overlay_transient(open_dir(&new_root, subdir)?, None, None), + MountType::Transient => { + // For subdirectory transient mounts, create the overlay and immediately + // attach it at the subdirectory path in new_root. + let subdir_fd = open_dir(&new_root, subdir)?; + let overlay_fd = overlay_transient(subdir_fd.as_fd(), "transient", None)?; + mount_at_wrapper(overlay_fd, &new_root, subdir) + } } } @@ -394,8 +406,8 @@ pub fn setup_root(args: Args) -> Result<()> { let (image, insecure) = get_cmdline_composefs::(&cmdline)?; - let new_root = match args.root_fs { - Some(path) => open_root_fs(&path).context("Failed to clone specified root fs")?, + let new_root = match &args.root_fs { + Some(path) => open_root_fs(path).context("Failed to clone specified root fs")?, None => mount_composefs_image(&sysroot, &image.to_hex(), insecure)?, }; @@ -413,25 +425,122 @@ pub fn setup_root(args: Args) -> Result<()> { mount_at_wrapper(&new_root, CWD, &mount_target)?; } - if config.root.transient { - overlay_transient(&new_root, None, None)?; - } + // When transient root is enabled, place an overlay on top of the composefs. 
+ // On pre-6.15, since the composefs is already attached at `mount_target`, + // the overlay is also immediately attached there. We then open the overlay + // via its path so that subsequent mounts target the visible merged tree. + // + // On 6.15+, the whole tree is assembled in floating mode; `overlay_transient` + // returns a detached overlay fd that we can directly mount into. + // + // `new_root` always refers to the composefs fd; mounting via it after the + // overlay is in place would land in the hidden lower layer. + let transient_overlay_fd: Option = if config.root.transient { + let overlay_fd = overlay_transient( + &new_root, + &format!("transient:composefs={}", image.to_hex()), + None, + )?; + + if cfg!(feature = "pre-6.15") { + // In pre-6.15, the composefs is already attached at `mount_target`. + // Attach the overlay on top of it, then reopen the path to get a + // dirfd that resolves through the overlay (not the hidden composefs). + mount_at_wrapper(&overlay_fd, CWD, &mount_target) + .context("Moving transient overlay onto sysroot")?; + Some(open_dir(CWD, &mount_target).context("Opening attached overlay root")?) + } else { + // On 6.15+ we assemble in floating mode; use the detached overlay fd + // directly for subsequent mounts into the tree. + Some(overlay_fd) + } + } else { + None + }; - match composefs::mount::mount_at(&sysroot_clone, &new_root, "sysroot") { + // When transient root is active the overlay sits on top of the composefs. + // Mounts placed via `new_root` would land in the composefs lower layer and + // be invisible from the running system. Use the overlay fd for all + // post-overlay mounts (sysroot, etc, var) so they appear in the merged view. + let visible_root: &dyn AsFd = transient_overlay_fd + .as_ref() + .map_or(&new_root as &dyn AsFd, |fd| fd as &dyn AsFd); + + // Mount the physical sysroot (with the composefs repo) into the new root + // so that `bootc status` and other tools can find it after switch-root. 
+ match composefs::mount::mount_at(&sysroot_clone, visible_root, "sysroot") { Ok(()) | Err(Errno::NOENT) => {} Err(err) => Err(err)?, } // etc + var let state = open_dir(open_dir(&sysroot, "state/deploy")?, image.to_hex())?; - mount_subdir(&new_root, &state, "etc", config.etc, MountType::Bind)?; - mount_subdir(&new_root, &state, "var", config.var, MountType::Bind)?; + mount_subdir(visible_root, &state, "etc", config.etc, MountType::Bind)?; + mount_subdir(visible_root, &state, "var", MountConfig::default(), MountType::Bind)?; if cfg!(not(feature = "pre-6.15")) { - // Replace the /sysroot with the new composed root filesystem + // Replace the /sysroot with the new composed root filesystem. + // When a transient overlay is active, mount it rather than the bare + // composefs so the running system sees the writable merged view. unmount(&args.sysroot, UnmountFlags::DETACH)?; - mount_at_wrapper(&new_root, CWD, &mount_target)?; + mount_at_wrapper(visible_root, CWD, &mount_target)?; } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(toml: &str) -> Config { + toml::from_str(toml).expect("TOML parse failed") + } + + #[test] + fn test_config_defaults() { + let config = parse(""); + assert_eq!( + config, + Config { + etc: MountConfig { + mount: None, + transient: false + }, + root: RootConfig { transient: false }, + } + ); + } + + #[test] + fn test_mounttype_none() { + let config = parse("[etc]\nmount = \"none\""); + assert_eq!(config.etc.mount, Some(MountType::None)); + } + + #[test] + fn test_mounttype_root_alias() { + let config = parse("[etc]\nmount = \"root\""); + assert_eq!(config.etc.mount, Some(MountType::None)); + } + + #[test] + fn test_etc_transient_flag() { + let config = parse("[etc]\ntransient = true"); + assert_eq!(config.etc.transient, true); + assert_eq!(config.etc.mount, None); + } + + #[test] + fn test_root_transient() { + let config = parse("[root]\ntransient = true"); + assert_eq!(config.root.transient, true); + } + + #[test] + fn 
test_combined_config() { + let config = parse("[root]\ntransient = true\n[etc]\nmount = \"root\""); + assert_eq!(config.root.transient, true); + assert_eq!(config.etc.mount, Some(MountType::None)); + } +} diff --git a/crates/lib/src/bootc_composefs/state.rs b/crates/lib/src/bootc_composefs/state.rs index 6c14f954b..0b72c1108 100644 --- a/crates/lib/src/bootc_composefs/state.rs +++ b/crates/lib/src/bootc_composefs/state.rs @@ -4,7 +4,7 @@ use std::path::Path; use std::{fs::create_dir_all, process::Command}; use anyhow::{Context, Result}; -use bootc_initramfs_setup::overlay_transient; +use bootc_initramfs_setup::{mount_at_wrapper, overlay_transient}; use bootc_kernel_cmdline::utf8::Cmdline; use bootc_mount::tempmount::TempMount; use bootc_utils::CommandRunExt; @@ -323,16 +323,13 @@ pub(crate) fn composefs_usr_overlay(access_mode: FilesystemOverlayAccessMode) -> let usr = Dir::open_ambient_dir("/usr", ambient_authority()).context("Opening /usr")?; - // Get the mode from the underlying /usr directory - let usr_metadata = usr.metadata(".").context("Getting /usr metadata")?; - let usr_mode = Mode::from_raw_mode(usr_metadata.permissions().mode()); - let mount_attr_flags = match access_mode { FilesystemOverlayAccessMode::ReadOnly => Some(MountAttrFlags::MOUNT_ATTR_RDONLY), FilesystemOverlayAccessMode::ReadWrite => None, }; - overlay_transient(usr, Some(usr_mode), mount_attr_flags)?; + let overlay_fd = overlay_transient(usr.as_fd(), "transient", mount_attr_flags)?; + mount_at_wrapper(overlay_fd, &usr, ".").context("Attaching /usr overlay")?; println!("A {} overlayfs is now mounted on /usr", access_mode); println!("All changes there will be discarded on reboot."); From 10691fca406f545162c27dac0e74f83a8a7dc615 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 7 May 2026 14:57:41 -0400 Subject: [PATCH 4/6] status: Support transient root in composefs_booted() When root.transient = true, bootc-root-setup wraps the composefs lower in an overlayfs whose source is 
"transient:composefs=" rather than "composefs:". Handle both prefixes uniformly so that composefs_booted() works correctly on transient root boots and soft-reboots are detected the same way in both cases. Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- crates/lib/src/bootc_composefs/status.rs | 39 +++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/crates/lib/src/bootc_composefs/status.rs b/crates/lib/src/bootc_composefs/status.rs index 7e08fa115..bd8b7a19c 100644 --- a/crates/lib/src/bootc_composefs/status.rs +++ b/crates/lib/src/bootc_composefs/status.rs @@ -61,6 +61,9 @@ pub(crate) struct ImgConfigManifest { pub(crate) struct ComposefsCmdline { pub allow_missing_fsverity: bool, pub digest: Box, + /// True when the root is a transient overlay (source prefix `transient:composefs=`). + /// Set by [`composefs_booted`]; always `false` when constructed from a cmdline string. + pub is_transient: bool, } /// Information about a deployment for soft reboot comparison @@ -79,6 +82,7 @@ impl ComposefsCmdline { ComposefsCmdline { allow_missing_fsverity, digest: digest_str.into(), + is_transient: false, } } @@ -86,6 +90,7 @@ impl ComposefsCmdline { ComposefsCmdline { allow_missing_fsverity, digest: digest.into(), + is_transient: false, } } @@ -159,17 +164,37 @@ pub(crate) fn composefs_booted() -> Result> { // Find the source of / mountpoint as the cmdline doesn't change on soft-reboot let root_mnt = inspect_filesystem("/".into())?; - // This is of the format composefs: - let verity_from_mount_src = root_mnt - .source - .strip_prefix("composefs:") - .ok_or_else(|| anyhow::anyhow!("Root not mounted using composefs"))?; + // The mount source encodes the composefs digest in one of two formats: + // - Normal boot: "composefs:" + // - Transient root: "transient:composefs=" + // Strip either prefix to get the digest and record whether the root is + // transient, then compare the digest with the cmdline value to detect + 
// soft-reboots into a different deployment. + let (verity_from_mount_src, is_transient) = + if let Some(v) = root_mnt.source.strip_prefix("composefs:") { + (v, false) + } else if let Some(v) = root_mnt.source.strip_prefix("transient:composefs=") { + (v, true) + } else { + anyhow::bail!( + "Root not mounted using composefs (source: {})", + root_mnt.source + ) + }; let r = if *verity_from_mount_src != *v.digest { // soft rebooted into another deployment - CACHED_DIGEST_VALUE.get_or_init(|| Some(ComposefsCmdline::new(verity_from_mount_src))) + CACHED_DIGEST_VALUE.get_or_init(|| { + let mut c = ComposefsCmdline::new(verity_from_mount_src); + c.is_transient = is_transient; + Some(c) + }) } else { - CACHED_DIGEST_VALUE.get_or_init(|| Some(v)) + CACHED_DIGEST_VALUE.get_or_init(|| { + let mut c = v; + c.is_transient = is_transient; + Some(c) + }) }; Ok(r.as_ref()) From 99bd81d74badc3b93ca110ae7ae78a877ce331d2 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 7 May 2026 13:51:21 -0400 Subject: [PATCH 5/6] generator: Add bootc-early-overlay-relabel for transient overlay SELinux fix Transient overlays (/) inherit tmpfs_t from the upper dir's tmpfs via fs_use_trans at SELinux policy-load time. Add a generator-emitted oneshot unit, bootc-early-overlay-relabel.service, that runs 'bootc internals relabel-overlay-mountpoints' before sysinit.target to restore the correct label on each writable overlayfs mount point. Two detection paths, both needed because the generator runs before local-fs.target: - Root writability: inspect the mount source for the "transient:composefs=" prefix to detect a transient root overlay. - Subdir mounts (/etc): bootc-root-setup.service mounts these after the generator, so we read setup-root-conf.toml directly from the booted image to know whether /etc will be a transient overlay. The detection block runs before the OSTREE_BOOTED guard: native composefs boots do not write /run/ostree-booted, but still need the relabel unit. 
relabel_overlay_mountpoints() checks both OVERLAYFS_SUPER_MAGIC and !RDONLY to distinguish writable transient overlays from the read-only composefs root (both are overlayfs, only the former needs relabelling). Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- Cargo.lock | 1 + crates/initramfs/Cargo.toml | 1 + crates/initramfs/src/lib.rs | 33 ++++ crates/lib/src/cli.rs | 6 + crates/lib/src/generator.rs | 156 +++++++++++++++++- .../units/bootc-early-overlay-relabel.service | 12 ++ 6 files changed, 205 insertions(+), 4 deletions(-) create mode 100644 crates/lib/src/units/bootc-early-overlay-relabel.service diff --git a/Cargo.lock b/Cargo.lock index 0e30bc0f3..3ece52930 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -239,6 +239,7 @@ dependencies = [ "rustix", "serde", "toml 1.1.2+spec-1.1.0", + "tracing", ] [[package]] diff --git a/crates/initramfs/Cargo.toml b/crates/initramfs/Cargo.toml index 204558d87..b0e237507 100644 --- a/crates/initramfs/Cargo.toml +++ b/crates/initramfs/Cargo.toml @@ -14,6 +14,7 @@ rustix.workspace = true serde = { workspace = true, features = ["derive"] } composefs-ctl.workspace = true toml.workspace = true +tracing.workspace = true fn-error-context.workspace = true bootc-kernel-cmdline = { path = "../kernel_cmdline", version = "0.0.0" } diff --git a/crates/initramfs/src/lib.rs b/crates/initramfs/src/lib.rs index 906c1030c..51beaef3a 100644 --- a/crates/initramfs/src/lib.rs +++ b/crates/initramfs/src/lib.rs @@ -119,6 +119,39 @@ struct Config { root: RootConfig, } +/// Default path to the setup-root configuration file, relative to the booted root. +pub const SETUP_ROOT_CONF_PATH: &str = "/usr/lib/composefs/setup-root-conf.toml"; + +/// Returns `true` if the configuration at `path` requests a transient `/etc` +/// overlay. Used by the systemd generator to decide whether to emit the +/// SELinux relabel unit *before* those mounts exist (the generator runs before +/// `local-fs.target`). 
+/// +/// Returns `false` if the file is absent or unreadable (safe default: no unit +/// emitted for non-transient systems). +pub fn config_has_transient_submounts(path: &std::path::Path) -> bool { + let text = match std::fs::read_to_string(path) { + Ok(t) => t, + Err(e) => { + tracing::debug!("Could not read {}: {e:#}", path.display()); + return false; + } + }; + let config: Config = match toml::from_str(&text) { + Ok(c) => c, + Err(e) => { + tracing::debug!("Could not parse {}: {e:#}", path.display()); + return false; + } + }; + // Only /etc overlay triggers the relabel unit. + let is_transient = |mc: &MountConfig| match mc.mount { + Some(mt) => mt == MountType::Transient, + None => mc.transient, + }; + is_transient(&config.etc) +} + /// Command-line arguments #[derive(Parser, Debug)] pub struct Args { diff --git a/crates/lib/src/cli.rs b/crates/lib/src/cli.rs index 48559f34d..f0bc8357b 100644 --- a/crates/lib/src/cli.rs +++ b/crates/lib/src/cli.rs @@ -649,6 +649,9 @@ pub(crate) enum InternalsOpts { /// Relabel this path path: Utf8PathBuf, }, + /// Relabel the overlay mount point inodes after SELinux policy load. + /// Called by the generated bootc-early-overlay-relabel unit. + RelabelOverlayMountpoints, /// Proxy frontend for the `ostree-ext` CLI. 
OstreeExt { #[clap(allow_hyphen_values = true)] @@ -2112,6 +2115,9 @@ async fn run_from_opt(opt: Opt) -> Result<()> { crate::lsm::relabel_recurse(root, path, as_path.as_deref(), sepolicy)?; Ok(()) } + InternalsOpts::RelabelOverlayMountpoints => { + crate::generator::relabel_overlay_mountpoints() + } InternalsOpts::BootcInstallCompletion { sysroot, stateroot } => { let rootfs = &Dir::open_ambient_dir("/", cap_std::ambient_authority())?; crate::install::completion::run_from_ostree(rootfs, &sysroot, &stateroot).await diff --git a/crates/lib/src/generator.rs b/crates/lib/src/generator.rs index 0447ffa51..09ff63b68 100644 --- a/crates/lib/src/generator.rs +++ b/crates/lib/src/generator.rs @@ -1,11 +1,12 @@ use std::io::BufRead; use anyhow::{Context, Result}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use cap_std::fs::Dir; use cap_std_ext::{cap_std, dirext::CapStdExtDirExt}; use fn_error_context::context; use ostree_ext::container_utils::{OSTREE_BOOTED, is_ostree_booted_in}; +use ostree_ext::{gio, ostree}; use rustix::{fd::AsFd, fs::StatVfsMountFlags}; use crate::install::DESTRUCTIVE_CLEANUP; @@ -17,6 +18,8 @@ const MULTI_USER_TARGET: &str = "multi-user.target"; const EDIT_UNIT: &str = "bootc-fstab-edit.service"; const FSTAB_ANACONDA_STAMP: &str = "Created by anaconda"; pub(crate) const BOOTC_EDITED_STAMP: &str = "Updated by bootc-fstab-edit.service"; +const TRANSIENT_RELABEL_UNIT: &str = "bootc-early-overlay-relabel.service"; +const SYSINIT_TARGET: &str = "sysinit.target"; /// Called when the root is read-only composefs to reconcile /etc/fstab #[context("bootc generator")] @@ -86,7 +89,53 @@ pub(crate) fn unit_enablement_impl(sysroot: &Dir, unit_dir: &Dir) -> Result<()> /// Main entrypoint for the generator pub(crate) fn generator(root: &Dir, unit_dir: &Dir) -> Result<()> { - // Only run on ostree systems + // === Relabel unit: runs for ALL composefs boots (native or ostree) === + // Must be before the ostree-booted guard because native composefs 
boots do + // not write /run/ostree-booted, but still need the relabel unit when any + // transient overlay is active. + // + // Gate on the root being overlayfs (composefs always mounts an overlay, so + // this excludes non-composefs systems without needing the ostree-booted marker). + // + // Two triggering conditions, detected independently: + // + // 1. Transient root: the initramfs sets the overlay source to + // "transient:composefs=" in /proc/self/mountinfo. Detect via + // inspect_filesystem() rather than fstatvfs() because the `ro` kernel + // cmdline flag can make an otherwise-writable overlay appear read-only + // at generator time. + // + // 2. Transient /etc: this is mounted by bootc-root-setup.service + // which runs *after* the generator, so fstatvfs would see the read-only + // composefs at generator time. Read setup-root-conf.toml directly from + // the booted image instead. + { + let st = rustix::fs::fstatfs(root.as_fd())?; + if st.f_type == libc::OVERLAYFS_SUPER_MAGIC { + let root_is_transient = + match bootc_mount::inspect_filesystem(camino::Utf8Path::new("/")) { + Ok(fs) => fs.source.starts_with("transient:composefs="), + Err(e) => { + tracing::debug!("Could not inspect root filesystem: {e:#}"); + false + } + }; + let submounts_are_transient = bootc_initramfs_setup::config_has_transient_submounts( + std::path::Path::new(bootc_initramfs_setup::SETUP_ROOT_CONF_PATH), + ); + if root_is_transient || submounts_are_transient { + tracing::debug!( + root_is_transient, + submounts_are_transient, + "Transient overlay detected; generating relabel unit" + ); + generate_transient_overlay_relabel(unit_dir)?; + } + } + } + + // === Ostree-specific generator logic === + // Only run on ostree systems (native composefs boots skip below). if !root.try_exists(OSTREE_BOOTED)? 
{ return Ok(()); } @@ -97,15 +146,17 @@ pub(crate) fn generator(root: &Dir, unit_dir: &Dir) -> Result<()> { unit_enablement_impl(sysroot, unit_dir)?; - // Also only run if the root is a read-only overlayfs (a composefs really) + // Only run for overlayfs roots (composefs mounts an overlay, regular or transient). let st = rustix::fs::fstatfs(root.as_fd())?; if st.f_type != libc::OVERLAYFS_SUPER_MAGIC { tracing::trace!("Root is not overlayfs"); return Ok(()); } + + // The fstab editor only applies to read-only composefs roots (not transient). let st = rustix::fs::fstatvfs(root.as_fd())?; if !st.f_flag.contains(StatVfsMountFlags::RDONLY) { - tracing::trace!("Root is writable"); + tracing::trace!("Root is writable, skipping fstab generator"); return Ok(()); } @@ -137,6 +188,58 @@ ExecStart=bootc internals fixup-etc-fstab\n\ Ok(()) } +/// Generate a oneshot service that relabels the transient overlay inode +/// after SELinux policy loads, fixing the tmpfs_t label SELinux assigns to +/// overlay upper-dir inodes at policy-load time. +fn generate_transient_overlay_relabel(unit_dir: &Dir) -> Result<()> { + unit_dir.atomic_write( + TRANSIENT_RELABEL_UNIT, + include_str!("units/bootc-early-overlay-relabel.service"), + )?; + let wants = format!("{SYSINIT_TARGET}.wants"); + unit_dir.create_dir_all(&wants)?; + unit_dir.symlink( + &format!("../{TRANSIENT_RELABEL_UNIT}"), + &format!("{wants}/{TRANSIENT_RELABEL_UNIT}"), + )?; + Ok(()) +} + +/// Relabel transient overlay mount point inodes using the running SELinux policy. +/// Called by the generated bootc-early-overlay-relabel.service oneshot to fix +/// the tmpfs_t label that fs_use_trans assigns to overlay upper-dir inodes at +/// policy-load time. Each of /, /etc, /var is relabelled iff it is a writable +/// overlayfs (i.e. a transient overlay, not the read-only composefs). 
+pub(crate) fn relabel_overlay_mountpoints() -> Result<()> { + let policy = ostree::SePolicy::new(&gio::File::for_path("/"), gio::Cancellable::NONE) + .context("Loading SELinux policy")?; + for path in ["/", "/etc", "/var"] { + let dir = Dir::open_ambient_dir(path, cap_std::ambient_authority()) + .with_context(|| format!("Opening {path}"))?; + let st = rustix::fs::fstatfs(dir.as_fd())?; + if st.f_type != libc::OVERLAYFS_SUPER_MAGIC { + tracing::trace!("{path} is not an overlayfs mount, skipping relabel"); + continue; + } + let stv = rustix::fs::fstatvfs(dir.as_fd())?; + if stv.f_flag.contains(StatVfsMountFlags::RDONLY) { + tracing::trace!("{path} is a read-only overlayfs (composefs), skipping relabel"); + continue; + } + let metadata = dir.metadata(".").with_context(|| format!("stat {path}"))?; + crate::lsm::relabel( + &dir, + &metadata, + Utf8Path::new("."), + Some(Utf8Path::new(path)), + &policy, + ) + .with_context(|| format!("Relabelling {path}"))?; + tracing::debug!("Relabelled {path}"); + } + Ok(()) +} + #[cfg(test)] mod tests { use camino::Utf8Path; @@ -241,6 +344,51 @@ mod tests { Ok(()) } + #[test] + fn test_transient_overlay_relabel_generated() -> Result<()> { + let tempdir = fixture()?; + let unit_dir = &tempdir.open_dir("run/systemd/system")?; + + // We can't fake fstatfs or findmnt, so call generate_transient_overlay_relabel directly. 
+ generate_transient_overlay_relabel(unit_dir)?; + + // The unit file must exist + assert!(unit_dir.try_exists(TRANSIENT_RELABEL_UNIT)?); + // The symlink in sysinit.target.wants must point at the generated unit + let wants = format!("{SYSINIT_TARGET}.wants"); + let link = unit_dir.read_link_contents(format!("{wants}/{TRANSIENT_RELABEL_UNIT}"))?; + let link: camino::Utf8PathBuf = link.try_into().unwrap(); + assert_eq!(link, format!("../{TRANSIENT_RELABEL_UNIT}")); + // The unit must invoke bootc internals relabel-overlay-mountpoints + let content = unit_dir.read_to_string(TRANSIENT_RELABEL_UNIT)?; + assert!( + content.contains("ExecStart=bootc internals relabel-overlay-mountpoints"), + "unexpected unit content: {content}" + ); + + Ok(()) + } + + #[test] + fn test_transient_overlay_relabel_idempotent() -> Result<()> { + let tempdir = fixture()?; + let unit_dir = &tempdir.open_dir("run/systemd/system")?; + + // Calling generate_transient_overlay_relabel twice must succeed + generate_transient_overlay_relabel(unit_dir)?; + // Second call: atomic_write overwrites the unit file; symlink already exists + // (symlink won't be re-created because the dir already contains it). + // The test just checks the call doesn't error. + // We need to remove the old symlink first (same as how enable_unit does it). + let wants = format!("{SYSINIT_TARGET}.wants"); + unit_dir.remove_file_optional(format!("{wants}/{TRANSIENT_RELABEL_UNIT}"))?; + generate_transient_overlay_relabel(unit_dir)?; + + assert!(unit_dir.try_exists(TRANSIENT_RELABEL_UNIT)?); + + Ok(()) + } + #[test] fn test_generator_fstab_idempotent() -> Result<()> { let anaconda_fstab = indoc::indoc! 
{ " diff --git a/crates/lib/src/units/bootc-early-overlay-relabel.service b/crates/lib/src/units/bootc-early-overlay-relabel.service new file mode 100644 index 000000000..3369d9e48 --- /dev/null +++ b/crates/lib/src/units/bootc-early-overlay-relabel.service @@ -0,0 +1,12 @@ +[Unit] +Description=Fix SELinux labels on transient overlay mount points +Documentation=man:bootc(1) +DefaultDependencies=no +ConditionSecurity=selinux +After=local-fs.target +Before=sysinit.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=bootc internals relabel-overlay-mountpoints From 73ae51e98aa9f034df3442c20737ec7be38dc291 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Fri, 15 May 2026 13:32:53 -0400 Subject: [PATCH 6/6] composefs: Support transient /etc, transient root, and volatile /var Add TOML configuration (setup-root-conf.toml) for composefs mount behaviour: - [root] transient = true: wrap the composefs in a tmpfs overlay; all writes are discarded on reboot. - [etc] mount = transient|overlay|bind|none: control how /etc is mounted from the deployment state directory. - [var] mount = none|bind: control whether /var is bind-mounted from state. When mount = none, /var is left as an empty composefs directory. bootc-root-setup also detects the systemd.volatile=state kernel argument at boot time and automatically skips the /var state bind-mount when it is set, leaving /var empty for systemd-fstab-generator to mount a fresh tmpfs there at local-fs.target. This is the recommended way to get an ephemeral /var: it uses a plain tmpfs rather than overlayfs, which is compatible with tools like podman that use overlayfs under /var/lib/containers. Add inject-baseconfig CI helper, a test-baseconfigs CI job, and a 040-test-baseconfigs.nu integration test that boots each configuration in a VM and validates filesystem types, writability, SELinux labels, and podman graph driver compatibility. 
Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- .github/workflows/ci.yml | 76 +++++++++- Dockerfile | 5 + Justfile | 33 ++++- contrib/packaging/inject-baseconfig | 61 ++++++++ crates/initramfs/src/lib.rs | 45 +++++- docs/src/SUMMARY.md | 1 + docs/src/man/bootc-root-setup.service.5.md | 78 ++++------ docs/src/man/bootc-setup-root-conf.5.md | 137 ++++++++++++++++++ .../booted/readonly/040-test-baseconfigs.nu | 83 +++++++++++ 9 files changed, 461 insertions(+), 58 deletions(-) create mode 100755 contrib/packaging/inject-baseconfig create mode 100644 docs/src/man/bootc-setup-root-conf.5.md create mode 100644 tmt/tests/booted/readonly/040-test-baseconfigs.nu diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2ac6b6943..975a5b3b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -220,10 +220,8 @@ jobs: bootloader: ["grub", "systemd"] boot_type: ["bls", "uki"] seal_state: ["sealed", "unsealed"] - exclude: - # centos-9 fails with EUCLEAN (https://github.com/bootc-dev/bootc/issues/1812) - # See: https://github.com/bootc-dev/bcvk/pull/204 + # https://github.com/bootc-dev/bootc/issues/1812 - test_os: centos-9 variant: composefs - seal_state: "sealed" @@ -385,6 +383,76 @@ jobs: name: "tmt-log-${{ matrix.test_os }}-${{ matrix.variant }}-upgrade-${{ env.ARCH }}" path: /var/tmp/tmt + # Test readonly behaviour with baseconfigs (transient mounts) baked into the image. + # Composefs-only: setup-root-conf.toml is a composefs concept; ostree uses a + # different config format (prepare-root.conf) and is not covered here. + # Runs once per distro × baseconfig — no bootloader/filesystem/boot_type matrix. 
+ test-baseconfigs: + if: needs.compute-ci-level.outputs.run_heavy == 'true' + needs: [compute-ci-level, package] + strategy: + fail-fast: false + matrix: + test_os: ${{ fromJson(needs.compute-ci-level.outputs.integration_os_matrix) }} + baseconfigs: ["etc-transient", "root-transient", "var-volatile"] + + runs-on: ubuntu-24.04 + + steps: + - uses: actions/checkout@v6 + - name: Bootc Ubuntu Setup + uses: bootc-dev/actions/bootc-ubuntu-setup@main + with: + libvirt: true + - name: Install tmt + run: pip install --user "tmt[provision-virtual]" + + - name: Setup env + run: | + BASE=$(just pullspec-for-os base ${{ matrix.test_os }}) + echo "BOOTC_base=${BASE}" >> $GITHUB_ENV + echo "BOOTC_variant=composefs" >> $GITHUB_ENV + echo "BOOTC_baseconfigs=${{ matrix.baseconfigs }}" >> $GITHUB_ENV + echo "RUST_BACKTRACE=full" >> $GITHUB_ENV + + - name: Download package artifacts + uses: actions/download-artifact@v8 + with: + name: packages-${{ matrix.test_os }} + path: target/packages/ + + - name: Build container with baseconfig + run: BOOTC_SKIP_PACKAGE=1 just build + + - name: Build upgrade image + run: just _build-upgrade-image + + - name: Run TMT readonly tests + run: | + cargo xtask run-tmt \ + --env=BOOTC_variant=composefs \ + --env=BOOTC_baseconfigs=${{ matrix.baseconfigs }} \ + --composefs-backend --bootloader=grub --filesystem=ext4 \ + --seal-state=unsealed --boot-type=bls \ + --upgrade-image=localhost/bootc-upgrade \ + localhost/bootc readonly + just clean-local-images + + - name: Disk usage summary + if: always() + run: | + echo "### Disk usage" >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + df -h >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + + - name: Archive TMT logs + if: always() + uses: actions/upload-artifact@v7 + with: + name: "tmt-log-${{ matrix.test_os }}-composefs-baseconfigs-${{ matrix.baseconfigs }}-${{ env.ARCH }}" + path: /var/tmp/tmt + # Test bootc install on Fedora CoreOS (separate job to avoid disk space issues 
# when run in the same job as test-integration). # Uses fedora-43 as it's the current stable Fedora release matching CoreOS. @@ -471,7 +539,7 @@ jobs: # Accepts 'skipped' as success so that merge_group-only jobs don't block PRs. required-checks: if: always() - needs: [compute-ci-level, cargo-deny, validate, install-tests, docs, package, test-integration, test-upgrade, test-container-export] + needs: [compute-ci-level, cargo-deny, validate, install-tests, docs, package, test-integration, test-upgrade, test-baseconfigs, test-container-export] runs-on: ubuntu-latest steps: - name: Check all jobs diff --git a/Dockerfile b/Dockerfile index e90ad035a..31f515800 100644 --- a/Dockerfile +++ b/Dockerfile @@ -222,6 +222,7 @@ FROM base as base-penultimate ARG variant ARG bootloader ARG boot_type +ARG baseconfigs="" # Switch to a signed systemd-boot, if configured RUN --network=none --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \ @@ -251,6 +252,10 @@ ARG rootfs="" RUN --network=none --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \ --mount=type=bind,from=packaging,src=/,target=/run/packaging \ /run/packaging/configure-rootfs "${variant}" "${rootfs}" +# Inject base configuration (e.g. 
etc-transient, root-transient) before dracut runs +RUN --network=none --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \ + --mount=type=bind,from=packaging,src=/,target=/run/packaging \ + /run/packaging/inject-baseconfig "${variant}" "${baseconfigs}" # Override with our built package RUN --network=none --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \ --mount=type=bind,from=packaging,src=/,target=/run/packaging \ diff --git a/Justfile b/Justfile index b1a81796e..6ba80d0c7 100644 --- a/Justfile +++ b/Justfile @@ -43,6 +43,8 @@ filesystem := env("BOOTC_filesystem", "ext4") boot_type := env("BOOTC_boot_type", "bls") # Only used for composefs tests seal_state := env("BOOTC_seal_state", "unsealed") +# Baseconfigs to inject into the image for testing (e.g. "etc-transient" or "root-transient") +baseconfigs := env("BOOTC_baseconfigs", "") # Base container image to build from base := env("BOOTC_base", "quay.io/centos-bootc/centos-bootc:stream10") # Buildroot base image @@ -56,18 +58,21 @@ no_auto_local_deps := env("BOOTC_no_auto_local_deps", "") # Internal variables nocache := env("BOOTC_nocache", "") _nocache_arg := if nocache != "" { "--no-cache" } else { "" } +_baseconfigs_env := if baseconfigs != "" { "--env=BOOTC_baseconfigs=" + baseconfigs } else { "" } testimage_label := "bootc.testimage=1" lbi_images := "quay.io/curl/curl:latest quay.io/curl/curl-base:latest registry.access.redhat.com/ubi9/podman:latest" fedora-coreos := "quay.io/fedora/fedora-coreos:testing-devel" generic_buildargs := "" _extra_src_args := if extra_src != "" { "-v " + extra_src + ":/run/extra-src:ro --security-opt=label=disable" } else { "" } +# filesystem arg: required for bootc container ukify to allow missing fsverity base_buildargs := generic_buildargs + " " + _extra_src_args \ + " --build-arg=base=" + base \ + " --build-arg=variant=" + variant \ + " --build-arg=bootloader=" + bootloader \ + " --build-arg=boot_type=" + boot_type \ + " --build-arg=seal_state=" +
seal_state \ - + " --build-arg=filesystem=" + filesystem # required for bootc container ukify to allow missing fsverity + + " --build-arg=filesystem=" + filesystem \ + + " --build-arg=baseconfigs=" + baseconfigs buildargs := base_buildargs \ + " --cap-add=all --security-opt=label=type:container_runtime_t --device /dev/fuse" \ + " --secret=id=secureboot_key,src=target/test-secureboot/db.key --secret=id=secureboot_cert,src=target/test-secureboot/db.crt" @@ -266,7 +271,31 @@ test-container-export: build # Run tmt tests without rebuilding (for fast iteration) [group('testing')] test-tmt-nobuild *ARGS: - cargo xtask run-tmt --env=BOOTC_variant={{variant}} --upgrade-image={{upgrade_img}} {{base_img}} {{ARGS}} + cargo xtask run-tmt --env=BOOTC_variant={{variant}} {{_baseconfigs_env}} --upgrade-image={{upgrade_img}} {{base_img}} {{ARGS}} + +# Run readonly tests with a baseconfig baked into the image at build time. +# Requires composefs variant. Example: just variant=composefs test-tmt-baseconfig root-transient +[group('testing')] +test-tmt-baseconfig baseconfig *ARGS: + just variant=composefs baseconfigs={{baseconfig}} build + just variant=composefs baseconfigs={{baseconfig}} _build-upgrade-image + cargo xtask run-tmt \ + --env=BOOTC_variant=composefs \ + --env=BOOTC_baseconfigs={{baseconfig}} \ + --upgrade-image={{upgrade_img}} \ + --composefs-backend \ + --bootloader={{bootloader}} \ + --filesystem={{filesystem}} \ + --boot-type={{boot_type}} \ + --seal-state={{seal_state}} \ + {{base_img}} readonly {{ARGS}} + +# Run readonly tests for all standard baseconfigs +[group('testing')] +test-baseconfigs *ARGS: + just test-tmt-baseconfig etc-transient {{ARGS}} + just test-tmt-baseconfig root-transient {{ARGS}} + just test-tmt-baseconfig var-volatile {{ARGS}} # Run tmt tests on Fedora CoreOS [group('testing')] diff --git a/contrib/packaging/inject-baseconfig b/contrib/packaging/inject-baseconfig new file mode 100755 index 000000000..041810a8b --- /dev/null +++ 
b/contrib/packaging/inject-baseconfig @@ -0,0 +1,61 @@ +#!/bin/bash +# Inject base configuration files for CI testing of transient-root/etc/var configurations. +# Arguments: $1=variant, $2=baseconfigs (comma-separated, may be empty) +set -xeuo pipefail + +VARIANT="${1:-}" +BASECONFIGS="${2:-}" + +# No-op if no baseconfigs specified +if [ -z "${BASECONFIGS}" ]; then + exit 0 +fi + +# setup-root-conf.toml is composefs-specific; ostree uses prepare-root.conf +# which has a different (INI) format and different option names. +case "${VARIANT}" in + composefs*) + TARGET="/usr/lib/composefs/setup-root-conf.toml" + ;; + *) + echo "inject-baseconfig: baseconfigs not supported for variant '${VARIANT}'" >&2 + exit 1 + ;; +esac + +mkdir -p "$(dirname "${TARGET}")" + +# Split on commas and process each token +IFS=',' read -ra TOKENS <<< "${BASECONFIGS}" +for raw_token in "${TOKENS[@]}"; do + # Trim leading/trailing spaces + token="${raw_token#"${raw_token%%[![:space:]]*}"}" + token="${token%"${token##*[![:space:]]}"}" + + [ -z "${token}" ] && continue + + case "${token}" in + etc-transient) + printf '[etc]\ntransient = true\n' >> "${TARGET}" + ;; + root-transient) + printf '[root]\ntransient = true\n' >> "${TARGET}" + ;; + var-volatile) + # Mount /var as a fresh tmpfs on every boot via systemd.volatile=state. + # bootc-root-setup detects this karg in the initramfs and automatically + # skips the /var state bind-mount, leaving /var as an empty directory + # from the composefs image. systemd-fstab-generator then mounts a fresh + # tmpfs there at local-fs.target. Using a plain tmpfs avoids the + # overlayfs-on-overlayfs restriction that breaks tools like podman which + # use overlayfs under /var/lib/containers. 
+ mkdir -p /usr/lib/bootc/kargs.d + printf 'kargs = ["systemd.volatile=state"]\n' \ + > /usr/lib/bootc/kargs.d/50-var-volatile.toml + ;; + *) + echo "Unknown baseconfig: ${token}" >&2 + exit 1 + ;; + esac +done diff --git a/crates/initramfs/src/lib.rs b/crates/initramfs/src/lib.rs index 51beaef3a..c15dae95d 100644 --- a/crates/initramfs/src/lib.rs +++ b/crates/initramfs/src/lib.rs @@ -116,6 +116,8 @@ struct Config { #[serde(default)] etc: MountConfig, #[serde(default)] + var: MountConfig, + #[serde(default)] root: RootConfig, } @@ -437,6 +439,30 @@ pub fn setup_root(args: Args) -> Result<()> { .cmdline .unwrap_or(Cmdline::from_proc().context("Failed to read cmdline")?); + // Auto-detect systemd.volatile=state: if the kernel cmdline requests a + // volatile /var via the systemd fstab-generator, skip our initramfs + // bind-mount of /var from the deployment state directory. This leaves + // /var as an empty directory from the composefs image so that + // systemd-fstab-generator can mount a fresh tmpfs there at local-fs.target. + // An explicit `[var] mount = "none"` in setup-root-conf.toml has the same + // effect; the cmdline check is a convenience so users only need the kargs.d + // entry without also editing setup-root-conf.toml. + let config = { + let mut config = config; + // value_of returns None for a missing key, Some("") for a bare flag, + // or Some("state") / Some("overlay") / Some("yes") for key=value form. 
+ let volatile_val = cmdline.value_of("systemd.volatile"); + let var_volatile = matches!(volatile_val, Some("state") | Some("overlay")); + if var_volatile && config.var.mount.is_none() && !config.var.transient { + tracing::debug!( + "systemd.volatile={} detected; skipping /var state bind-mount", + volatile_val.unwrap_or("") + ); + config.var.mount = Some(MountType::None); + } + config + }; + let (image, insecure) = get_cmdline_composefs::(&cmdline)?; let new_root = match &args.root_fs { @@ -509,7 +535,12 @@ pub fn setup_root(args: Args) -> Result<()> { // etc + var let state = open_dir(open_dir(&sysroot, "state/deploy")?, image.to_hex())?; mount_subdir(visible_root, &state, "etc", config.etc, MountType::Bind)?; - mount_subdir(visible_root, &state, "var", MountConfig::default(), MountType::Bind)?; + // /var is bind-mounted from the deployment state directory by default. + // The systemd.volatile=state cmdline detection above (or an explicit + // [var] mount = "none" in setup-root-conf.toml) can change this to + // MountType::None, which skips the bind-mount entirely and leaves /var + // as an empty directory from the composefs image for systemd to fill. + mount_subdir(visible_root, &state, "var", config.var, MountType::Bind)?; if cfg!(not(feature = "pre-6.15")) { // Replace the /sysroot with the new composed root filesystem. @@ -540,6 +571,10 @@ mod tests { mount: None, transient: false }, + var: MountConfig { + mount: None, + transient: false + }, root: RootConfig { transient: false }, } ); @@ -564,6 +599,14 @@ mod tests { assert_eq!(config.etc.mount, None); } + #[test] + fn test_var_none() { + // mount = "none" skips the state bind-mount; combine with + // systemd.volatile=state karg to get a fresh tmpfs on every boot. 
+ let config = parse("[var]\nmount = \"none\""); + assert_eq!(config.var.mount, Some(MountType::None)); + } + #[test] fn test_root_transient() { let config = parse("[root]\ntransient = true"); diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index d4d046523..f793f7948 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -62,6 +62,7 @@ - [composefs backend](experimental-composefs.md) - [unified storage](experimental-unified-storage.md) - [`man bootc-root-setup.service`](man/bootc-root-setup.service.5.md) +- [`man bootc-setup-root-conf.toml`](man/bootc-setup-root-conf.5.md) - [fsck](experimental-fsck.md) - [install reset](experimental-install-reset.md) - [--progress-fd](experimental-progress-fd.md) diff --git a/docs/src/man/bootc-root-setup.service.5.md b/docs/src/man/bootc-root-setup.service.5.md index 7cee4308a..d711968d3 100644 --- a/docs/src/man/bootc-root-setup.service.5.md +++ b/docs/src/man/bootc-root-setup.service.5.md @@ -4,69 +4,45 @@ bootc-root-setup.service # DESCRIPTION -This service runs in the initramfs to set up the root filesystem when composefs is enabled. -It is only activated when the `composefs` kernel command line parameter is present. +A oneshot systemd service that runs in the initramfs to set up the root +filesystem when the composefs backend is active. It is gated on the +`composefs=` kernel command line argument and on +`ConditionPathExists=/etc/initrd-release`, so it only runs inside an initramfs. -The service performs the following operations: +The service is ordered after `sysroot.mount` and before +`initrd-root-fs.target`. It performs the following steps: -- Mounts the composefs image specified in the kernel command line -- Sets up `/etc` and `/var` directories from the deployment state -- Optionally configures transient overlays based on the configuration file -- Prepares the root filesystem for switch-root +1. Opens the composefs repository at `/sysroot/composefs`. +2. 
Mounts the EROFS image identified by the `composefs=` kernel + argument, with fs-verity verification. +3. Optionally wraps the root in a transient tmpfs overlay + (see `root.transient` in **bootc-setup-root-conf.toml(5)**). +4. Bind-mounts or overlays `/etc` and `/var` from the per-deployment state + directory at `/sysroot/state/deploy//`. +5. Replaces `/sysroot` with the fully assembled root, ready for switch-root. -This service runs after `sysroot.mount` and `ostree-prepare-root.service`, and before -`initrd-root-fs.target`. +# CONFIGURATION -# CONFIGURATION FILE +Behaviour is controlled by an optional TOML file installed into the initramfs: -The service reads an optional configuration file at `/usr/lib/composefs/setup-root-conf.toml`. -If this file does not exist, default settings are used. +`/usr/lib/composefs/setup-root-conf.toml` -**WARNING**: The configuration file format and composefs integration are experimental -and subject to change. +See **bootc-setup-root-conf.toml(5)** for the full option reference. -## Configuration Options +# INSTALLATION -The configuration file uses TOML format with the following sections: +The service and its binary (`/usr/lib/bootc/initramfs-setup`) are installed +into the initramfs by the `51bootc` dracut module. The module also installs +`/usr/lib/composefs/setup-root-conf.toml` when it is present on the host image, +so image authors do not need manual `dracut --include` invocations. -### `[root]` - -- `transient` (boolean): If true, mounts the root filesystem as a transient overlay. - This makes all changes to `/` ephemeral and lost on reboot. Default: false. - -### `[etc]` - -- `mount` (string): Mount type for `/etc`. Options: "none", "bind", "overlay", "transient". - Default: "bind". -- `transient` (boolean): Shorthand for `mount = "transient"`. Default: false. - -### `[var]` - -- `mount` (string): Mount type for `/var`. Options: "none", "bind", "overlay", "transient". - Default: "bind". 
-- `transient` (boolean): Shorthand for `mount = "transient"`. Default: false. - -## Example Configuration - -```toml -[root] -transient = false - -[etc] -mount = "bind" - -[var] -mount = "overlay" -``` - -# EXPERIMENTAL STATUS - -The composefs integration, including this service and its configuration file format, -is experimental and subject to change. +The `51bootc` module is *not* enabled by default (so that e.g. `apt|dnf install bootc` +don't pull it in). It's recommended for base images to enable it via a config file +in e.g. `/usr/lib/dracut/dracut.conf.d`. # SEE ALSO -**bootc(8)** +**bootc-setup-root-conf.toml(5)**, **bootc(8)** # VERSION diff --git a/docs/src/man/bootc-setup-root-conf.5.md b/docs/src/man/bootc-setup-root-conf.5.md new file mode 100644 index 000000000..3e7082f06 --- /dev/null +++ b/docs/src/man/bootc-setup-root-conf.5.md @@ -0,0 +1,137 @@ +# NAME + +bootc-setup-root-conf.toml + +# SYNOPSIS + +`/usr/lib/composefs/setup-root-conf.toml` + +# DESCRIPTION + +When the composefs backend is active, `bootc-root-setup.service` runs in the +initramfs to mount the root filesystem before switch-root. It reads this +optional TOML configuration file to control how `/`, `/etc`, and `/var` are +mounted. + +If the file does not exist all options take their documented defaults. + +The `51bootc` dracut module installs this file into the initramfs automatically +when it is present on the host image. Image authors can therefore ship the +file at this path in their container image and rebuild the initramfs with a +plain `dracut --force`; no `--include` flags are needed. + +**NOTE**: The composefs backend and this configuration file are experimental +and subject to change without notice. + +# SECTIONS + +## `[root]` + +Controls the mount of the root (`/`) filesystem. + +`transient` (boolean, default: `false`) + If `true`, the composefs root is wrapped in a tmpfs overlay before + switch-root. All writes to `/` are discarded at the next reboot. 
+ This is useful for kiosk or lab systems where runtime modifications + must never persist. + +## `[etc]` + +Controls how `/etc` is mounted from the deployment state directory. + +`mount` (string) + One of `"none"`, `"bind"` (default), `"overlay"`, or `"transient"`. + + - `none` (alias `"root"`) — `/etc` is not remounted; the composefs image's + `/etc` is used directly and is read-only. The system sees exactly the + `/etc` baked into the container image, with no per-deployment state + overlaid on top. **This requires the OS and all services to work without + a writable `/etc`**: SSH host keys, `machine-id`, NetworkManager leases, + and similar files that are normally generated on first boot into the + deployment's writable `/etc` must either be baked into the image or + generated elsewhere (e.g. `/var`, systemd credentials). This mode is + most useful in combination with `[root] transient = true`, where the + transient overlay already provides a writable surface over `/etc`. + - `bind` — `/etc` is bind-mounted from the deployment state directory, + preserving persistent per-machine changes across reboots (the default + ostree behaviour). + - `overlay` — `/etc` is an overlayfs with the deployment state as the upper + layer; writes go to state and persist across reboots. + - `transient` — `/etc` is a tmpfs overlay; all runtime edits are discarded + on reboot. Suitable for immutable or sealed images where `/etc` drift + is undesirable. + +`transient` (boolean, default: `false`) + Shorthand for `mount = "transient"`. Ignored when `mount` is also set. + +## `[var]` + +Controls how `/var` is mounted from the deployment state directory. + +`mount` (string) + One of `"bind"` (default) or `"none"`. + + - `bind` — `/var` is bind-mounted from the deployment state directory, + preserving persistent per-machine data across reboots (the default). + - `none` (alias `"root"`) — `/var` is not remounted; the composefs image's + empty `/var` directory is used. 
Combine with `systemd.volatile=state` + (see below) to get a fresh tmpfs on every boot. + +For a fresh, ephemeral `/var` on every boot (e.g. for stateless or kiosk +systems), use the `systemd.volatile=state` kernel argument. `bootc-root-setup` +detects this karg automatically and skips the `/var` bind-mount, so no +explicit `[var]` section is needed. The karg can be baked into the image via +`/usr/lib/bootc/kargs.d/`: + +```toml +# /usr/lib/bootc/kargs.d/50-var-volatile.toml +kargs = ["systemd.volatile=state"] +``` + +This causes systemd to mount `/var` as a plain tmpfs at early boot, which is +fully compatible with tools like podman that use overlayfs under `/var`. +Note: unlike `/etc` and `/`, using overlayfs (the `"transient"` mount type +from earlier releases) for `/var` is not supported because it breaks podman and +other tools that use overlayfs under `/var/lib/containers`. + +# EXAMPLES + +Default (all persistent, equivalent to an absent file): + +```toml +[etc] +mount = "bind" +``` + +Transient `/etc` — suitable for sealed or integrity-verified images where +runtime `/etc` changes should be discarded on reboot: + +```toml +[etc] +transient = true +``` + +Transient root with `/etc` taken from the image — `/etc` is not mounted from +deployment state, so writes to `/` and `/etc` land in the transient overlay and are +discarded on reboot. To also make `/var` ephemeral, combine with `systemd.volatile=state` in kargs.d: + +```toml +[root] +transient = true + +[etc] +mount = "root" +``` + +# FILES + +`/usr/lib/composefs/setup-root-conf.toml` + The configuration file read by `bootc-root-setup.service`.
+ +# SEE ALSO + +**bootc-root-setup.service(5)**, **bootc(8)** + +# VERSION + + diff --git a/tmt/tests/booted/readonly/040-test-baseconfigs.nu b/tmt/tests/booted/readonly/040-test-baseconfigs.nu new file mode 100644 index 000000000..a8f6e510d --- /dev/null +++ b/tmt/tests/booted/readonly/040-test-baseconfigs.nu @@ -0,0 +1,83 @@ +use std assert +use tap.nu + +tap begin "baseconfig validation tests" + +# No-op when no baseconfigs are active +let baseconfigs = $env.BOOTC_baseconfigs? | default "" +if $baseconfigs == "" { + print "# BOOTC_baseconfigs not set, skipping baseconfig tests" + tap ok + exit +} + +let configs = ($baseconfigs | split row "," | each { |c| $c | str trim } | where { |c| $c != "" }) + +for config in $configs { + match $config { + "etc-transient" => { + print "# Checking etc-transient: /etc should be an overlay mount" + let mnt = (findmnt /etc -J | from json) + let fstype = $mnt.filesystems.0.fstype + assert equal $fstype "overlay" "/etc should be mounted as overlay when etc-transient is active" + + print "# Checking / has 755 permissions (overlay upper dir must not block traversal)" + let perms = (stat -c "%a" / | str trim) + assert equal $perms "755" "/ should have 755 permissions" + + print "# Checking /etc is writable (transient overlay)" + let result = (do { ^touch /etc/bootc-baseconfig-test } | complete) + assert equal $result.exit_code 0 "/etc should be writable with etc-transient" + rm -f /etc/bootc-baseconfig-test + }, + "root-transient" => { + print "# Checking root-transient: / should be an overlay mount" + let mnt = (findmnt / -J | from json) + let fstype = $mnt.filesystems.0.fstype + assert equal $fstype "overlay" "/ should be mounted as overlay when root-transient is active" + + print "# Checking / has 755 permissions" + let perms = (stat -c "%a" / | str trim) + assert equal $perms "755" "/ should have 755 permissions" + + print "# Checking / is writable (transient overlay)" + let result = (do { ^touch /bootc-baseconfig-root-test } | 
complete) + assert equal $result.exit_code 0 "/ should be writable with root-transient" + rm -f /bootc-baseconfig-root-test + + # The whole point of bootc-early-overlay-relabel.service is to + # fix the / inode label from tmpfs_t back to root_t after policy loads. + print "# Checking / SELinux label is root_t (not tmpfs_t)" + let label = (^stat -c "%C" / | str trim) + assert ($label | str ends-with ":root_t:s0") $"/ SELinux label should end with :root_t:s0, got: ($label)" + }, + "var-volatile" => { + # /var is a fresh tmpfs on every boot via systemd.volatile=state karg. + # It must NOT be an overlay (that would break podman's storage driver). + print "# Checking var-volatile: /var should be a tmpfs (systemd.volatile=state)" + let var_fstype = (findmnt /var -J -o FSTYPE | from json).filesystems.0.fstype + assert equal $var_fstype "tmpfs" $"/var should be tmpfs with var-volatile, got: ($var_fstype)" + print "# /var is tmpfs ✓" + + print "# Checking /var is writable" + let result = (do { ^touch /var/bootc-baseconfig-var-volatile-test } | complete) + assert equal $result.exit_code 0 "/var should be writable with var-volatile" + rm -f /var/bootc-baseconfig-var-volatile-test + + # The raison d'être for using tmpfs rather than overlayfs: podman + # must be able to use its overlay storage driver on top of /var. + # If /var were overlayfs, podman would fall back to vfs or fail entirely. + print "# Checking podman overlay storage driver works on tmpfs /var" + let podman_result = (do { ^podman info --format "{{.Store.GraphDriverName}}" } | complete) + assert equal $podman_result.exit_code 0 "podman info should succeed" + let driver = ($podman_result.stdout | str trim) + assert equal $driver "overlay" $"podman should use overlay driver on tmpfs /var, got: ($driver)" + print $"# podman graph driver: ($driver) ✓" + }, + _ => { + print $"# Unknown baseconfig token: ($config) — skipping" + } + } +} + +tap ok