Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/midstream-container-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ jobs:
docker buildx imagetools create \
-t "${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}" \
-t "${{ env.IMAGE_REGISTRY }}/gateway:midstream" \
-t "${{ env.IMAGE_REGISTRY }}/gateway:dev" \
"${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}-amd64" \
"${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}-arm64"

Expand All @@ -159,5 +160,6 @@ jobs:
docker buildx imagetools create \
-t "${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}" \
-t "${{ env.IMAGE_REGISTRY }}/cluster:midstream" \
-t "${{ env.IMAGE_REGISTRY }}/cluster:dev" \
"${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}-amd64" \
"${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}-arm64"
11 changes: 4 additions & 7 deletions crates/openshell-bootstrap/src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
use std::collections::HashMap;
use std::path::Path;

use bollard::Docker;
use bollard::query_parameters::BuildImageOptionsBuilder;
use futures::StreamExt;
use miette::{IntoDiagnostic, Result, WrapErr};
Expand Down Expand Up @@ -46,9 +45,8 @@ pub async fn build_and_push_image(
on_log(format!(
"Pushing image {tag} into gateway \"{gateway_name}\""
));
let local_docker = Docker::connect_with_local_defaults()
.into_diagnostic()
.wrap_err("failed to connect to local Docker daemon")?;
let local_docker = crate::docker::connect_local_auto()
.wrap_err("failed to connect to local container runtime")?;
let container = container_name(gateway_name);
let images: Vec<&str> = vec![tag];
push_local_images(&local_docker, &local_docker, &container, &images, on_log).await?;
Expand All @@ -68,9 +66,8 @@ async fn build_image(
build_args: &HashMap<String, String>,
on_log: &mut impl FnMut(String),
) -> Result<()> {
let docker = Docker::connect_with_local_defaults()
.into_diagnostic()
.wrap_err("failed to connect to local Docker daemon")?;
let docker = crate::docker::connect_local_auto()
.wrap_err("failed to connect to local container runtime")?;

// Compute the relative path of the Dockerfile within the context.
let dockerfile_relative = dockerfile_path
Expand Down
32 changes: 32 additions & 0 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,33 @@ pub(crate) fn connect_local(runtime: ContainerRuntime) -> Result<Docker> {
}
}

/// Connect to the local container runtime with auto-detection.
///
/// Convenience wrapper for call sites that need a Docker client but have
/// no `ContainerRuntime` value in scope: detect the runtime (Podman is
/// preferred by detection) and hand the result straight to `connect_local`.
pub(crate) fn connect_local_auto() -> Result<Docker> {
    connect_local(crate::container_runtime::detect_runtime(None)?)
}

/// Connect to the local container runtime for an existing gateway.
///
/// Resolution order:
/// 1. Runtime recorded in the gateway's stored metadata, when present.
/// 2. Otherwise auto-detect via `detect_runtime` (errors propagate).
///
/// Used by code paths that know a gateway `name` but carry no `runtime`
/// in scope. Unlike `connect_local_auto()`, metadata is consulted first so
/// a gateway deployed with a specific runtime reconnects to that same one.
pub(crate) fn connect_for_gateway(name: &str) -> Result<Docker> {
    let stored = crate::metadata::get_gateway_metadata(name).map(|m| m.container_runtime);
    let runtime = match stored {
        Some(r) => r,
        None => crate::container_runtime::detect_runtime(None)?,
    };
    connect_local(runtime)
}

/// Build a rich, user-friendly error when a container runtime is not reachable.
fn runtime_not_reachable_error(
runtime: ContainerRuntime,
Expand Down Expand Up @@ -851,6 +878,11 @@ pub async fn ensure_container(
env_vars.push("GPU_ENABLED=true".to_string());
}

// Pass the container runtime to the entrypoint so it can select the
// appropriate networking stack (nftables kube-proxy for Podman, iptables
// DNS proxy for Docker, etc.).
env_vars.push(format!("CONTAINER_RUNTIME={}", runtime.binary_name()));

let env = Some(env_vars);

// Set the health check explicitly on the container config so it works
Expand Down
10 changes: 5 additions & 5 deletions crates/openshell-bootstrap/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ where
.collect();
if !images.is_empty() {
log("[status] Deploying components".to_string());
let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?;
let local_docker = docker::connect_local(runtime)?;
let container = container_name(&name);
let on_log_ref = Arc::clone(&on_log);
let mut push_log = move |msg: String| {
Expand Down Expand Up @@ -669,7 +669,7 @@ pub async fn extract_and_store_pki(
) -> Result<()> {
let docker = match remote {
Some(r) => create_ssh_docker_client(r).await?,
None => Docker::connect_with_local_defaults().into_diagnostic()?,
None => docker::connect_for_gateway(name)?,
};
let cname = docker::find_gateway_container(&docker, port).await?;
let bundle = load_existing_pki_bundle(&docker, &cname, constants::KUBECONFIG_PATH)
Expand All @@ -684,7 +684,7 @@ pub async fn ensure_gateway_image(
registry_username: Option<&str>,
registry_token: Option<&str>,
) -> Result<String> {
let docker = Docker::connect_with_local_defaults().into_diagnostic()?;
let docker = docker::connect_local_auto()?;
let image_ref = format!("{}:{version}", image::DEFAULT_GATEWAY_IMAGE);
ensure_image(&docker, &image_ref, registry_username, registry_token).await?;
Ok(image_ref)
Expand Down Expand Up @@ -712,7 +712,7 @@ pub async fn gateway_container_logs<W: std::io::Write>(

let docker = match remote {
Some(remote_opts) => create_ssh_docker_client(remote_opts).await?,
None => Docker::connect_with_local_defaults().into_diagnostic()?,
None => docker::connect_for_gateway(name)?,
};

let container = container_name(name);
Expand Down Expand Up @@ -765,7 +765,7 @@ pub async fn gateway_container_logs<W: std::io::Write>(
/// Returns an empty string on any Docker/connection error so callers don't
/// need to worry about error handling.
pub async fn fetch_gateway_logs(name: &str, n: usize) -> String {
let docker = match Docker::connect_with_local_defaults() {
let docker = match docker::connect_local_auto() {
Ok(d) => d,
Err(_) => return String::new(),
};
Expand Down
1 change: 1 addition & 0 deletions deploy/docker/Dockerfile.images
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ RUN dnf install -y fedora-repos && \
dnf install -y \
ca-certificates \
iptables \
nftables \
util-linux \
bind-utils \
&& dnf clean all
Expand Down
97 changes: 62 additions & 35 deletions deploy/docker/cluster-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,13 @@ yaml_quote() {
printf "'%s'" "$(printf '%s' "$1" | sed "s/'/''/g")"
}

# ---------------------------------------------------------------------------
# Select iptables backend
# ---------------------------------------------------------------------------
# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem
# but lack the nft_compat bridge that allows flannel and kube-proxy to use
# xt extension modules (xt_comment, xt_conntrack). Detect this by probing
# whether xt_comment is usable via the current iptables backend. If the
# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1
# externally to skip the probe and force the legacy backend.
# ---------------------------------------------------------------------------
# Check br_netfilter kernel module
# ---------------------------------------------------------------------------
# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through
# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are
# never applied to pod traffic, so pods cannot reach services such as
# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution.
# netfilter (iptables or nftables). Without it, kube-proxy's DNAT rules for
# ClusterIP services are never applied to pod traffic, so pods cannot reach
# services such as kube-dns (10.43.0.10), breaking all in-cluster DNS.
#
# The module must be loaded on the HOST before the container starts —
# containers cannot load kernel modules themselves. If it is missing, log a
Expand All @@ -65,25 +56,37 @@ if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2
fi

if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then
if iptables -t filter -N _xt_probe 2>/dev/null; then
_probe_rc=0
iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \
2>/dev/null || _probe_rc=$?
iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \
2>/dev/null || true
iptables -t filter -X _xt_probe 2>/dev/null || true
[ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1
# ---------------------------------------------------------------------------
# Select iptables backend (Docker only)
# ---------------------------------------------------------------------------
# Under Podman with nftables kube-proxy mode, the iptables backend probe is
# unnecessary — kube-proxy uses nft directly. Flannel still uses the iptables
# binary but through the nft compat shim which doesn't need the xt probe.
#
# Under Docker (or unset runtime), probe whether xt_comment is usable. Some
# kernels (e.g. Jetson Linux 5.15-tegra) have nf_tables but lack the
# nft_compat bridge. If the probe fails, switch to iptables-legacy.
if [ "${CONTAINER_RUNTIME:-}" != "podman" ]; then
if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then
if iptables -t filter -N _xt_probe 2>/dev/null; then
_probe_rc=0
iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \
2>/dev/null || _probe_rc=$?
iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \
2>/dev/null || true
iptables -t filter -X _xt_probe 2>/dev/null || true
[ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1
fi
fi
fi

if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy"
if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null &&
update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then
echo "Now using iptables-legacy mode"
else
echo "Warning: could not switch to iptables-legacy — cluster networking may fail"
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy"
if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null &&
update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then
echo "Now using iptables-legacy mode"
else
echo "Warning: could not switch to iptables-legacy — cluster networking may fail"
fi
fi
fi

Expand Down Expand Up @@ -174,13 +177,20 @@ setup_dns_proxy() {
echo "Configured k3s DNS to use ${CONTAINER_IP} (proxied to Docker DNS)"
}

if ! setup_dns_proxy; then
echo "DNS proxy setup failed, falling back to public DNS servers"
echo "Note: this may not work on Docker Desktop (Mac/Windows)"
cat >"$RESOLV_CONF" <<EOF
if [ "${CONTAINER_RUNTIME:-}" = "podman" ]; then
# Podman DNS is directly routable (aardvark-dns or host DNS) — no proxy
# needed. Copy the container's resolv.conf so k3s has a stable path.
cp /etc/resolv.conf "$RESOLV_CONF"
echo "Podman detected — using host DNS resolution (no proxy needed)"
else
if ! setup_dns_proxy; then
echo "DNS proxy setup failed, falling back to public DNS servers"
echo "Note: this may not work on Docker Desktop (Mac/Windows)"
cat >"$RESOLV_CONF" <<EOF
nameserver 8.8.8.8
nameserver 8.8.4.4
EOF
fi
fi

# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -632,7 +642,9 @@ fi
# On kernels where xt_comment is unavailable, kube-router's network policy
# controller panics at startup. Disable it when the iptables-legacy probe
# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead.
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
# Under Podman with nftables kube-proxy, the xt probe is skipped entirely so
# USE_IPTABLES_LEGACY is never set — network policy stays enabled.
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && [ "${CONTAINER_RUNTIME:-}" != "podman" ]; then
EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy"
fi

Expand All @@ -659,8 +671,23 @@ if [ -n "${OPENSHELL_NODE_NAME:-}" ]; then
echo "Using deterministic k3s node name: ${OPENSHELL_NODE_NAME}"
fi

# ---------------------------------------------------------------------------
# Select kube-proxy mode
# ---------------------------------------------------------------------------
# Under Podman, use native nftables kube-proxy mode so no legacy iptables
# kernel modules (ip_tables, iptable_nat, etc.) are required on the host.
# Docker retains the default iptables mode for maximum compatibility.
EXTRA_KUBE_PROXY_ARGS=""
if [ "${CONTAINER_RUNTIME:-}" = "podman" ]; then
echo "Podman detected — using nftables kube-proxy mode"
EXTRA_KUBE_PROXY_ARGS="--kube-proxy-arg=proxy-mode=nftables"
fi

# Execute k3s with explicit resolv-conf passed as a kubelet arg.
# k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag;
# it must be passed via --kubelet-arg instead.
# shellcheck disable=SC2086
exec /bin/k3s "$@" $NODE_NAME_ARG --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS
exec /bin/k3s "$@" $NODE_NAME_ARG \
--kubelet-arg=resolv-conf="$RESOLV_CONF" \
$EXTRA_KUBELET_ARGS \
$EXTRA_KUBE_PROXY_ARGS
39 changes: 28 additions & 11 deletions openshell.spec
Original file line number Diff line number Diff line change
Expand Up @@ -94,19 +94,29 @@ cargo build --release --bin openshell
# Install CLI binary
install -Dpm 0755 target/release/%{name} %{buildroot}%{_bindir}/%{name}

# Install modules-load.d config for legacy iptables kernel modules.
# k3s (used by the gateway cluster) bundles its own legacy iptables binary
# for flannel CNI. Modern distros (Fedora 41+, RHEL 10+) only load nf_tables
# by default, so these legacy modules must be explicitly loaded.
# Install modules-load.d config for br_netfilter.
# br_netfilter makes the kernel pass bridged (pod-to-pod) traffic through
# netfilter hooks so kube-proxy DNAT rules (iptables or nftables) apply to
# ClusterIP service traffic. Legacy iptables modules are not required —
# kube-proxy uses native nftables under Podman, and the iptables binary on
# modern distros (Fedora 41+, RHEL 10+) is iptables-nft which uses the
# nf_tables kernel path.
install -d %{buildroot}%{_modulesloaddir}
cat > %{buildroot}%{_modulesloaddir}/%{name}.conf << 'EOF'
# Load legacy iptables kernel modules required by k3s flannel CNI.
# Modern kernels use nf_tables by default; these modules provide the
# legacy iptables interface that k3s's bundled iptables-legacy needs.
ip_tables
iptable_nat
iptable_filter
iptable_mangle
# Load br_netfilter for K3s bridge networking.
# Required so kube-proxy DNAT rules (iptables or nftables) apply to
# bridged pod-to-pod traffic for ClusterIP service resolution.
br_netfilter
EOF

# Install sysctl.d config for bridge netfilter settings required by K3s.
install -d %{buildroot}%{_sysctldir}
cat > %{buildroot}%{_sysctldir}/99-%{name}.conf << 'EOF'
# Enable bridge netfilter call chains for K3s pod-to-service networking.
# Required after br_netfilter is loaded so kube-proxy DNAT rules apply
# to bridged pod traffic.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
EOF

# Install Python SDK modules (test files are intentionally excluded)
Expand Down Expand Up @@ -138,6 +148,12 @@ echo "rpm" > %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/INSTALL
# RECORD can be empty for RPM-managed installs
touch %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/RECORD

%post
# Load br_netfilter immediately so a reboot is not required after install.
# The modules-load.d config handles subsequent boots.
modprobe br_netfilter > /dev/null 2>&1 || :
%sysctl_apply 99-%{name}.conf

%check
# Smoke-test the CLI binary
%{buildroot}%{_bindir}/%{name} --version
Expand All @@ -153,6 +169,7 @@ PYTHONPATH=%{buildroot}%{python3_sitelib} %{python3} -c "from importlib.metadata
%doc README.md
%{_bindir}/%{name}
%{_modulesloaddir}/%{name}.conf
%{_sysctldir}/99-%{name}.conf

%files -n python3-%{name}
%license LICENSE
Expand Down
Loading