From 80e94a27c73d7223e42461f8accb974b7af50dc0 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 3 Oct 2024 23:33:12 +0100 Subject: [PATCH] Support containers launched with user namespace --- src/util/namespace.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/util/namespace.rs b/src/util/namespace.rs index de75d66..341b67c 100644 --- a/src/util/namespace.rs +++ b/src/util/namespace.rs @@ -1,9 +1,11 @@ use std::fs::File; use std::os::fd::AsFd; +use std::os::unix::fs::MetadataExt; use anyhow::Result; +use rustix::fs::{Gid, Uid}; use rustix::process::Pid; -use rustix::thread::{LinkNameSpaceType, UnshareFlags}; +use rustix::thread::{CapabilitiesSecureBits, LinkNameSpaceType, UnshareFlags}; pub struct Namespace { mnt_fd: File, @@ -32,6 +34,35 @@ impl Namespace { self.mnt_fd.as_fd(), Some(LinkNameSpaceType::Mount), )?; + + // If user namespace is used, we must act like the root user *inside* + // namespace to be able to create files properly (otherwise EOVERFLOW + // will be returned when creating file). + // + // Entering the user namespace turns out to be problematic. + // The reason seems to be this line [1]: + // which means `CAP_MKNOD` capability of the *init* namespace is needed. + // However task's associated security context is all relative to its current + // user namespace [2], so once you enter a user namespace there's no way of getting + // back `CAP_MKNOD` of the init namespace anymore. + // (Yes this means that even if CAP_MKNOD is granted to the container, you cannot + // create device nodes within it.) + // + // [1]: https://elixir.bootlin.com/linux/v6.11.1/source/fs/namei.c#L4073 + // [2]: https://elixir.bootlin.com/linux/v6.11.1/source/include/linux/cred.h#L111 + let metadata = std::fs::metadata("/")?; + let uid = metadata.uid(); + let gid = metadata.gid(); + + // By default `setuid` will drop capabilities when transitioning from root + // to non-root user. 
This bit prevents it so our code still has its superpowers. + rustix::thread::set_capabilities_secure_bits( + CapabilitiesSecureBits::NO_SETUID_FIXUP, + )?; + + rustix::thread::set_thread_uid(unsafe { Uid::from_raw(uid) })?; + rustix::thread::set_thread_gid(unsafe { Gid::from_raw(gid) })?; + Ok(f()?) }) .join()