From 4b8ec322e0082acfe0d6a056c6346bc09dbd4a1d Mon Sep 17 00:00:00 2001 From: Yashodhan Joshi Date: Wed, 9 Aug 2023 12:20:41 +0530 Subject: [PATCH] Fix rootless determination logic, to accomodate podman rootless behvaiour --- crates/libcgroups/src/common.rs | 18 ++-- .../src/container/builder_impl.rs | 2 + .../libcontainer/src/container/container.rs | 9 ++ .../src/container/container_delete.rs | 1 + .../src/container/container_events.rs | 1 + .../src/container/container_kill.rs | 2 + .../src/container/container_pause.rs | 1 + .../src/container/container_resume.rs | 1 + .../src/container/init_builder.rs | 5 +- crates/libcontainer/src/container/state.rs | 3 + .../src/process/container_init_process.rs | 3 +- .../src/process/container_main_process.rs | 9 +- crates/libcontainer/src/rootless.rs | 85 +++++++++++++++---- crates/libcontainer/src/utils.rs | 7 ++ crates/youki/src/commands/mod.rs | 1 + 15 files changed, 120 insertions(+), 28 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index a0d20424d..5e7ebe1ab 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -316,11 +316,12 @@ pub enum CreateCgroupSetupError { Systemd(#[from] systemd::manager::SystemdManagerError), } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct CgroupConfig { pub cgroup_path: PathBuf, pub systemd_cgroup: bool, pub container_name: String, + pub use_system_bus: bool, } pub fn create_cgroup_manager( @@ -342,7 +343,12 @@ pub fn create_cgroup_manager( if cgroup_path.is_absolute() || !config.systemd_cgroup { return Ok(create_v2_cgroup_manager(cgroup_path)?.any()); } - Ok(create_systemd_cgroup_manager(cgroup_path, config.container_name.as_str())?.any()) + Ok(create_systemd_cgroup_manager( + cgroup_path, + config.container_name.as_str(), + config.use_system_bus, + )? + .any()) } } } @@ -381,6 +387,7 @@ fn create_v2_cgroup_manager( fn create_systemd_cgroup_manager( cgroup_path: &Path, container_name: &str, + use_system_bus: bool, ) -> Result { if !systemd::booted() { panic!( @@ -388,17 +395,15 @@ fn create_systemd_cgroup_manager( ); } - let use_system = nix::unistd::geteuid().is_root(); - tracing::info!( "systemd cgroup manager with system bus {} will be used", - use_system + use_system_bus ); systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), cgroup_path.to_owned(), container_name.into(), - use_system, + use_system_bus, ) } @@ -406,6 +411,7 @@ fn create_systemd_cgroup_manager( fn create_systemd_cgroup_manager( _cgroup_path: &Path, _container_name: &str, + _use_system_bus: bool, ) -> Result { Err(systemd::manager::SystemdManagerError::NotEnabled) } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 4eb5b6c64..c3560a71c 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -77,6 +77,7 @@ impl ContainerBuilderImpl { cgroup_path: cgroups_path, systemd_cgroup: self.use_systemd || self.rootless.is_some(), container_name: self.container_id.to_owned(), + use_system_bus: self.rootless.is_none(), }; let process = self .spec @@ -191,6 +192,7 @@ impl ContainerBuilderImpl { cgroup_path: cgroups_path, systemd_cgroup: self.use_systemd || self.rootless.is_some(), container_name: self.container_id.to_string(), + use_system_bus: self.rootless.is_none(), })?; let mut errors = Vec::new(); diff --git a/crates/libcontainer/src/container/container.rs b/crates/libcontainer/src/container/container.rs index 241114653..bb32ddbda 100644 --- a/crates/libcontainer/src/container/container.rs +++ b/crates/libcontainer/src/container/container.rs @@ -126,11 +126,20 @@ impl Container { self.state.use_systemd } + pub fn is_rootless(&self) -> bool { + self.state.is_rootless + } + pub fn set_systemd(&mut self, should_use: bool) -> &mut Self { self.state.use_systemd = should_use; self } + pub fn set_rootless(&mut self, is_rootless: bool) -> &mut Self { + self.state.is_rootless = is_rootless; + self + } + pub fn set_clean_up_intel_rdt_directory(&mut self, clean_up: bool) -> &mut Self { self.state.clean_up_intel_rdt_subdirectory = Some(clean_up); self diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index 0322fd392..b2d5b0029 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -86,6 +86,7 @@ impl Container { cgroup_path: config.cgroup_path.to_owned(), systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: !self.is_rootless(), }, )?; cmanager.remove().map_err(|err| { diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index ed253be3a..f69eb482e 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -38,6 +38,7 @@ impl Container { cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: !self.is_rootless(), })?; match stats { true => { diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 6b5d08540..7117c828f 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -83,6 +83,7 @@ impl Container { cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: !self.is_rootless(), }, )?; cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; @@ -100,6 +101,7 @@ impl Container { cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: !self.is_rootless(), })?; if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 8e7248e46..c5b14fdac 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -37,6 +37,7 @@ impl Container { cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: self.is_rootless(), })?; cmanager.freeze(FreezerState::Frozen)?; diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 544b8e8e5..e8a5f0eba 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -39,6 +39,7 @@ impl Container { cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), + use_system_bus: self.is_rootless(), })?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 38da5c9b2..bc62a4d23 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -85,7 +85,6 @@ impl InitContainerBuilder { } else { None }; - let rootless = Rootless::new(&spec)?; let config = YoukiConfig::from_spec(&spec, container.id(), rootless.is_some())?; config.save(&container_dir).map_err(|err| { @@ -93,6 +92,10 @@ impl InitContainerBuilder { err })?; + container.set_rootless(rootless.is_some()); + + container.save()?; + let mut builder_impl = ContainerBuilderImpl { container_type: ContainerType::InitContainer, syscall: self.base.syscall, diff --git a/crates/libcontainer/src/container/state.rs b/crates/libcontainer/src/container/state.rs index 5b6bd6f18..0c335c048 100644 --- a/crates/libcontainer/src/container/state.rs +++ b/crates/libcontainer/src/container/state.rs @@ -121,6 +121,8 @@ pub struct State { pub use_systemd: bool, // Specifies if the Intel RDT subdirectory needs be cleaned up. pub clean_up_intel_rdt_subdirectory: Option, + // specifies if the container was rootless or not + pub is_rootless: bool, } impl State { @@ -143,6 +145,7 @@ impl State { creator: None, use_systemd: false, clean_up_intel_rdt_subdirectory: None, + is_rootless: false, } } diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 5b6045be5..2322e1fdc 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -376,7 +376,8 @@ pub fn container_init_process( })?; } - let bind_service = namespaces.get(LinuxNamespaceType::User)?.is_some(); + let bind_service = + namespaces.get(LinuxNamespaceType::User)?.is_some() || utils::is_in_new_userns(); let rootfs = RootFS::new(); rootfs .prepare_rootfs( diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 38f7c0d60..578edc04a 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -6,6 +6,7 @@ use crate::{ intel_rdt::setup_intel_rdt, }, rootless::Rootless, + utils, }; use nix::sys::wait::{waitpid, WaitStatus}; use nix::unistd::Pid; @@ -91,9 +92,11 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // the main process to set up uid and gid mapping, once the intermediate // process enters into a new user namespace. if let Some(rootless) = &container_args.rootless { - main_receiver.wait_for_mapping_request()?; - setup_mapping(rootless, intermediate_pid)?; - inter_sender.mapping_written()?; + if !utils::is_in_new_userns() { + main_receiver.wait_for_mapping_request()?; + setup_mapping(rootless, intermediate_pid)?; + inter_sender.mapping_written()?; + } } // At this point, we don't need to send any message to intermediate process anymore, diff --git a/crates/libcontainer/src/rootless.rs b/crates/libcontainer/src/rootless.rs index 87eff2495..d513dc332 100644 --- a/crates/libcontainer/src/rootless.rs +++ b/crates/libcontainer/src/rootless.rs @@ -1,7 +1,10 @@ use crate::error::MissingSpecError; use crate::namespaces::{NamespaceError, Namespaces}; +use crate::utils; use nix::unistd::Pid; -use oci_spec::runtime::{Linux, LinuxIdMapping, LinuxNamespace, LinuxNamespaceType, Mount, Spec}; +use oci_spec::runtime::{ + Linux, LinuxIdMapping, LinuxIdMappingBuilder, LinuxNamespace, LinuxNamespaceType, Mount, Spec, +}; use std::fs; use std::path::Path; use std::process::Command; @@ -150,11 +153,13 @@ impl Rootless { // If conditions requires us to use rootless, we must either create a new // user namespace or enter an existing. - if rootless_required() && user_namespace.is_none() { + if rootless_required() && user_namespace.is_none() && !utils::is_in_new_userns() { return Err(RootlessError::NoUserNamespace); } - if user_namespace.is_some() && user_namespace.unwrap().path().is_none() { + if (user_namespace.is_some() && user_namespace.unwrap().path().is_none()) + || utils::is_in_new_userns() + { tracing::debug!("rootless container should be created"); validate_spec_for_rootless(spec).map_err(|err| { @@ -220,7 +225,7 @@ impl TryFrom<&Linux> for Rootless { uid_mappings: linux.uid_mappings().to_owned(), gid_mappings: linux.gid_mappings().to_owned(), user_namespace: user_namespace.cloned(), - privileged: nix::unistd::geteuid().is_root(), + privileged: !rootless_required(), rootless_id_mapper: RootlessIDMapper::new(), }) } @@ -232,6 +237,10 @@ pub fn rootless_required() -> bool { return true; } + if utils::is_in_new_userns() { + return true; + } + matches!(std::env::var("YOUKI_USE_ROOTLESS").as_deref(), Ok("true")) } @@ -255,24 +264,66 @@ pub fn unprivileged_user_ns_enabled() -> Result { } } +fn parse_mapping_entry(entry: &str) -> (u32, u32, u32) { + let components: Vec<_> = entry.split_ascii_whitespace().collect(); + if components.len() != 3 { + panic!("invalid mapping entry {}", entry); + } + ( + components[0].parse().unwrap(), + components[1].parse().unwrap(), + components[2].parse().unwrap(), + ) +} + +fn get_id_mapping(path: &str) -> Result> { + let uid_map_str = std::fs::read_to_string(path) + .unwrap_or_else(|_| panic!("failed to read {path} for rootless container mapping entries")); + let mut entries = Vec::new(); + for entry in uid_map_str.lines() { + let (container, host, size) = parse_mapping_entry(entry); + entries.push( + LinuxIdMappingBuilder::default() + .container_id(container) + .host_id(host) + .size(size) + .build() + .unwrap(), + ); + } + Ok(entries) +} + /// Validates that the spec contains the required information for /// running in rootless mode fn validate_spec_for_rootless(spec: &Spec) -> std::result::Result<(), ValidateSpecError> { tracing::debug!(?spec, "validating spec for rootless container"); let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; - if namespaces.get(LinuxNamespaceType::User)?.is_none() { + if namespaces.get(LinuxNamespaceType::User)?.is_none() && !utils::is_in_new_userns() { return Err(ValidateSpecError::NoUserNamespace); } - let gid_mappings = linux - .gid_mappings() - .as_ref() - .ok_or(ValidateSpecError::NoGIDMapping)?; - let uid_mappings = linux - .uid_mappings() - .as_ref() - .ok_or(ValidateSpecError::NoUIDMappings)?; + let (uid_mappings, gid_mappings) = if !utils::is_in_new_userns() { + // if we are not already in a user namespace, then the mappings must be present + ( + linux + .uid_mappings() + .as_ref() + .ok_or(ValidateSpecError::NoUIDMappings)? + .to_owned(), + linux + .gid_mappings() + .as_ref() + .ok_or(ValidateSpecError::NoGIDMapping)? + .to_owned(), + ) + } else { + ( + get_id_mapping("/proc/self/uid_map").unwrap(), + get_id_mapping("/proc/self/gid_map").unwrap(), + ) + }; if uid_mappings.is_empty() { return Err(ValidateSpecError::NoUIDMappings); @@ -285,8 +336,8 @@ fn validate_spec_for_rootless(spec: &Spec) -> std::result::Result<(), ValidateSp spec.mounts() .as_ref() .ok_or(ValidateSpecError::NoMountSpec)?, - uid_mappings, - gid_mappings, + &uid_mappings, + &gid_mappings, )?; if let Some(additional_gids) = spec @@ -294,12 +345,12 @@ fn validate_spec_for_rootless(spec: &Spec) -> std::result::Result<(), ValidateSp .as_ref() .and_then(|process| process.user().additional_gids().as_ref()) { - let privileged = nix::unistd::geteuid().is_root(); + let privileged = !rootless_required(); match (privileged, additional_gids.is_empty()) { (true, false) => { for gid in additional_gids { - if !is_id_mapped(*gid, gid_mappings) { + if !is_id_mapped(*gid, &gid_mappings) { tracing::error!(?gid,"gid is specified as supplementary group, but is not mapped in the user namespace"); return Err(ValidateSpecError::GidNotMapped(*gid)); } diff --git a/crates/libcontainer/src/utils.rs b/crates/libcontainer/src/utils.rs index 27f735dfa..46867164f 100644 --- a/crates/libcontainer/src/utils.rs +++ b/crates/libcontainer/src/utils.rs @@ -143,6 +143,13 @@ pub fn get_user_home(uid: u32) -> Option { } } +pub fn is_in_new_userns() -> bool { + let uid_map_path = "/proc/self/uid_map"; + let content = std::fs::read_to_string(uid_map_path) + .unwrap_or_else(|_| panic!("failed to read {}", uid_map_path)); + !content.contains("4294967295") +} + /// If None, it will generate a default path for cgroups. pub fn get_cgroup_path( cgroups_path: &Option, diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index d99a0ec57..2395ab524 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -64,6 +64,7 @@ fn create_cgroup_manager>( cgroup_path: container.spec()?.cgroup_path, systemd_cgroup: container.systemd(), container_name: container.id().to_string(), + use_system_bus: !container.is_rootless(), }, )?) }