diff --git a/Cargo.lock b/Cargo.lock index 7edc2b90d..dd49bf02c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.45" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7" +checksum = "ecc78c299ae753905840c5d3ba036c51f61ce5a98a83f98d9c9d29dffd427f71" [[package]] name = "ascii" @@ -138,9 +138,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +checksum = "3825b1e8580894917dc4468cb634a1b4e9745fddc854edad72d9c04644c0319f" dependencies = [ "cfg-if", ] @@ -387,9 +387,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "futures" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca" +checksum = "8cd0210d8c325c245ff06fd95a3b13689a1a276ac8cfa8e8720cb840bfb84b9e" dependencies = [ "futures-channel", "futures-core", @@ -402,9 +402,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888" +checksum = "7fc8cd39e3dbf865f7340dce6a2d401d24fd37c6fe6c4f0ee0de8bfca2252d27" dependencies = [ "futures-core", "futures-sink", @@ -412,15 +412,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d" +checksum = "629316e42fe7c2a0b9a65b47d159ceaa5453ab14e8f0a3c5eedbb8cd55b4a445" [[package]] name = "futures-executor" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c" +checksum = "7b808bf53348a36cab739d7e04755909b9fcaaa69b7d7e588b37b6ec62704c97" dependencies = [ "futures-core", "futures-task", @@ -430,18 +430,16 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377" +checksum = "e481354db6b5c353246ccf6a728b0c5511d752c08da7260546fc0933869daa11" [[package]] name = "futures-macro" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb" +checksum = "a89f17b21645bc4ed773c69af9c9a0effd4a3f1a3876eadd453469f8854e7fdd" dependencies = [ - "autocfg", - "proc-macro-hack", "proc-macro2", "quote", "syn", @@ -449,23 +447,22 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11" +checksum = "996c6442437b62d21a32cd9906f9c41e7dc1e19a9579843fad948696769305af" [[package]] name = "futures-task" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99" +checksum = "dabf1872aaab32c886832f2276d2f5399887e2bd613698a02359e4ea83f8de12" [[package]] name = "futures-util" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481" +checksum = "41d22213122356472061ac0f1ab2cee28d2bac8491410fd68c2af53d1cedb83e" dependencies = [ - "autocfg", "futures-channel", "futures-core", "futures-io", @@ -475,8 +472,6 @@ dependencies = [ "memchr", "pin-project-lite", "pin-utils", - "proc-macro-hack", - "proc-macro-nested", "slab", ] @@ -493,9 +488,9 @@ dependencies = [ [[package]] name = "getset" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b328c01a4d71d2d8173daa93562a73ab0fe85616876f02500f53d82948c504" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" dependencies = [ "proc-macro-error", "proc-macro2", @@ -611,9 +606,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.107" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbe5e23404da5b4f555ef85ebed98fb4083e55a00c317800bc2a50ede9f3d219" +checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" [[package]] name = "libcgroups" @@ -936,18 +931,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" - [[package]] name = "proc-macro2" version = "1.0.32" @@ -959,9 +942,9 @@ dependencies = [ [[package]] name = "procfs" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f2e7eea7c1d7beccbd5acc1e37ac844afccf176525674aad26ece3de1fc7733" +checksum = "7718b88dae7b9b9be183ee274b10554b9aded035539230245275d7bc543fc0a4" dependencies = [ "bitflags", "byteorder", @@ -1072,9 +1055,9 @@ checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "ryu" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +checksum = "3c9613b5a66ab9ba26415184cfc41156594925a9cf3a2057e57f31ff145f6568" [[package]] name = "scopeguard" @@ -1104,9 +1087,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.71" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063bf466a64011ac24040a49009724ee60a57da1b437617ceb32e53ad61bfb19" +checksum = "d0ffa0837f2dfa6fb90868c2b5468cad482e175f7dad97e7421951e663f2b527" dependencies = [ "itoa", "ryu", @@ -1155,9 +1138,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.81" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2afee18b8beb5a596ecb4a2dce128c719b4ba399d34126b9e4396e3f9860966" +checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" dependencies = [ "proc-macro2", "quote", diff --git a/crates/integration_test/Cargo.toml b/crates/integration_test/Cargo.toml index 8c6699996..cd81f72e8 100644 --- a/crates/integration_test/Cargo.toml +++ b/crates/integration_test/Cargo.toml @@ -13,7 +13,7 @@ version = "=3.0.0-beta.5" default-features = true [dependencies] -procfs = "0.11.0" +procfs = "0.11.1" uuid = "0.8" rand = "0.8.0" tar = "0.4" diff --git a/crates/libcgroups/Cargo.toml b/crates/libcgroups/Cargo.toml index a8146577b..2e102c7e1 100644 --- a/crates/libcgroups/Cargo.toml +++ b/crates/libcgroups/Cargo.toml @@ -9,7 +9,7 @@ cgroupsv2_devices = ["rbpf", "libbpf-sys", "errno", "libc"] [dependencies] nix = "0.23.0" -procfs = "0.11.0" +procfs = "0.11.1" log = "0.4" anyhow = "1.0" oci-spec = { git = "https://github.com/containers/oci-spec-rs", rev = "54c5e386f01ab37c9305cc4a83404eb157e42440" } @@ -19,7 +19,7 @@ serde = { version = "1.0", features = ["derive"] } rbpf = {version = "0.1.0", optional = true } libbpf-sys = { version = "0.5.0-2", optional = true } errno = { version = "0.2.8", optional = true } -libc = { version = "0.2.107", optional = true } +libc = { version = "0.2.108", optional = true } [dev-dependencies] oci-spec = { git = "https://github.com/containers/oci-spec-rs", rev = "54c5e386f01ab37c9305cc4a83404eb157e42440", features = ["proptests"] } diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index 45f05e663..81d1b054f 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -184,11 +184,18 @@ pub fn create_cgroup_manager>( if !systemd::booted() { bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); } - log::info!("systemd cgroup manager will be used"); + + let use_system = nix::unistd::geteuid().is_root(); + + log::info!( + "systemd cgroup manager with system bus {} will be used", + use_system + ); return Ok(Box::new(systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), cgroup_path.into(), container_name.into(), + use_system, )?)); } log::info!("cgroup manager V2 will be used"); diff --git a/crates/libcgroups/src/systemd/dbus/client.rs b/crates/libcgroups/src/systemd/dbus/client.rs index 6576ed0d2..d2ab1e13a 100644 --- a/crates/libcgroups/src/systemd/dbus/client.rs +++ b/crates/libcgroups/src/systemd/dbus/client.rs @@ -3,18 +3,54 @@ use anyhow::{Context, Result}; use dbus::arg::{RefArg, Variant}; use dbus::blocking::{Connection, Proxy}; use std::collections::HashMap; +use std::path::PathBuf; use std::time::Duration; +pub trait SystemdClient { + fn is_system(&self) -> bool; + + fn start_transient_unit( + &self, + container_name: &str, + pid: u32, + parent: &str, + unit_name: &str, + ) -> Result<()>; + + fn stop_transient_unit(&self, unit_name: &str) -> Result<()>; + + fn set_unit_properties( + &self, + unit_name: &str, + properties: &HashMap<&str, Box>, + ) -> Result<()>; + + fn systemd_version(&self) -> Result; + + fn control_cgroup_root(&self) -> Result; +} + /// Client is a wrapper providing higher level API and abatraction around dbus. /// For more information see https://www.freedesktop.org/wiki/Software/systemd/dbus/ pub struct Client { conn: Connection, + system: bool, } impl Client { - pub fn new() -> Result { + /// Uses the system bus to communicate with systemd + pub fn new_system() -> Result { let conn = Connection::new_system()?; - Ok(Client { conn }) + Ok(Client { conn, system: true }) + } + + /// Uses the session bus to communicate with systemd + pub fn new_session() -> Result { + let conn = Connection::new_session()?; + Ok(Client { + conn, + system: false, + }) } fn create_proxy(&self) -> Proxy<&Connection> { @@ -24,11 +60,17 @@ impl Client { Duration::from_millis(5000), ) } +} + +impl SystemdClient for Client { + fn is_system(&self) -> bool { + self.system + } /// start_transient_unit is a higher level API for starting a unit /// for a specific container under systemd. /// See https://www.freedesktop.org/wiki/Software/systemd/dbus for more details. - pub fn start_transient_unit( + fn start_transient_unit( &self, container_name: &str, pid: u32, @@ -70,6 +112,8 @@ impl Client { properties.push(("DefaultDependencies", Variant(Box::new(false)))); properties.push(("PIDs", Variant(Box::new(vec![pid])))); + log::debug!("START UNIT: {:?}", properties); + proxy .start_transient_unit(unit_name, "replace", properties, vec![]) .with_context(|| { @@ -81,7 +125,7 @@ impl Client { Ok(()) } - pub fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { + fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { let proxy = self.create_proxy(); proxy @@ -90,7 +134,7 @@ impl Client { Ok(()) } - pub fn set_unit_properties( + fn set_unit_properties( &self, unit_name: &str, properties: &HashMap<&str, Box>, @@ -108,7 +152,7 @@ impl Client { Ok(()) } - pub fn systemd_version(&self) -> Result { + fn systemd_version(&self) -> Result { let proxy = self.create_proxy(); let version = proxy @@ -123,4 +167,14 @@ impl Client { Ok(version) } + + fn control_cgroup_root(&self) -> Result { + let proxy = self.create_proxy(); + + let cgroup_root = proxy + .control_group() + .context("failed to get systemd control group")?; + PathBuf::try_from(&cgroup_root) + .with_context(|| format!("parse systemd control cgroup {} into path", cgroup_root)) + } } diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index dbe7d247c..4b0d9c93f 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -2,7 +2,7 @@ #![allow(unused_variables)] use std::{ collections::HashMap, - fmt::Display, + fmt::{Debug, Display}, fs::{self}, os::unix::fs::PermissionsExt, path::Component::RootDir, @@ -18,7 +18,7 @@ use super::{ controller_type::{ControllerType, CONTROLLER_TYPES}, cpu::Cpu, cpuset::CpuSet, - dbus::client::Client, + dbus::client::{Client, SystemdClient}, memory::Memory, pids::Pids, }; @@ -32,14 +32,21 @@ const CGROUP_PROCS: &str = "cgroup.procs"; const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; -/// SystemDCGroupManager is a driver for managing cgroups via systemd. pub struct Manager { + /// Root path of the cgroup hierarchy e.g. /sys/fs/cgroup root_path: PathBuf, + /// Path relative to the root path e.g. /system.slice/youki-569d5ce3afe1074769f67.scope for rootfull containers + /// and e.g. /user.slice/user-1000/user@1000.service/youki-569d5ce3afe1074769f67.scope for rootless containers cgroups_path: PathBuf, + /// Combination of root path and cgroups path full_path: PathBuf, + /// Destructured cgroups path as specified in the runtime spec e.g. system.slice:youki:569d5ce3afe1074769f67 destructured_path: CgroupsPath, + /// Name of the container e.g. 569d5ce3afe1074769f67 container_name: String, + /// Name of the systemd unit e.g. youki-569d5ce3afe1074769f67.scope unit_name: String, + /// Client for communicating with systemd client: Client, } @@ -54,32 +61,10 @@ struct CgroupsPath { name: String, } -impl Display for CgroupsPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}:{}", self.parent, self.prefix, self.name) - } -} +impl TryFrom<&Path> for CgroupsPath { + type Error = anyhow::Error; -impl Manager { - pub fn new(root_path: PathBuf, cgroups_path: PathBuf, container_name: String) -> Result { - // TODO: create the systemd unit using a dbus client. - let destructured_path = Self::destructure_cgroups_path(cgroups_path)?; - let (cgroups_path, parent) = Self::construct_cgroups_path(&destructured_path)?; - let full_path = root_path.join_safely(&cgroups_path)?; - - Ok(Manager { - root_path, - cgroups_path, - full_path, - container_name, - unit_name: Self::get_unit_name(&destructured_path), - destructured_path, - client: Client::new().context("failed to create dbus client")?, - }) - } - - fn destructure_cgroups_path(cgroups_path: PathBuf) -> Result { - log::debug!("CGROUPS PATH IS {:?}", cgroups_path); + fn try_from(cgroups_path: &Path) -> Result { // cgroups path may never be empty as it is defaulted to `/youki` // see 'get_cgroup_path' under utils.rs. // if cgroups_path was provided it should be of the form [slice]:[prefix]:[name], @@ -92,11 +77,11 @@ impl Manager { name = cgroups_path .strip_prefix("/youki/")? .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroupsPath field"))?; + .ok_or_else(|| anyhow!("failed to parse cgroups path {:?}", cgroups_path))?; } else { let parts = cgroups_path .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroupsPath field"))? + .ok_or_else(|| anyhow!("failed to parse cgroups path {:?}", cgroups_path))? .split(':') .collect::>(); parent = parts[0]; @@ -110,6 +95,59 @@ impl Manager { name: name.to_owned(), }) } +} + +impl Display for CgroupsPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.parent, self.prefix, self.name) + } +} + +// custom debug impl as Manager contains fields that do not implement Debug +// and therefore Debug cannot be derived +impl Debug for Manager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Manager") + .field("root_path", &self.root_path) + .field("cgroups_path", &self.cgroups_path) + .field("full_path", &self.full_path) + .field("destructured_path", &self.destructured_path) + .field("container_name", &self.container_name) + .field("unit_name", &self.unit_name) + .finish() + } +} + +impl Manager { + pub fn new( + root_path: PathBuf, + cgroups_path: PathBuf, + container_name: String, + use_system: bool, + ) -> Result { + let destructured_path = cgroups_path + .as_path() + .try_into() + .with_context(|| format!("failed to destructure cgroups path {:?}", cgroups_path))?; + let client = match use_system { + true => Client::new_system().context("failed to create system dbus client")?, + false => Client::new_session().context("failed to create session dbus client")?, + }; + + let (cgroups_path, parent) = Self::construct_cgroups_path(&destructured_path, &client) + .context("failed to construct cgroups path")?; + let full_path = root_path.join_safely(&cgroups_path)?; + + Ok(Manager { + root_path, + cgroups_path, + full_path, + container_name, + unit_name: Self::get_unit_name(&destructured_path), + destructured_path, + client, + }) + } /// get_unit_name returns the unit (scope) name from the path provided by the user /// for example: foo:docker:bar returns in '/docker-bar.scope' @@ -121,6 +159,33 @@ impl Manager { cgroups_path.name.clone() } + // get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. + // an example of the final path: "/system.slice/docker-foo.scope" + fn construct_cgroups_path( + cgroups_path: &CgroupsPath, + client: &dyn SystemdClient, + ) -> Result<(PathBuf, PathBuf)> { + let mut parent = match client.is_system() { + true => PathBuf::from("/system.slice"), + false => PathBuf::from("/user.slice"), + }; + + // if the user provided a '.slice' (as in a branch of a tree) + // we need to convert it to a filesystem path. + if !cgroups_path.parent.is_empty() { + parent = Self::expand_slice(&cgroups_path.parent)?; + } + + let systemd_root = client.control_cgroup_root()?; + let unit_name = Self::get_unit_name(cgroups_path); + let cgroups_path = systemd_root + .join_safely(&parent) + .with_context(|| format!("failed to join {:?} with {:?}", systemd_root, parent))? + .join_safely(&unit_name) + .with_context(|| format!("failed to join {:?} with {:?}", parent, unit_name))?; + Ok((cgroups_path, parent)) + } + // systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. For example, 'test-a-b.slice' becomes // '/test.slice/test-a.slice/test-a-b.slice'. @@ -141,7 +206,7 @@ impl Manager { } for component in slice_name.split('-') { if component.is_empty() { - anyhow!("Invalid slice name: {}", slice); + anyhow!("invalid slice name: {}", slice); } // Append the component to the path and to the prefix. path = format!("{}/{}{}{}", path, prefix, component, suffix); @@ -150,23 +215,6 @@ impl Manager { Ok(Path::new(&path).to_path_buf()) } - // get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. - // an example of the final path: "/machine.slice/docker-foo.scope" - fn construct_cgroups_path(cgroups_path: &CgroupsPath) -> Result<(PathBuf, PathBuf)> { - // the root slice is under 'machine.slice'. - let mut parent = PathBuf::from("/system.slice"); - // if the user provided a '.slice' (as in a branch of a tree) - // we need to convert it to a filesystem path. - if !cgroups_path.parent.is_empty() { - parent = Self::expand_slice(&cgroups_path.parent)?; - } - let unit_name = Self::get_unit_name(cgroups_path); - let cgroups_path = parent - .join_safely(&unit_name) - .with_context(|| format!("failed to join {:?} with {:?}", parent, unit_name))?; - Ok((cgroups_path, parent)) - } - /// create_unified_cgroup verifies sure that *each level* in the downward path from the root cgroup /// down to the cgroup_path provided by the user is a valid cgroup hierarchy, /// containing the attached controllers and that it contains the container pid. @@ -329,8 +377,48 @@ impl CgroupManager for Manager { #[cfg(test)] mod tests { + use crate::systemd::dbus::client::SystemdClient; + use super::*; + struct TestSystemdClient {} + + impl SystemdClient for TestSystemdClient { + fn is_system(&self) -> bool { + true + } + + fn start_transient_unit( + &self, + container_name: &str, + pid: u32, + parent: &str, + unit_name: &str, + ) -> Result<()> { + Ok(()) + } + + fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { + Ok(()) + } + + fn set_unit_properties( + &self, + unit_name: &str, + properties: &HashMap<&str, Box>, + ) -> Result<()> { + Ok(()) + } + + fn systemd_version(&self) -> Result { + Ok(245) + } + + fn control_cgroup_root(&self) -> Result { + Ok(PathBuf::from("/")) + } + } + #[test] fn expand_slice_works() -> Result<()> { assert_eq!( @@ -343,12 +431,12 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from("test-a-b.slice:docker:foo")) - .expect(""); + let cgroups_path = Path::new("test-a-b.slice:docker:foo") + .try_into() + .context("construct path")?; assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/test.slice/test-a.slice/test-a-b.slice/docker-foo.scope"), ); @@ -357,11 +445,12 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from("machine.slice:libpod:foo")).expect(""); + let cgroups_path = Path::new("machine.slice:libpod:foo") + .try_into() + .context("construct path")?; assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/machine.slice/libpod-foo.scope"), ); @@ -370,11 +459,12 @@ mod tests { #[test] fn get_cgroups_path_works_with_scope() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from(":docker:foo")).expect(""); + let cgroups_path = Path::new(":docker:foo") + .try_into() + .context("construct path")?; assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/system.slice/docker-foo.scope"), ); diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 58f732191..18aae2487 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -14,13 +14,13 @@ crossbeam-channel = "0.5" dbus = "0.9.5" fastrand = "1.4.1" futures = { version = "0.3", features = ["thread-pool"] } -libc = "0.2.107" +libc = "0.2.108" log = "0.4" mio = { version = "0.8.0", features = ["os-ext", "os-poll"] } nix = "0.23.0" oci-spec = { git = "https://github.com/containers/oci-spec-rs", rev = "54c5e386f01ab37c9305cc4a83404eb157e42440" } path-clean = "0.1.0" -procfs = "0.11.0" +procfs = "0.11.1" prctl = "1.0.0" libcgroups = { version = "0.1.0", path = "../libcgroups" } libseccomp = { version = "0.1.0", path = "../libseccomp" } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 633933585..47bdb312c 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -57,7 +57,7 @@ impl<'a> ContainerBuilderImpl<'a> { let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); let cmanager = libcgroups::common::create_cgroup_manager( &cgroups_path, - self.use_systemd, + self.use_systemd || self.rootless.is_some(), &self.container_id, )?; let process = self.spec.process().as_ref().context("No process in spec")?; @@ -142,7 +142,7 @@ impl<'a> ContainerBuilderImpl<'a> { let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); let cmanager = libcgroups::common::create_cgroup_manager( &cgroups_path, - self.use_systemd, + self.use_systemd || self.rootless.is_some(), &self.container_id, )?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 547e7a9db..1f265ab07 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -22,6 +22,23 @@ pub fn container_intermediate_process( let linux = spec.linux().as_ref().context("no linux in spec")?; let namespaces = Namespaces::from(linux.namespaces().as_ref()); + // this needs to be done before we create the init process, so that the init + // process will already be captured by the cgroup. It also needs to be done + // before we enter the user namespace because if a privileged user starts a + // rootless container on a cgroup v1 system we can still fulfill resource + // restrictions through the cgroup fs support (delegation through systemd is + // not supported for v1 by us). This only works if the user has not yet been + // mapped to an unprivileged user by the user namespace however. + // In addition this needs to be done before we enter the cgroup namespace as + // the cgroup of the process will form the root of the cgroup hierarchy in + // the cgroup namespace. + apply_cgroups( + args.cgroup_manager.as_ref(), + linux.resources().as_ref(), + args.init, + ) + .context("failed to apply cgroups")?; + // if new user is specified in specification, this will be true and new // namespace will be created, check // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more @@ -47,7 +64,7 @@ pub fn container_intermediate_process( // root in the user namespace likely is mapped to an non-priviliged user // on the parent user namespace. command.set_id(Uid::from_raw(0), Gid::from_raw(0)).context( - "Failed to configure uid and gid root in the beginning of a new user namespace", + "failed to configure uid and gid root in the beginning of a new user namespace", )?; } @@ -66,17 +83,6 @@ pub fn container_intermediate_process( .with_context(|| format!("Failed to enter pid namespace: {:?}", pid_namespace))?; } - // this needs to be done before we create the init process, so that the init - // process will already be captured by the cgroup - if args.rootless.is_none() { - apply_cgroups( - args.cgroup_manager.as_ref(), - linux.resources().as_ref(), - args.init, - ) - .context("failed to apply cgroups")? - } - // We have to record the pid of the child (container init process), since // the child will be inside the pid namespace. We can't rely on child_ready // to send us the correct pid. diff --git a/crates/libcontainer/src/rootless.rs b/crates/libcontainer/src/rootless.rs index 5ea61b0fd..88240ee9a 100644 --- a/crates/libcontainer/src/rootless.rs +++ b/crates/libcontainer/src/rootless.rs @@ -2,6 +2,7 @@ use crate::{namespaces::Namespaces, utils}; use anyhow::{bail, Context, Result}; use nix::unistd::Pid; use oci_spec::runtime::{Linux, LinuxIdMapping, LinuxNamespace, LinuxNamespaceType, Mount, Spec}; +use std::fs; use std::path::Path; use std::process::Command; use std::{env, path::PathBuf}; @@ -36,7 +37,6 @@ impl<'a> Rootless<'a> { if user_namespace.is_some() && user_namespace.unwrap().path().is_none() { log::debug!("rootless container should be created"); - log::warn!("resource constraints are unimplemented for rootless containers"); validate(spec).context("The spec failed to comply to rootless requirement")?; let mut rootless = Rootless::from(linux); @@ -105,6 +105,22 @@ pub fn rootless_required() -> bool { matches!(std::env::var("YOUKI_USE_ROOTLESS").as_deref(), Ok("true")) } +pub fn unprivileged_user_ns_enabled() -> Result { + let user_ns_sysctl = Path::new("/proc/sys/kernel/unprivileged_userns_clone"); + if !user_ns_sysctl.exists() { + return Ok(true); + } + + let content = + fs::read_to_string(user_ns_sysctl).context("failed to read unprivileged userns clone")?; + + match content.trim().parse::()? { + 0 => Ok(false), + 1 => Ok(true), + v => bail!("failed to parse unprivileged userns value: {}", v), + } +} + /// Validates that the spec contains the required information for /// running in rootless mode fn validate(spec: &Spec) -> Result<()> { diff --git a/crates/libseccomp/Cargo.toml b/crates/libseccomp/Cargo.toml index 129b414ae..37dd867b5 100644 --- a/crates/libseccomp/Cargo.toml +++ b/crates/libseccomp/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" build = "build.rs" [dependencies] -libc = "0.2.107" +libc = "0.2.108" [build-dependencies] pkg-config = "0.3.22" diff --git a/crates/test_framework/Cargo.toml b/crates/test_framework/Cargo.toml index 635e4b572..f48bf2aeb 100644 --- a/crates/test_framework/Cargo.toml +++ b/crates/test_framework/Cargo.toml @@ -6,5 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -anyhow = "1.0.45" +anyhow = "1.0.50" crossbeam = "0.8.1" \ No newline at end of file diff --git a/crates/youki/Cargo.toml b/crates/youki/Cargo.toml index 8a9d85261..ddaa90900 100644 --- a/crates/youki/Cargo.toml +++ b/crates/youki/Cargo.toml @@ -21,7 +21,7 @@ nix = "0.23.0" oci-spec = { git = "https://github.com/containers/oci-spec-rs", rev = "d6fb1e91742313cd0d0085937e2d6df5d4669720" } once_cell = "1.6.0" pentacle = "1.0.0" -procfs = "0.11.0" +procfs = "0.11.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" tabwriter = "1" diff --git a/crates/youki/src/commands/info.rs b/crates/youki/src/commands/info.rs index 2f9bca37f..146159853 100644 --- a/crates/youki/src/commands/info.rs +++ b/crates/youki/src/commands/info.rs @@ -3,6 +3,7 @@ use std::{collections::HashSet, fs, path::Path}; use anyhow::Result; use clap::Parser; +use libcontainer::rootless; use procfs::{CpuInfo, Meminfo}; use libcgroups::{common::CgroupSetup, v2::controller_type::ControllerType}; @@ -176,7 +177,12 @@ pub fn print_namespaces() { println!(" {:<16}enabled", "mount"); print_feature_status(&content, "CONFIG_UTS_NS", FeatureDisplay::new("uts")); print_feature_status(&content, "CONFIG_IPC_NS", FeatureDisplay::new("ipc")); - print_feature_status(&content, "CONFIG_USER_NS", FeatureDisplay::new("user")); + + let user_display = match rootless::unprivileged_user_ns_enabled() { + Ok(false) => FeatureDisplay::with_status("user", "enabled (root only)", "disabled"), + _ => FeatureDisplay::new("user"), + }; + print_feature_status(&content, "CONFIG_USER_NS", user_display); print_feature_status(&content, "CONFIG_PID_NS", FeatureDisplay::new("pid")); print_feature_status(&content, "CONFIG_NET_NS", FeatureDisplay::new("network")); // While the CONFIG_CGROUP_NS kernel feature exists, it is obsolete and should not be used. CGroup namespaces diff --git a/crates/youki/src/logger.rs b/crates/youki/src/logger.rs index 7647c0ee0..1f7753e3d 100644 --- a/crates/youki/src/logger.rs +++ b/crates/youki/src/logger.rs @@ -8,6 +8,8 @@ use std::io::Write; use std::path::PathBuf; use std::str::FromStr; +const LOG_LEVEL_ENV_NAME: &str = "YOUKI_LOG_LEVEL"; + /// If in debug mode, default level is debug to get maximum logging #[cfg(debug_assertions)] const DEFAULT_LOG_LEVEL: &str = "debug"; @@ -27,13 +29,7 @@ pub fn init( log_file: Option, log_format: Option, ) -> Result<()> { - let filter: Cow = if log_debug_flag { - "debug".into() - } else if let Ok(level) = std::env::var("YOUKI_LOG_LEVEL") { - level.into() - } else { - DEFAULT_LOG_LEVEL.into() - }; + let log_level = detect_log_level(log_debug_flag); let formatter = match log_format.as_deref() { None | Some(LOG_FORMAT_TEXT) => text_write, Some(LOG_FORMAT_JSON) => json_write, @@ -51,7 +47,7 @@ pub fn init( env_logger::Target::Stderr }; env_logger::Builder::new() - .filter_level(LevelFilter::from_str(filter.as_ref()).context("failed to parse log level")?) + .filter_level(log_level.context("failed to parse log level")?) .format(formatter) .target(target) .init(); @@ -59,6 +55,17 @@ pub fn init( Ok(()) } +fn detect_log_level(is_debug: bool) -> Result { + let filter: Cow = if is_debug { + "debug".into() + } else if let Ok(level) = std::env::var(LOG_LEVEL_ENV_NAME) { + level.into() + } else { + DEFAULT_LOG_LEVEL.into() + }; + Ok(LevelFilter::from_str(filter.as_ref())?) +} + fn json_write(f: &mut F, record: &log::Record) -> std::io::Result<()> where F: Write, @@ -93,3 +100,56 @@ where Ok(()) } + +#[cfg(test)] +mod tests { + use serial_test::serial; + + use super::*; + use std::env; + struct LogLevelGuard { + original_level: Option, + } + + impl LogLevelGuard { + fn new(level: &str) -> Result { + let original_level = env::var(LOG_LEVEL_ENV_NAME).ok(); + env::set_var(LOG_LEVEL_ENV_NAME, level); + Ok(Self { original_level }) + } + } + impl Drop for LogLevelGuard { + fn drop(self: &mut LogLevelGuard) { + if let Some(level) = self.original_level.as_ref() { + env::set_var(LOG_LEVEL_ENV_NAME, level); + } else { + env::remove_var(LOG_LEVEL_ENV_NAME); + } + } + } + + #[test] + fn test_detect_log_level_is_debug() { + let _guard = LogLevelGuard::new("error").unwrap(); + assert_eq!(detect_log_level(true).unwrap(), LevelFilter::Debug) + } + + #[test] + #[serial] + fn test_detect_log_level_default() { + let _guard = LogLevelGuard::new("error").unwrap(); + env::remove_var(LOG_LEVEL_ENV_NAME); + if cfg!(debug_assertions) { + assert_eq!(detect_log_level(false).unwrap(), LevelFilter::Debug) + } else { + assert_eq!(detect_log_level(false).unwrap(), LevelFilter::Warn) + } + } + + #[test] + #[serial] + fn test_detect_log_level_from_env() { + let _guard = LogLevelGuard::new("error").unwrap(); + assert_eq!(detect_log_level(false).unwrap(), LevelFilter::Error) + } +} diff --git a/crates/youki/src/main.rs b/crates/youki/src/main.rs index d61d3584b..f1ea9fd67 100644 --- a/crates/youki/src/main.rs +++ b/crates/youki/src/main.rs @@ -5,7 +5,7 @@ mod commands; mod logger; use std::fs; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use anyhow::bail; use anyhow::Context; @@ -112,6 +112,11 @@ fn main() -> Result<()> { eprintln!("log init failed: {:?}", e); } + log::debug!( + "started by user {} with {:?}", + nix::unistd::geteuid(), + std::env::args_os() + ); let root_path = determine_root_path(opts.root)?; let systemd_cgroup = opts.systemd_cgroup; @@ -145,15 +150,18 @@ fn determine_root_path(root_path: Option) -> Result { } // see https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html + let uid = getuid().as_raw(); if let Ok(path) = std::env::var("XDG_RUNTIME_DIR") { - return Ok(PathBuf::from(path)); + let path = Path::new(&path).join("youki"); + if create_dir_all_with_mode(&path, uid, Mode::S_IRWXU).is_ok() { + return Ok(path); + } } // XDG_RUNTIME_DIR is not set, try the usual location - let uid = getuid().as_raw(); - let runtime_dir = PathBuf::from(format!("/run/user/{}", uid)); - if create_dir_all_with_mode(&runtime_dir, uid, Mode::S_IRWXU).is_ok() { - return Ok(runtime_dir); + let path = PathBuf::from(format!("/run/user/{}/youki", uid)); + if create_dir_all_with_mode(&path, uid, Mode::S_IRWXU).is_ok() { + return Ok(path); } if let Ok(path) = std::env::var("HOME") { @@ -165,7 +173,7 @@ fn determine_root_path(root_path: Option) -> Result { } } - let tmp_dir = PathBuf::from(format!("/tmp/youki/{}", uid)); + let tmp_dir = PathBuf::from(format!("/tmp/youki-{}", uid)); if create_dir_all_with_mode(&tmp_dir, uid, Mode::S_IRWXU).is_ok() { return Ok(tmp_dir); } diff --git a/docs/.drawio.svg b/docs/.drawio.svg index c2fcf5cc9..505da318f 100644 --- a/docs/.drawio.svg +++ b/docs/.drawio.svg @@ -1,4 +1,4 @@ - + @@ -115,13 +115,13 @@ - - - + + + -
+
@@ -133,18 +133,18 @@
- + fork(2) - - - + + + -
+
@@ -154,16 +154,16 @@
- + send identifier mapping request - + -
+
unshare(CLONE_NEWUSER) @@ -171,18 +171,18 @@
- + unshare(CLONE_NEWUSER) - + - + -
+
write uid mapping @@ -190,16 +190,16 @@
- + write uid mapping - + -
+
write gid mapping @@ -207,18 +207,18 @@
- + write gid mapping - - - + + + -
+
@@ -228,7 +228,7 @@
- + send mapping written @@ -410,13 +410,13 @@ - + - + -
+
setup cgroup @@ -424,17 +424,17 @@
- + setup cgroup - - + + -
+
unshare(CLONE_NEWPID) @@ -442,16 +442,16 @@
- + unshare(CLONE_NEWPID) - + -
+
set uid and gid @@ -459,7 +459,7 @@
- + set uid and gid