From eab7e49506495633cc2aa77c58af89e46d226f5d Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 18 Apr 2024 12:30:18 +0100 Subject: [PATCH 1/4] Kill container process directly instead of using docker API --- src/docker/container.rs | 16 ++++++++++------ src/docker/docker.rs | 7 ++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/docker/container.rs b/src/docker/container.rs index f8f67f4..5c786b0 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -3,7 +3,7 @@ use std::pin::pin; use std::sync::Arc; use anyhow::{anyhow, Context, Error, Result}; -use rustix::process::Signal; +use rustix::process::{Pid, Signal}; use tokio::signal::unix::{signal, SignalKind}; use tokio::sync::Mutex; use tokio::task::JoinHandle; @@ -17,12 +17,18 @@ use crate::cgroup::{ pub struct Container { docker: bollard::Docker, id: String, + pid: Pid, user: String, cgroup_device_filter: Mutex>>, } impl Container { - pub(super) fn new(docker: &bollard::Docker, id: String, user: String) -> Result { + pub(super) fn new( + docker: &bollard::Docker, + id: String, + pid: u32, + user: String, + ) -> Result { // Dropping the device filter will cause the container to have arbitrary device access. // So keep it alive until we're sure that the container is stopped. let cgroup_device_filter: Option> = @@ -46,6 +52,7 @@ impl Container { Ok(Self { docker: docker.clone(), id, + pid: Pid::from_raw(pid.try_into()?).context("Invalid PID")?, user: if user.is_empty() { // If user is not specified, use root. "root".to_owned() @@ -125,10 +132,7 @@ impl Container { } pub async fn kill(&self, signal: Signal) -> Result<()> { - let options = bollard::container::KillContainerOptions { - signal: format!("{}", signal as i32), - }; - self.docker.kill_container(&self.id, Some(options)).await?; + rustix::process::kill_process(self.pid, signal).context("Failed to kill container init")?; Ok(()) } diff --git a/src/docker/docker.rs b/src/docker/docker.rs index acd1266..d307e5d 100644 --- a/src/docker/docker.rs +++ b/src/docker/docker.rs @@ -16,7 +16,12 @@ impl Docker { .config .context("Failed to obtain container config")?; let user = config.user.context("Failed to obtain container user")?; - Container::new(&self.0, id, user) + let pid = response + .state + .context("Failed to obtain container state")? + .pid + .context("Failed to obtain container pid")?; + Container::new(&self.0, id, pid.try_into()?, user) } pub async fn run, T: AsRef<[U]>>(&self, args: T) -> Result { From 14541d467cc6a0fcdf66dc72c43d41f7f943b4cd Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 18 Apr 2024 13:38:36 +0100 Subject: [PATCH 2/4] Collect uid and gid from container using `id` command --- src/docker/container.rs | 84 ++++++++++++++++++++++++++++++----------- src/docker/docker.rs | 6 +-- 2 files changed, 64 insertions(+), 26 deletions(-) diff --git a/src/docker/container.rs b/src/docker/container.rs index 5c786b0..b4b0a6d 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -2,7 +2,8 @@ use std::path::Path; use std::pin::pin; use std::sync::Arc; -use anyhow::{anyhow, Context, Error, Result}; +use anyhow::{anyhow, ensure, Context, Error, Result}; +use rustix::fs::{Gid, Uid}; use rustix::process::{Pid, Signal}; use tokio::signal::unix::{signal, SignalKind}; use tokio::sync::Mutex; @@ -18,17 +19,13 @@ pub struct Container { docker: bollard::Docker, id: String, pid: Pid, - user: String, + uid: Uid, + gid: Gid, cgroup_device_filter: Mutex>>, } impl Container { - pub(super) fn new( - docker: &bollard::Docker, - id: String, - pid: u32, - user: String, - ) -> Result { + pub(super) async fn new(docker: &bollard::Docker, id: String, pid: u32) -> Result { // Dropping the device filter will cause the container to have arbitrary device access. // So keep it alive until we're sure that the container is stopped. let cgroup_device_filter: Option> = @@ -49,24 +46,64 @@ impl Container { }, }; - Ok(Self { + let mut this = Self { docker: docker.clone(), id, pid: Pid::from_raw(pid.try_into()?).context("Invalid PID")?, - user: if user.is_empty() { - // If user is not specified, use root. - "root".to_owned() - } else { - user - }, + uid: Uid::ROOT, + gid: Gid::ROOT, cgroup_device_filter: Mutex::new(cgroup_device_filter), - }) + }; + + let uid = this.exec(&["id", "-u"]).await?.trim().parse()?; + let gid = this.exec(&["id", "-g"]).await?.trim().parse()?; + // Only invalid uid/gid for Linux is negative one. + ensure!(uid != u32::MAX && gid != u32::MAX); + // SAFETY: We just checked that the uid/gid is not -1. + this.uid = unsafe { Uid::from_raw(uid) }; + this.gid = unsafe { Gid::from_raw(gid) }; + + Ok(this) } pub fn id(&self) -> &str { &self.id } + pub async fn exec(&self, cmd: &[T]) -> Result { + let cmd = cmd.iter().map(|s| s.to_string()).collect(); + let options = bollard::exec::CreateExecOptions { + cmd: Some(cmd), + attach_stdin: Some(true), + attach_stdout: Some(true), + attach_stderr: Some(true), + tty: Some(true), + detach_keys: Some("ctrl-c".to_string()), + ..Default::default() + }; + let response = self.docker.create_exec::(&self.id, options).await?; + let id = response.id; + + let options = bollard::exec::StartExecOptions { + detach: false, + ..Default::default() + }; + let response = self.docker.start_exec(&id, Some(options)).await?; + let bollard::exec::StartExecResults::Attached { + input: _, + mut output, + } = response + else { + unreachable!("we asked for attached IO streams"); + }; + + let mut result = String::new(); + while let Some(output) = output.next().await { + result.push_str(&output?.to_string()); + } + Ok(result) + } + pub async fn exec_as_root(&self, cmd: &[T]) -> Result { let cmd = cmd.iter().map(|s| s.to_string()).collect(); let options = bollard::exec::CreateExecOptions { @@ -160,11 +197,16 @@ impl Container { } pub async fn chown_to_user(&self, path: &str) -> Result<()> { - // Use `-h` to not follow symlink, and `user:` will use user's login group. - self.exec_as_root(&["chown", "-h", &format!("{}:", self.user), path]) - .await? - .collect() - .await?; + // Use `-h` to not follow symlink + self.exec_as_root(&[ + "chown", + "-h", + &format!("{}:{}", self.uid.as_raw(), self.gid.as_raw()), + path, + ]) + .await? + .collect() + .await?; Ok(()) } diff --git a/src/docker/docker.rs b/src/docker/docker.rs index d307e5d..15f42c2 100644 --- a/src/docker/docker.rs +++ b/src/docker/docker.rs @@ -12,16 +12,12 @@ impl Docker { pub async fn get>(&self, name: T) -> Result { let response = self.0.inspect_container(name.as_ref(), None).await?; let id = response.id.context("Failed to obtain container ID")?; - let config = response - .config - .context("Failed to obtain container config")?; - let user = config.user.context("Failed to obtain container user")?; let pid = response .state .context("Failed to obtain container state")? .pid .context("Failed to obtain container pid")?; - Container::new(&self.0, id, pid.try_into()?, user) + Container::new(&self.0, id, pid.try_into()?).await } pub async fn run, T: AsRef<[U]>>(&self, args: T) -> Result { From b8fe331dfadd4ca43c63100409d83032af83774a Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Sun, 21 Apr 2024 20:19:23 +0100 Subject: [PATCH 3/4] Perform I/O in mnt namespace directly instead of calling into docker The current approach requires the binaries to be available inside the container. Change to perform I/O in the container's mount namespace directly. This is more robust and less expensive. --- Cargo.toml | 2 +- src/docker/container.rs | 117 +++++++++++----------------------------- src/util/mod.rs | 1 + src/util/namespace.rs | 42 +++++++++++++++ 4 files changed, 74 insertions(+), 88 deletions(-) create mode 100644 src/util/namespace.rs diff --git a/Cargo.toml b/Cargo.toml index 31ffca3..f7eedfd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ tokio-util = { version = "0.7", features = ["full"] } async-stream = "0.3" udev = "0.8" bollard = "0.16" -rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process"] } +rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process", "thread"] } bitflags = "2" aya = { git = "https://github.com/aya-rs/aya.git" } diff --git a/src/docker/container.rs b/src/docker/container.rs index b4b0a6d..e1f3257 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -3,7 +3,7 @@ use std::pin::pin; use std::sync::Arc; use anyhow::{anyhow, ensure, Context, Error, Result}; -use rustix::fs::{Gid, Uid}; +use rustix::fs::{FileType, Gid, Mode, Uid}; use rustix::process::{Pid, Signal}; use tokio::signal::unix::{signal, SignalKind}; use tokio::sync::Mutex; @@ -104,38 +104,6 @@ impl Container { Ok(result) } - pub async fn exec_as_root(&self, cmd: &[T]) -> Result { - let cmd = cmd.iter().map(|s| s.to_string()).collect(); - let options = bollard::exec::CreateExecOptions { - cmd: Some(cmd), - attach_stdin: Some(true), - attach_stdout: Some(true), - attach_stderr: Some(true), - tty: Some(true), - detach_keys: Some("ctrl-c".to_string()), - user: Some("root".to_string()), - ..Default::default() - }; - let response = self.docker.create_exec::(&self.id, options).await?; - let id = response.id; - - let options = bollard::exec::StartExecOptions { - detach: false, - ..Default::default() - }; - let response = self.docker.start_exec(&id, Some(options)).await?; - let bollard::exec::StartExecResults::Attached { input, output } = response else { - unreachable!("we asked for attached IO streams"); - }; - - Ok(IoStream { - output, - input, - source: IoStreamSource::Exec(id), - docker: self.docker.clone(), - }) - } - pub async fn attach(&self) -> Result { let options = bollard::container::AttachContainerOptions:: { stdin: Some(true), @@ -196,67 +164,42 @@ impl Container { Ok(u8::try_from(code).unwrap_or(1)) } - pub async fn chown_to_user(&self, path: &str) -> Result<()> { - // Use `-h` to not follow symlink - self.exec_as_root(&[ - "chown", - "-h", - &format!("{}:{}", self.uid.as_raw(), self.gid.as_raw()), - path, - ]) - .await? - .collect() - .await?; - Ok(()) - } - - // Note: we use `&str` here instead of `Path` because docker API expects string instead `OsStr`. - pub async fn mkdir(&self, path: &str) -> Result<()> { - self.exec_as_root(&["mkdir", "-p", path]) - .await? - .collect() - .await?; - Ok(()) - } - - pub async fn mkdir_for(&self, path: &str) -> Result<()> { - if let Some(path) = std::path::Path::new(path).parent() { - self.mkdir(path.to_str().unwrap()).await?; - } - Ok(()) - } - pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> { - self.rm(node).await?; - let node = node.to_str().context("node is not UTF-8")?; - self.mkdir_for(node).await?; - self.exec_as_root(&["mknod", node, "c", &major.to_string(), &minor.to_string()]) - .await? - .collect() - .await?; - self.chown_to_user(node).await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + if let Some(parent) = node.parent() { + let _ = std::fs::create_dir_all(parent); + } + let _ = std::fs::remove_file(node); + rustix::fs::mknodat( + rustix::fs::CWD, + node, + FileType::CharacterDevice, + Mode::from(0o644), + rustix::fs::makedev(major, minor), + )?; + if !self.uid.is_root() { + rustix::fs::chown(node, Some(self.uid), Some(self.gid))?; + } + Ok(()) + })? } pub async fn symlink(&self, source: &Path, link: &Path) -> Result<()> { - let source = source.to_str().context("node is not UTF-8")?; - let link = link.to_str().context("symlink is not UTF-8")?; - self.mkdir_for(link).await?; - self.exec_as_root(&["ln", "-sf", source, link]) - .await? - .collect() - .await?; - self.chown_to_user(link).await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + if let Some(parent) = link.parent() { + let _ = std::fs::create_dir_all(parent); + } + let _ = std::fs::remove_file(link); + std::os::unix::fs::symlink(source, link)?; + // No need to chown symlink. Permission is determined by the target. + Ok(()) + })? } pub async fn rm(&self, node: &Path) -> Result<()> { - let node = node.to_str().context("node is not UTF-8")?; - self.exec_as_root(&["rm", "-f", node]) - .await? - .collect() - .await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + let _ = std::fs::remove_file(node); + }) } pub async fn device(&self, (major, minor): (u32, u32), access: Access) -> Result<()> { diff --git a/src/util/mod.rs b/src/util/mod.rs index 53a58f8..5d22c3e 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,2 +1,3 @@ pub mod escape; +pub mod namespace; pub mod tty_mode_guard; diff --git a/src/util/namespace.rs b/src/util/namespace.rs new file mode 100644 index 0000000..ab26164 --- /dev/null +++ b/src/util/namespace.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::os::fd::AsFd; + +use anyhow::Result; +use rustix::process::Pid; +use rustix::thread::{LinkNameSpaceType, UnshareFlags}; + +pub struct MntNamespace { + fd: File, +} + +impl MntNamespace { + /// Open the mount namespace of a process. + pub fn of_pid(pid: Pid) -> Result { + let path = format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()); + let fd = File::open(path)?; + Ok(MntNamespace { fd }) + } + + /// Enter the mount namespace. + pub fn enter T + Send>(&self, f: F) -> Result { + // To avoid messing with rest of the process, we do everything in a new thread. + // Use scoped thread to avoid 'static bound (we need to access fd). + std::thread::scope(|scope| { + scope + .spawn(|| -> Result { + // Unshare FS for this specific thread so we can switch to another namespace. + // Not doing this will cause EINVAL when switching to namespaces. + rustix::thread::unshare(UnshareFlags::FS)?; + + // Switch this particular thread to the container's mount namespace. + rustix::thread::move_into_link_name_space( + self.fd.as_fd(), + Some(LinkNameSpaceType::Mount), + )?; + Ok(f()) + }) + .join() + .map_err(|_| anyhow::anyhow!("work thread panicked"))? + }) + } +} From bd4ad3b75c74e8d3c4aec0bf19383eeafd023227 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Thu, 18 Apr 2024 14:33:00 +0100 Subject: [PATCH 4/4] Remove IoStreamSource We no longer use it with exec. --- src/docker/container.rs | 4 ++-- src/docker/iostream.rs | 44 ++++++++--------------------------------- src/docker/mod.rs | 2 -- 3 files changed, 10 insertions(+), 40 deletions(-) diff --git a/src/docker/container.rs b/src/docker/container.rs index e1f3257..8b8faf9 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -10,7 +10,7 @@ use tokio::sync::Mutex; use tokio::task::JoinHandle; use tokio_stream::StreamExt; -use super::{IoStream, IoStreamSource}; +use super::IoStream; use crate::cgroup::{ Access, DeviceAccessController, DeviceAccessControllerV1, DeviceAccessControllerV2, DeviceType, }; @@ -122,7 +122,7 @@ impl Container { Ok(IoStream { output: response.output, input: response.input, - source: IoStreamSource::Container(self.id.clone()), + id: self.id.clone(), docker: self.docker.clone(), }) } diff --git a/src/docker/iostream.rs b/src/docker/iostream.rs index 063bfa4..d07e220 100644 --- a/src/docker/iostream.rs +++ b/src/docker/iostream.rs @@ -10,15 +10,10 @@ use tokio::task::JoinHandle; use tokio_stream::{Stream, StreamExt}; use tokio_util::io::ReaderStream; -pub(super) enum IoStreamSource { - Container(String), - Exec(String), -} - pub struct IoStream { pub output: std::pin::Pin> + Send>>, pub input: Pin>, - pub(super) source: IoStreamSource, + pub(super) id: String, pub(super) docker: bollard::Docker, } @@ -30,14 +25,6 @@ enum StreamData { } impl IoStream { - pub async fn collect(mut self) -> Result { - let mut result = String::default(); - while let Some(output) = self.output.next().await { - result.push_str(&output?.to_string()); - } - Ok(result) - } - pub fn pipe_std(self) -> JoinHandle> { let stdin = crate::util::tty_mode_guard::TtyModeGuard::new(tokio::io::stdin(), |mode| { // Switch input to raw mode, but don't touch output modes (as it can also be connected @@ -72,7 +59,7 @@ impl IoStream { ) -> JoinHandle> { let mut input = self.input; let docker = self.docker; - let source = self.source; + let id = self.id; let resize_stream = resize_stream.map(|data| { let (rows, cols) = data.context("Listening for tty resize")?; @@ -101,7 +88,7 @@ impl IoStream { while let Some(data) = streams.next().await { match data? { StreamData::Resize(rows, cols) => { - resize_tty(&docker, &source, (rows, cols)).await?; + resize_tty(&docker, &id, (rows, cols)).await?; } StreamData::StdIn(mut buf) => { input.write_all_buf(&mut buf).await?; @@ -123,26 +110,11 @@ impl IoStream { } } -async fn resize_tty( - docker: &bollard::Docker, - source: &IoStreamSource, - (rows, cols): (u16, u16), -) -> Result<()> { - match source { - IoStreamSource::Container(id) => { - let options = bollard::container::ResizeContainerTtyOptions { - height: rows, - width: cols, - }; - docker.resize_container_tty(id, options).await?; - } - IoStreamSource::Exec(id) => { - let options = bollard::exec::ResizeExecOptions { - height: rows, - width: cols, - }; - docker.resize_exec(id, options).await?; - } +async fn resize_tty(docker: &bollard::Docker, id: &str, (rows, cols): (u16, u16)) -> Result<()> { + let options = bollard::container::ResizeContainerTtyOptions { + height: rows, + width: cols, }; + docker.resize_container_tty(id, options).await?; Ok(()) } diff --git a/src/docker/mod.rs b/src/docker/mod.rs index 4025f5d..32c48f5 100644 --- a/src/docker/mod.rs +++ b/src/docker/mod.rs @@ -5,5 +5,3 @@ mod iostream; pub use container::Container; pub use docker::Docker; pub use iostream::IoStream; - -use iostream::IoStreamSource;