From b7a0f0d41196f7e5da7a1ba2be41fb73cb967b08 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Sun, 21 Apr 2024 20:19:23 +0100 Subject: [PATCH] Perform I/O in mnt namespace directly instead of calling into docker The current approach requires the binaries to be available inside the container. Change to perform I/O in the container's mount namespace directly. This is more robust and less expensive. --- Cargo.toml | 2 +- src/docker/container.rs | 117 +++++++++++----------------------------- src/util/mod.rs | 1 + src/util/namespace.rs | 42 +++++++++++++++ 4 files changed, 74 insertions(+), 88 deletions(-) create mode 100644 src/util/namespace.rs diff --git a/Cargo.toml b/Cargo.toml index 31ffca3..f7eedfd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ tokio-util = { version = "0.7", features = ["full"] } async-stream = "0.3" udev = "0.8" bollard = "0.16" -rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process"] } +rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process", "thread"] } bitflags = "2" aya = { git = "https://github.com/aya-rs/aya.git" } diff --git a/src/docker/container.rs b/src/docker/container.rs index b4b0a6d..e1f3257 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -3,7 +3,7 @@ use std::pin::pin; use std::sync::Arc; use anyhow::{anyhow, ensure, Context, Error, Result}; -use rustix::fs::{Gid, Uid}; +use rustix::fs::{FileType, Gid, Mode, Uid}; use rustix::process::{Pid, Signal}; use tokio::signal::unix::{signal, SignalKind}; use tokio::sync::Mutex; @@ -104,38 +104,6 @@ impl Container { Ok(result) } - pub async fn exec_as_root(&self, cmd: &[T]) -> Result { - let cmd = cmd.iter().map(|s| s.to_string()).collect(); - let options = bollard::exec::CreateExecOptions { - cmd: Some(cmd), - attach_stdin: Some(true), - attach_stdout: Some(true), - attach_stderr: Some(true), - tty: Some(true), - detach_keys: Some("ctrl-c".to_string()), - user: Some("root".to_string()), - ..Default::default() - }; - let response = self.docker.create_exec::(&self.id, options).await?; - let id = response.id; - - let options = bollard::exec::StartExecOptions { - detach: false, - ..Default::default() - }; - let response = self.docker.start_exec(&id, Some(options)).await?; - let bollard::exec::StartExecResults::Attached { input, output } = response else { - unreachable!("we asked for attached IO streams"); - }; - - Ok(IoStream { - output, - input, - source: IoStreamSource::Exec(id), - docker: self.docker.clone(), - }) - } - pub async fn attach(&self) -> Result { let options = bollard::container::AttachContainerOptions:: { stdin: Some(true), @@ -196,67 +164,42 @@ impl Container { Ok(u8::try_from(code).unwrap_or(1)) } - pub async fn chown_to_user(&self, path: &str) -> Result<()> { - // Use `-h` to not follow symlink - self.exec_as_root(&[ - "chown", - "-h", - &format!("{}:{}", self.uid.as_raw(), self.gid.as_raw()), - path, - ]) - .await? - .collect() - .await?; - Ok(()) - } - - // Note: we use `&str` here instead of `Path` because docker API expects string instead `OsStr`. - pub async fn mkdir(&self, path: &str) -> Result<()> { - self.exec_as_root(&["mkdir", "-p", path]) - .await? - .collect() - .await?; - Ok(()) - } - - pub async fn mkdir_for(&self, path: &str) -> Result<()> { - if let Some(path) = std::path::Path::new(path).parent() { - self.mkdir(path.to_str().unwrap()).await?; - } - Ok(()) - } - pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> { - self.rm(node).await?; - let node = node.to_str().context("node is not UTF-8")?; - self.mkdir_for(node).await?; - self.exec_as_root(&["mknod", node, "c", &major.to_string(), &minor.to_string()]) - .await? - .collect() - .await?; - self.chown_to_user(node).await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + if let Some(parent) = node.parent() { + let _ = std::fs::create_dir_all(parent); + } + let _ = std::fs::remove_file(node); + rustix::fs::mknodat( + rustix::fs::CWD, + node, + FileType::CharacterDevice, + Mode::from(0o644), + rustix::fs::makedev(major, minor), + )?; + if !self.uid.is_root() { + rustix::fs::chown(node, Some(self.uid), Some(self.gid))?; + } + Ok(()) + })? } pub async fn symlink(&self, source: &Path, link: &Path) -> Result<()> { - let source = source.to_str().context("node is not UTF-8")?; - let link = link.to_str().context("symlink is not UTF-8")?; - self.mkdir_for(link).await?; - self.exec_as_root(&["ln", "-sf", source, link]) - .await? - .collect() - .await?; - self.chown_to_user(link).await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + if let Some(parent) = link.parent() { + let _ = std::fs::create_dir_all(parent); + } + let _ = std::fs::remove_file(link); + std::os::unix::fs::symlink(source, link)?; + // No need to chown symlink. Permission is determined by the target. + Ok(()) + })? } pub async fn rm(&self, node: &Path) -> Result<()> { - let node = node.to_str().context("node is not UTF-8")?; - self.exec_as_root(&["rm", "-f", node]) - .await? - .collect() - .await?; - Ok(()) + crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + let _ = std::fs::remove_file(node); + }) } pub async fn device(&self, (major, minor): (u32, u32), access: Access) -> Result<()> { diff --git a/src/util/mod.rs b/src/util/mod.rs index 53a58f8..5d22c3e 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,2 +1,3 @@ pub mod escape; +pub mod namespace; pub mod tty_mode_guard; diff --git a/src/util/namespace.rs b/src/util/namespace.rs new file mode 100644 index 0000000..ab26164 --- /dev/null +++ b/src/util/namespace.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::os::fd::AsFd; + +use anyhow::Result; +use rustix::process::Pid; +use rustix::thread::{LinkNameSpaceType, UnshareFlags}; + +pub struct MntNamespace { + fd: File, +} + +impl MntNamespace { + /// Open the mount namespace of a process. + pub fn of_pid(pid: Pid) -> Result { + let path = format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()); + let fd = File::open(path)?; + Ok(MntNamespace { fd }) + } + + /// Enter the mount namespace. + pub fn enter T + Send>(&self, f: F) -> Result { + // To avoid messing with rest of the process, we do everything in a new thread. + // Use scoped thread to avoid 'static bound (we need to access fd). + std::thread::scope(|scope| { + scope + .spawn(|| -> Result { + // Unshare FS for this specific thread so we can switch to another namespace. + // Not doing this will cause EINVAL when switching to namespaces. + rustix::thread::unshare(UnshareFlags::FS)?; + + // Switch this particular thread to the container's mount namespace. + rustix::thread::move_into_link_name_space( + self.fd.as_fd(), + Some(LinkNameSpaceType::Mount), + )?; + Ok(f()) + }) + .join() + .map_err(|_| anyhow::anyhow!("work thread panicked"))? + }) + } +}