Skip to content

Commit

Permalink
Perform I/O in mnt namespace directly instead of calling into docker
Browse files Browse the repository at this point in the history
The current approach requires the binaries to be available inside the
container. Change to perform I/O in the container's mount namespace
directly. This is more robust and less expensive.
  • Loading branch information
nbdd0121 committed Apr 21, 2024
1 parent 55c88ef commit 568fd2e
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 88 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ async-stream = "0.3"
udev = "0.8"
bollard = "0.16"
futures = "0.3"
rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process"] }
rustix = { version = "0.38", features = ["fs", "stdio", "termios", "process", "thread"] }
bitflags = "2"
aya = { git = "https://github.com/aya-rs/aya.git" }

Expand Down
117 changes: 30 additions & 87 deletions src/docker/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use anyhow::{anyhow, ensure, Context, Error, Result};
use bollard::service::EventMessage;
use futures::future::{BoxFuture, Shared};
use futures::FutureExt;
use rustix::fs::{Gid, Uid};
use rustix::fs::{FileType, Gid, Mode, Uid};
use rustix::process::{Pid, Signal};
use tokio::signal::unix::{signal, SignalKind};
use tokio::sync::Mutex;
Expand Down Expand Up @@ -174,38 +174,6 @@ impl Container {
Ok(result)
}

/// Runs `cmd` inside this container as user `root` and returns the attached I/O stream.
///
/// A Docker exec instance is created with stdin, stdout and stderr attached and a TTY
/// allocated, then started in attached (non-detached) mode. The returned [`IoStream`]
/// records the exec id as its source and carries a clone of the Docker client.
///
/// # Errors
/// Propagates any error returned while creating or starting the exec.
pub async fn exec_as_root<T: ToString>(&self, cmd: &[T]) -> Result<IoStream> {
    // The exec API is used with owned strings, so stringify every argument up front.
    let argv: Vec<String> = cmd.iter().map(ToString::to_string).collect();

    let created = self
        .docker
        .create_exec::<String>(
            &self.id,
            bollard::exec::CreateExecOptions {
                cmd: Some(argv),
                attach_stdin: Some(true),
                attach_stdout: Some(true),
                attach_stderr: Some(true),
                tty: Some(true),
                detach_keys: Some("ctrl-c".to_string()),
                user: Some("root".to_string()),
                ..Default::default()
            },
        )
        .await?;
    let exec_id = created.id;

    let start_options = bollard::exec::StartExecOptions {
        detach: false,
        ..Default::default()
    };
    match self.docker.start_exec(&exec_id, Some(start_options)).await? {
        bollard::exec::StartExecResults::Attached { input, output } => Ok(IoStream {
            output,
            input,
            source: IoStreamSource::Exec(exec_id),
            docker: self.docker.clone(),
        }),
        // `detach: false` was requested above, so any other result is a logic error.
        _ => unreachable!("we asked for attached IO streams"),
    }
}

pub async fn attach(&self) -> Result<IoStream> {
let options = bollard::container::AttachContainerOptions::<String> {
stdin: Some(true),
Expand Down Expand Up @@ -264,67 +232,42 @@ impl Container {
}
}

/// Runs `chown -h <uid>:<gid> <path>` inside the container as root, using this
/// container's `uid` and `gid`.
///
/// The exec's stream is passed through `collect()` before returning
/// (presumably draining the command's output until it finishes; confirm against `IoStream`).
///
/// # Errors
/// Propagates errors from starting the exec or from collecting its stream.
pub async fn chown_to_user(&self, path: &str) -> Result<()> {
    let owner = format!("{}:{}", self.uid.as_raw(), self.gid.as_raw());
    // `-h` changes the symlink itself instead of following it to its target.
    let argv = ["chown", "-h", owner.as_str(), path];
    self.exec_as_root(&argv).await?.collect().await?;
    Ok(())
}

// Note: we use `&str` here instead of `Path` because the docker API expects a string rather than an `OsStr`.
/// Runs `mkdir -p <path>` inside the container as root.
///
/// # Errors
/// Propagates errors from starting the exec or from collecting its stream.
pub async fn mkdir(&self, path: &str) -> Result<()> {
    let argv = ["mkdir", "-p", path];
    self.exec_as_root(&argv).await?.collect().await?;
    Ok(())
}

/// Creates the parent directory of `path` (via [`Self::mkdir`]) so that `path` itself
/// can be created afterwards. Does nothing when `path` has no parent component.
///
/// # Errors
/// Propagates any error returned by `mkdir`.
pub async fn mkdir_for(&self, path: &str) -> Result<()> {
    let Some(parent) = std::path::Path::new(path).parent() else {
        return Ok(());
    };
    // `parent` is a sub-slice of a `&str`, so converting it back to `&str` cannot fail.
    self.mkdir(parent.to_str().unwrap()).await
}

/// Creates a character device node at `node` with the given `(major, minor)` numbers.
///
/// NOTE(review): this listing is a rendered diff, so two bodies appear back to back.
/// The first (up to the first `Ok(())`) goes through `exec_as_root`; the second performs
/// the same steps directly inside the mount namespace opened from `self.pid`.
pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> {
// Exec-based body: remove any existing node, create the parent directory, run
// `mknod <node> c <major> <minor>`, then chown the node to `self.uid`/`self.gid`.
self.rm(node).await?;
let node = node.to_str().context("node is not UTF-8")?;
self.mkdir_for(node).await?;
self.exec_as_root(&["mknod", node, "c", &major.to_string(), &minor.to_string()])
.await?
.collect()
.await?;
self.chown_to_user(node).await?;
Ok(())
// Namespace-based body: the closure runs on a thread that has entered the mount
// namespace, so the paths below resolve inside that namespace.
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
if let Some(parent) = node.parent() {
// Best effort: the result of creating the parent directories is discarded.
let _ = std::fs::create_dir_all(parent);
}
// Best effort: remove whatever currently exists at `node`; the result is discarded.
let _ = std::fs::remove_file(node);
rustix::fs::mknodat(
rustix::fs::CWD,
node,
FileType::CharacterDevice,
Mode::from(0o644),
rustix::fs::makedev(major, minor),
)?;
// Ownership is only changed when `self.uid` is not root.
if !self.uid.is_root() {
rustix::fs::chown(node, Some(self.uid), Some(self.gid))?;
}
Ok(())
// `enter` wraps the closure's own `Result`; `?` propagates a failure to enter the
// namespace and leaves the closure's result as this function's return value.
})?
}

/// Creates a symbolic link at `link` pointing to `source`.
///
/// NOTE(review): this listing is a rendered diff, so two bodies appear back to back.
/// The first (up to the first `Ok(())`) runs `ln -sf` through `exec_as_root`; the second
/// creates the link directly inside the mount namespace opened from `self.pid`.
pub async fn symlink(&self, source: &Path, link: &Path) -> Result<()> {
// Exec-based body: create the parent directory, run `ln -sf <source> <link>`,
// then chown the link to `self.uid`/`self.gid`.
let source = source.to_str().context("node is not UTF-8")?;
let link = link.to_str().context("symlink is not UTF-8")?;
self.mkdir_for(link).await?;
self.exec_as_root(&["ln", "-sf", source, link])
.await?
.collect()
.await?;
self.chown_to_user(link).await?;
Ok(())
// Namespace-based body: the closure runs on a thread that has entered the mount
// namespace, so the paths below resolve inside that namespace.
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
if let Some(parent) = link.parent() {
// Best effort: the result of creating the parent directories is discarded.
let _ = std::fs::create_dir_all(parent);
}
// Best effort: remove whatever currently exists at `link`; the result is discarded.
let _ = std::fs::remove_file(link);
std::os::unix::fs::symlink(source, link)?;
// No need to chown symlink. Permission is determined by the target.
Ok(())
// `enter` wraps the closure's own `Result`; `?` propagates a failure to enter the
// namespace and leaves the closure's result as this function's return value.
})?
}

/// Removes the file at `node`.
///
/// NOTE(review): this listing is a rendered diff, so two bodies appear back to back.
/// The first (up to `Ok(())`) runs `rm -f` through `exec_as_root`; the second removes the
/// file directly inside the mount namespace opened from `self.pid`.
pub async fn rm(&self, node: &Path) -> Result<()> {
// Exec-based body: run `rm -f <node>` as root inside the container.
let node = node.to_str().context("node is not UTF-8")?;
self.exec_as_root(&["rm", "-f", node])
.await?
.collect()
.await?;
Ok(())
// Namespace-based body: the result of `remove_file` is discarded, so only a failure
// to open or enter the namespace is reported to the caller.
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
let _ = std::fs::remove_file(node);
})
}

pub async fn device(&self, (major, minor): (u32, u32), access: Access) -> Result<()> {
Expand Down
1 change: 1 addition & 0 deletions src/util/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod escape;
pub mod namespace;
pub mod tty_mode_guard;
42 changes: 42 additions & 0 deletions src/util/namespace.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use std::fs::File;
use std::os::fd::AsFd;

use anyhow::Result;
use rustix::process::Pid;
use rustix::thread::{LinkNameSpaceType, UnshareFlags};

/// Handle to a process's mount namespace.
///
/// Wraps the open file behind `/proc/<pid>/ns/mnt` (see `of_pid`); the file is closed
/// when the handle is dropped.
#[derive(Debug)]
pub struct MntNamespace {
    /// Open namespace file; its descriptor is what `enter` passes to the namespace switch.
    fd: File,
}

impl MntNamespace {
/// Open the mount namespace of a process.
pub fn of_pid(pid: Pid) -> Result<MntNamespace> {
let path = format!("/proc/{}/ns/mnt", pid.as_raw_nonzero());
let fd = File::open(path)?;
Ok(MntNamespace { fd })
}

/// Enter the mount namespace.
pub fn enter<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
// To avoid messing with rest of the process, we do everything in a new thread.
// Use scoped thread to avoid 'static bound (we need to access fd).
std::thread::scope(|scope| {
scope
.spawn(|| -> Result<T> {
// Unshare FS for this specific thread so we can switch to another namespace.
// Not doing this will cause EINVAL when switching to namespaces.
rustix::thread::unshare(UnshareFlags::FS)?;

// Switch this particular thread to the container's mount namespace.
rustix::thread::move_into_link_name_space(
self.fd.as_fd(),
Some(LinkNameSpaceType::Mount),
)?;
Ok(f())
})
.join()
.map_err(|_| anyhow::anyhow!("work thread panicked"))?
})
}
}

0 comments on commit 568fd2e

Please sign in to comment.