From 2075df99cc3fba08a888332b519e8b03bfe469c0 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 21 Nov 2023 15:57:22 +0100 Subject: [PATCH] PVF: Fix unshare "no such file or directory" error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In https://github.com/paritytech/polkadot-sdk/pull/2304 we introduced this bug where unshare will always fail, leading to a scary error message for validators. Not sure how this wasn't caught in testing, neither for me nor the user who reported #2304. 🙈 Closes #2320 --- polkadot/node/core/pvf/src/artifacts.rs | 5 +- .../node/core/pvf/src/execute/worker_intf.rs | 4 +- .../node/core/pvf/src/prepare/worker_intf.rs | 4 +- polkadot/node/core/pvf/src/security.rs | 17 ++- polkadot/node/core/pvf/src/worker_intf.rs | 105 ++++++++---------- 5 files changed, 60 insertions(+), 75 deletions(-) diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs index 53085eade3cb2..79b53467b4e33 100644 --- a/polkadot/node/core/pvf/src/artifacts.rs +++ b/polkadot/node/core/pvf/src/artifacts.rs @@ -499,8 +499,7 @@ mod tests { #[tokio::test] async fn remove_stale_cache_on_startup() { - let cache_dir = crate::worker_intf::tmppath("test-cache").await.unwrap(); - fs::create_dir_all(&cache_dir).unwrap(); + let cache_dir = tempfile::Builder::new().prefix("test-cache-").tempdir().unwrap(); // invalid prefix create_rand_artifact(&cache_dir, ""); @@ -529,7 +528,7 @@ mod tests { assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 7); - let artifacts = Artifacts::new_and_prune(&cache_dir).await; + let artifacts = Artifacts::new_and_prune(cache_dir.path()).await; assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 1); assert_eq!(artifacts.len(), 1); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 4faaf13e62f74..fc6cd8fc72565 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -278,7 +278,7 @@ where // Cheaply create a hard link to the artifact. The artifact is always at a known location in the // worker cache, and the child can't access any other artifacts or gain any information from the // original filename. - let link_path = worker_dir::execute_artifact(&worker_dir.path); + let link_path = worker_dir::execute_artifact(worker_dir.path()); if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await { gum::warn!( target: LOG_TARGET, @@ -292,7 +292,7 @@ where } } - let worker_dir_path = worker_dir.path.clone(); + let worker_dir_path = worker_dir.path().to_owned(); let outcome = f(worker_dir).await; // Try to clear the worker dir. diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 318aea7295ded..f978005068c8e 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -318,7 +318,7 @@ where { // Create the tmp file here so that the child doesn't need any file creation rights. This will // be cleared at the end of this function. - let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path); + let tmp_file = worker_dir::prepare_tmp_artifact(worker_dir.path()); if let Err(err) = tokio::fs::File::create(&tmp_file).await { gum::warn!( target: LOG_TARGET, @@ -333,7 +333,7 @@ where } }; - let worker_dir_path = worker_dir.path.clone(); + let worker_dir_path = worker_dir.path().to_owned(); let outcome = f(tmp_file, stream, worker_dir).await; // Try to clear the worker dir. diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index 8c06c68392f48..8ae09158fe7a7 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -158,18 +158,15 @@ async fn check_can_unshare_user_namespace_and_change_root( ) -> SecureModeResult { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - let cache_dir_tempdir = - crate::worker_intf::tmppath_in("check-can-unshare", cache_path) - .await - .map_err( - |err| - SecureModeError::CannotUnshareUserNamespaceAndChangeRoot( - format!("could not create a temporary directory in {:?}: {}", cache_path, err) - ) - )?; + let cache_dir_tempdir = tempfile::Builder::new() + .prefix("check-can-unshare-") + .tempdir_in(cache_path) + .map_err(|err| SecureModeError::CannotUnshareUserNamespaceAndChangeRoot( + format!("could not create a temporary directory in {:?}: {}", cache_path, err) + ))?; match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") - .arg(cache_dir_tempdir) + .arg(cache_dir_tempdir.path()) .output() .await { diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 5e589b9abcee8..9d6907c10929f 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -71,6 +71,7 @@ pub async fn spawn_with_program_path( with_transient_socket_path(debug_id, |socket_path| { let socket_path = socket_path.to_owned(); + let worker_dir_path = worker_dir.path().to_owned(); async move { let listener = UnixListener::bind(&socket_path).map_err(|err| { @@ -91,7 +92,7 @@ pub async fn spawn_with_program_path( &program_path, &extra_args, &socket_path, - &worker_dir.path, + &worker_dir_path, security_status, ) .map_err(|err| { @@ -100,7 +101,7 @@ pub async fn spawn_with_program_path( %debug_id, ?program_path, ?extra_args, - ?worker_dir.path, + ?worker_dir_path, ?socket_path, "cannot spawn a worker: {:?}", err, @@ -108,7 +109,6 @@ pub async fn spawn_with_program_path( SpawnErr::ProcessSpawn })?; - let worker_dir_path = worker_dir.path.clone(); futures::select! { accept_result = listener.accept().fuse() => { let (stream, _) = accept_result.map_err(|err| { @@ -150,7 +150,42 @@ where F: FnOnce(&Path) -> Fut, Fut: futures::Future> + 'static, { - let socket_path = tmppath(&format!("pvf-host-{}", debug_id)) + /// Returns a path under [`std::env::temp_dir`]. The path name will start with the given prefix. + /// + /// There is only a certain number of retries. If exceeded this function will give up and return + /// an error. + pub async fn tmppath(prefix: &str) -> io::Result { + fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf { + use rand::distributions::Alphanumeric; + + const DESCRIMINATOR_LEN: usize = 10; + + let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN); + buf.extend(prefix.as_bytes()); + buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN)); + + let s = std::str::from_utf8(&buf) + .expect("the string is collected from a valid utf-8 sequence; qed"); + + let mut path = dir.to_owned(); + path.push(s); + path + } + + const NUM_RETRIES: usize = 50; + + let dir = std::env::temp_dir(); + for _ in 0..NUM_RETRIES { + let tmp_path = make_tmppath(prefix, &dir); + if !tmp_path.exists() { + return Ok(tmp_path) + } + } + + Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")) + } + + let socket_path = tmppath(&format!("pvf-host-{}-", debug_id)) .await .map_err(|_| SpawnErr::TmpPath)?; let result = f(&socket_path).await; @@ -162,46 +197,6 @@ where result } -/// Returns a path under the given `dir`. The path name will start with the given prefix. -/// -/// There is only a certain number of retries. If exceeded this function will give up and return an -/// error. -pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result { - fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf { - use rand::distributions::Alphanumeric; - - const DESCRIMINATOR_LEN: usize = 10; - - let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN); - buf.extend(prefix.as_bytes()); - buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN)); - - let s = std::str::from_utf8(&buf) - .expect("the string is collected from a valid utf-8 sequence; qed"); - - let mut path = dir.to_owned(); - path.push(s); - path - } - - const NUM_RETRIES: usize = 50; - - for _ in 0..NUM_RETRIES { - let tmp_path = make_tmppath(prefix, dir); - if !tmp_path.exists() { - return Ok(tmp_path) - } - } - - Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")) -} - -/// The same as [`tmppath_in`], but uses [`std::env::temp_dir`] as the directory. -pub async fn tmppath(prefix: &str) -> io::Result { - let temp_dir = std::env::temp_dir(); - tmppath_in(prefix, &temp_dir).await -} - /// A struct that represents an idle worker. /// /// This struct is supposed to be used as a token that is passed by move into a subroutine that @@ -224,8 +219,6 @@ pub struct IdleWorker { pub enum SpawnErr { /// Cannot obtain a temporary path location. TmpPath, - /// An FS error occurred. - Fs(String), /// Cannot bind the socket to the given path. Bind, /// An error happened during accepting a connection to the socket. @@ -419,26 +412,22 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result /// ``` #[derive(Debug)] pub struct WorkerDir { - pub path: PathBuf, + tempdir: tempfile::TempDir, } impl WorkerDir { /// Creates a new, empty worker dir with a random name in the given cache dir. pub async fn new(debug_id: &'static str, cache_dir: &Path) -> Result { let prefix = format!("worker-dir-{}-", debug_id); - let path = tmppath_in(&prefix, cache_dir).await.map_err(|_| SpawnErr::TmpPath)?; - tokio::fs::create_dir(&path) - .await - .map_err(|err| SpawnErr::Fs(err.to_string()))?; - Ok(Self { path }) + let tempdir = tempfile::Builder::new() + .prefix(&prefix) + .tempdir_in(cache_dir) + .map_err(|_| SpawnErr::TmpPath)?; + Ok(Self { tempdir }) } -} -// Try to clean up the temporary worker dir at the end of the worker's lifetime. It should be wiped -// on startup, but we make a best effort not to leave it around. -impl Drop for WorkerDir { - fn drop(&mut self) { - let _ = std::fs::remove_dir_all(&self.path); + pub fn path(&self) -> &Path { + self.tempdir.path() } }