diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 929399a..7d59dbe 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -28,8 +28,8 @@ jobs: - name: Clippy run: cargo clippy -- -D warnings - test: - name: Test + test-integration: + name: Integration Test runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -51,5 +51,31 @@ jobs: - name: Compile tests run: cargo nextest run --verbose --no-run - - name: Run tests - run: cargo nextest run --no-fail-fast + - name: Run integration tests + run: cargo nextest run --no-fail-fast -E 'binary(integration)' + + test-unit: + name: Unit Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Install cargo-nextest + uses: taiki-e/install-action@nextest + + - name: Compile tests + run: cargo nextest run --verbose --no-run + + - name: Run unit tests + run: cargo nextest run --no-fail-fast -E 'not binary(integration)' diff --git a/src/main.rs b/src/main.rs index f457f9c..509885f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -88,8 +88,17 @@ enum Command { /// - from-first-glob | g: the key path relative to the first path part /// containing a glob in the pattern will be reproduced in the /// destination + /// - shortest | s: the shortest path that can be made without conflicts. + /// This strips the longest common directory prefix from the key path. #[clap(short, long, verbatim_doc_comment, default_value = "from-first-glob")] - path_mode: PathType, + path_mode: PathMode, + + /// Flatten the downloaded files into a single directory + /// + /// This will replace all slashes in the key path with dashes in the + /// downloaded file. + #[clap(long)] + flatten: bool, }, /// Learn how to tune s3glob's parallelism for better performance @@ -134,37 +143,44 @@ enum Command { } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum PathType { +enum PathMode { Abs, Absolute, G, FromFirstGlob, + S, + Shortest, } -impl ValueEnum for PathType { +impl ValueEnum for PathMode { fn value_variants<'a>() -> &'a [Self] { &[ - PathType::Absolute, - PathType::Abs, - PathType::FromFirstGlob, - PathType::G, + PathMode::Absolute, + PathMode::Abs, + PathMode::FromFirstGlob, + PathMode::G, + PathMode::S, + PathMode::Shortest, ] } fn from_str(s: &str, _ignore_case: bool) -> Result { match s { - "absolute" | "abs" => Ok(PathType::Absolute), - "from-first-glob" | "g" => Ok(PathType::FromFirstGlob), + "absolute" | "abs" => Ok(PathMode::Absolute), + "from-first-glob" | "g" => Ok(PathMode::FromFirstGlob), + "shortest" | "s" => Ok(PathMode::Shortest), _ => Err(format!("invalid path type: {}", s)), } } fn to_possible_value(&self) -> Option { match self { - PathType::Abs => Some(clap::builder::PossibleValue::new("abs")), - PathType::Absolute => Some(clap::builder::PossibleValue::new("absolute")), - PathType::FromFirstGlob => Some(clap::builder::PossibleValue::new("from-first-glob")), - PathType::G => Some(clap::builder::PossibleValue::new("g")), + PathMode::Abs => Some(clap::builder::PossibleValue::new("abs")), + PathMode::Absolute => Some(clap::builder::PossibleValue::new("absolute")), + PathMode::FromFirstGlob => Some(clap::builder::PossibleValue::new("from-first-glob")), + PathMode::G => Some(clap::builder::PossibleValue::new("g")), + PathMode::Shortest => Some(clap::builder::PossibleValue::new("shortest")), + PathMode::S => Some(clap::builder::PossibleValue::new("s")), } } } @@ -383,14 +399,26 @@ async fn run(opts: Opts) -> Result<()> { ); } Command::Download { - dest, path_mode, .. + dest, + path_mode, + flatten, + .. } => { let mut total_matches = 0; let pools = DlPools::new(); - let prefix_to_strip = extract_prefix_to_strip(&raw_pattern, path_mode); + let prefix_to_strip = extract_prefix_to_strip(&raw_pattern, path_mode, &[]); let (ntfctn_tx, mut ntfctn_rx) = tokio::sync::mpsc::unbounded_channel::(); let base_path = PathBuf::from(dest); - let dl = Downloader::new(client, bucket, prefix_to_strip, base_path, ntfctn_tx); + let dl = Downloader::new( + client.clone(), + bucket.clone(), + prefix_to_strip, + flatten, + base_path.clone(), + ntfctn_tx.clone(), + ); + // if the path_mode is shortes then we need to know all the paths to be able to extract the shortest + let mut objects_to_download = Vec::new(); while let Some(result) = rx.recv().await { total_matches += result .iter() @@ -399,7 +427,11 @@ async fn run(opts: Opts) -> Result<()> { for obj in result { match obj { PrefixResult::Object(obj) => { - pools.download_object(dl.fresh(), obj); + if matches!(path_mode, PathMode::Shortest | PathMode::S) { + objects_to_download.push(obj); + } else { + pools.download_object(dl.fresh(), obj); + } } PrefixResult::Prefix(prefix) => { debug!("Skipping prefix: {}", prefix); @@ -417,9 +449,32 @@ async fn run(opts: Opts) -> Result<()> { total_prefixes ); } + debug!("closing downloader tx"); // close the tx so the downloaders know to finish - drop(pools); drop(dl); + drop(pools); + if matches!(path_mode, PathMode::Shortest | PathMode::S) { + let prefix_to_strip = + extract_prefix_to_strip(&raw_pattern, path_mode, &objects_to_download); + eprintln!( + "Stripping longest common prefix from keys: {}", + prefix_to_strip + ); + let dl = Downloader::new( + client, + bucket, + prefix_to_strip, + flatten, + base_path, + ntfctn_tx, + ); + let pools = DlPools::new(); + for obj in objects_to_download { + pools.download_object(dl.fresh(), obj); + } + } else { + drop(ntfctn_tx); + } eprintln!(); let start_time = Instant::now(); let mut downloaded_matches = 0; @@ -489,7 +544,8 @@ fn print_prefix_result( } } PrefixResult::Prefix(prefix) => { - println!(" {prefix}"); + // TODO: support user-customizable prefix formatting too? + println!("PRE {prefix}"); } } } @@ -575,6 +631,7 @@ struct Downloader { client: Client, bucket: String, prefix_to_strip: String, + flatten: bool, base_path: PathBuf, obj_counter: Arc, obj_id: usize, @@ -592,6 +649,7 @@ impl Downloader { client: Client, bucket: String, prefix_to_strip: String, + flatten: bool, base_path: PathBuf, notifier: UnboundedSender, ) -> Self { @@ -602,6 +660,7 @@ impl Downloader { obj_id: 0, notifier, base_path, + flatten, prefix_to_strip, } } @@ -616,15 +675,20 @@ impl Downloader { obj_id, notifier: self.notifier.clone(), prefix_to_strip: self.prefix_to_strip.clone(), + flatten: self.flatten, base_path: self.base_path.clone(), } } async fn download_object(self, obj: S3Object) { let key = &obj.key; - let key_suffix = key + let mut key_suffix = key .strip_prefix(&self.prefix_to_strip) - .expect("all found objects will include the prefix"); + .expect("all found objects will include the prefix") + .to_string(); + if self.flatten { + key_suffix = key_suffix.replace(std::path::MAIN_SEPARATOR_STR, "-"); + } let path = self.base_path.join(key_suffix); let dir = path.parent().unwrap(); if let Err(e) = std::fs::create_dir_all(dir) { @@ -691,10 +755,10 @@ impl Downloader { } } -fn extract_prefix_to_strip(raw_pattern: &str, path_mode: PathType) -> String { +fn extract_prefix_to_strip(raw_pattern: &str, path_mode: PathMode, keys: &[S3Object]) -> String { match path_mode { - PathType::Abs | PathType::Absolute => String::new(), - PathType::FromFirstGlob | PathType::G => { + PathMode::Abs | PathMode::Absolute => String::new(), + PathMode::FromFirstGlob | PathMode::G => { let up_to_glob: String = raw_pattern .chars() .take_while(|c| !GLOB_CHARS.contains(c)) @@ -705,6 +769,24 @@ fn extract_prefix_to_strip(raw_pattern: &str, path_mode: PathType) -> String { None => up_to_glob, } } + PathMode::S | PathMode::Shortest => { + let Some(prefix) = keys.first() else { + return String::new(); + }; + let mut prefix = prefix.key.to_string(); + for key_obj in &keys[1..] { + prefix = prefix + .chars() + .zip(key_obj.key.chars()) + .take_while(|(a, b)| a == b) + .map(|(a, _)| a) + .collect(); + } + // get the prefix up to and including the last slash + let suffix = prefix.chars().rev().take_while(|c| *c != '/').count(); + prefix.truncate(prefix.len() - suffix); + prefix + } } } @@ -967,7 +1049,7 @@ mod tests { macro_rules! assert_extract_prefix_to_strip { ($pattern:expr, $path_mode:expr, $expected:expr) => { - let actual = extract_prefix_to_strip($pattern, $path_mode); + let actual = extract_prefix_to_strip($pattern, $path_mode, &[]); assert2::check!( actual == $expected, "input: {} path_mode: {:?}", @@ -975,43 +1057,143 @@ mod tests { $path_mode, ); }; + ($pattern:expr, $path_mode:expr, $expected:expr, $keys:expr) => { + let keys: &[S3Object] = $keys; + let actual = extract_prefix_to_strip($pattern, $path_mode, keys); + assert2::check!( + actual == $expected, + "input: {} path_mode: {:?} keys: {:?}", + $pattern, + $path_mode, + keys, + ); + }; } #[test] fn test_extract_prefix_to_strip() { // Test absolute path mode - assert_extract_prefix_to_strip!("prefix/path/to/*.txt", PathType::Absolute, ""); - assert_extract_prefix_to_strip!("bucket/deep/path/*.txt", PathType::Abs, ""); + assert_extract_prefix_to_strip!("prefix/path/to/*.txt", PathMode::Absolute, ""); + assert_extract_prefix_to_strip!("bucket/deep/path/*.txt", PathMode::Abs, ""); // Test from-first-glob path mode assert_extract_prefix_to_strip!( "prefix/path/to/*.txt", - PathType::FromFirstGlob, + PathMode::FromFirstGlob, "prefix/path/to/" ); assert_extract_prefix_to_strip!( "prefix/path/*/more/*.txt", - PathType::FromFirstGlob, + PathMode::FromFirstGlob, "prefix/path/" ); - assert_extract_prefix_to_strip!("prefix/*.txt", PathType::FromFirstGlob, "prefix/"); - assert_extract_prefix_to_strip!("*.txt", PathType::FromFirstGlob, ""); - assert_extract_prefix_to_strip!("prefix/a.txt", PathType::FromFirstGlob, "prefix/"); + assert_extract_prefix_to_strip!("prefix/*.txt", PathMode::FromFirstGlob, "prefix/"); + assert_extract_prefix_to_strip!("*.txt", PathMode::FromFirstGlob, ""); + assert_extract_prefix_to_strip!("prefix/a.txt", PathMode::FromFirstGlob, "prefix/"); // Test with different glob characters assert_extract_prefix_to_strip!( "prefix/path/to/[abc]/*.txt", - PathType::FromFirstGlob, + PathMode::FromFirstGlob, "prefix/path/to/" ); assert_extract_prefix_to_strip!( "prefix/path/to/?/*.txt", - PathType::FromFirstGlob, + PathMode::FromFirstGlob, "prefix/path/to/" ); assert_extract_prefix_to_strip!( "prefix/path/{a,b}/*.txt", - PathType::FromFirstGlob, + PathMode::FromFirstGlob, "prefix/path/" ); } + + #[test] + fn test_extract_prefix_to_strip_shortest() { + // Helper function to create S3Objects for testing + fn make_objects(keys: &[&str]) -> Vec { + keys.iter() + .map(|&key| S3Object { + key: key.to_string(), + size: 0, + last_modified: DateTime::from_millis(0), + }) + .collect() + } + + // Different prefixes entirely - no common prefix + assert_extract_prefix_to_strip!( + "different/*/file*.txt", + PathMode::Shortest, + "", + &make_objects(&["different/path/file1.txt", "alternate/path/file2.txt",]) + ); + + // Partial prefix overlap + assert_extract_prefix_to_strip!( + "shared-prefix/*/data/*.txt", + PathMode::Shortest, + "", + &make_objects(&[ + "shared-prefix/abc/data/file1.txt", + "shared-prefix-extra/xyz/data/file2.txt", + ]) + ); + + // One path is a prefix of another + assert_extract_prefix_to_strip!( + "deep/nested/*/file*.txt", + PathMode::Shortest, + "deep/nested/path/", + &make_objects(&[ + "deep/nested/path/file1.txt", + "deep/nested/path/more/file2.txt", + ]) + ); + + // Empty prefix case - files in root + assert_extract_prefix_to_strip!( + "*.txt", + PathMode::Shortest, + "", + &make_objects(&["file1.txt", "file2.txt",]) + ); + + // Original test cases + assert_extract_prefix_to_strip!( + "prefix/2024-*/file*.txt", + PathMode::Shortest, + "prefix/", + &make_objects(&[ + "prefix/2024-01/file1.txt", + "prefix/2024-01/file2.txt", + "prefix/2024-02/file2.txt", + ]) + ); + + assert_extract_prefix_to_strip!( + "prefix/nested/*/file*.txt", + PathMode::Shortest, + "prefix/nested/", + &make_objects(&["prefix/nested/a/file1.txt", "prefix/nested/b/file2.txt",]) + ); + + assert_extract_prefix_to_strip!( + "prefix/*/nested/*.txt", + PathMode::Shortest, + "prefix/a/nested/", + &make_objects(&["prefix/a/nested/file1.txt", "prefix/a/nested/file2.txt",]) + ); + + // Edge case: empty keys list + assert_extract_prefix_to_strip!("any/pattern/*.txt", PathMode::Shortest, "", &[]); + + // Edge case: single key + assert_extract_prefix_to_strip!( + "single/path/*.txt", + PathMode::Shortest, + "single/path/", + &make_objects(&["single/path/file.txt"]) + ); + } } diff --git a/tests/integration.rs b/tests/integration.rs index 10eed03..27496d8 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -251,6 +251,165 @@ async fn test_patterns_in_file_not_path_component( Ok(()) } +#[rstest] +#[case( // 1 Keep different subdirs + &[ + "prefix/2024-01/file1.txt", + "prefix/2024-01/file2.txt", + "prefix/2024-02/file2.txt", + ], + &[ + "2024-01/file1.txt", + "2024-01/file2.txt", + "2024-02/file2.txt", + ] +)] +#[case( // 2 Slightly deeper nesting + &[ + "prefix/nested/a/file1.txt", + "prefix/nested/b/file2.txt", + ], + &[ + "a/file1.txt", + "b/file2.txt", + ] +)] +#[case( // 3 Strip nested prefix + &[ + "prefix/a/nested/file1.txt", + "prefix/a/nested/file2.txt", + ], + &[ + "file1.txt", + "file2.txt", + ] +)] +#[case( // 4 Empty prefix case - when files are in root of bucket + &[ + "file1.txt", + "file2.txt", + ], + &[ + "file1.txt", + "file2.txt", + ] +)] +#[case( // 5 Different prefixes entirely - shortest should find no common prefix + &[ + "different/path/file1.txt", + "alternate/path/file2.txt", + ], + &[ + "different/path/file1.txt", + "alternate/path/file2.txt", + ] +)] +#[case( // 6 Partial prefix overlap - shortest should break on path boundaries + &[ + "shared-prefix/abc/data/file1.txt", + "shared-prefix-extra/xyz/data/file2.txt", + ], + &[ + "shared-prefix/abc/data/file1.txt", + "shared-prefix-extra/xyz/data/file2.txt", + ] +)] +#[case( // 7 One path is a prefix of another - shortest should preserve uniqueness + + &[ + "deep/nested/path/file1.txt", + "deep/nested/path/more/file2.txt", + ], + &[ + "file1.txt", + "more/file2.txt", + ] +)] +#[tokio::test] +async fn test_download_prefix_shortest( + #[case] source_files: &[&str], + #[case] expected_paths: &[&str], +) -> anyhow::Result<()> { + let glob = "**"; + let (_node, port, client) = minio_and_client().await; + + let bucket = "test-bucket"; + client.create_bucket().bucket(bucket).send().await?; + + for key in source_files { + create_object(&client, bucket, key).await?; + } + + let tempdir = TempDir::new()?; + + let mut cmd = run_s3glob( + port, + &[ + "dl", + "-p", + "shortest", + format!("s3://{}/{}", bucket, glob).as_str(), + tempdir.path().to_str().unwrap(), + ], + )?; + + let _ = cmd.assert().success(); + + for path in expected_paths { + tempdir.child(path).assert(predicate::path::exists()); + } + + Ok(()) +} + +#[tokio::test] +async fn test_download_flatten() -> anyhow::Result<()> { + let (_node, port, client) = minio_and_client().await; + + let bucket = "test-bucket"; + client.create_bucket().bucket(bucket).send().await?; + + // Create some nested test files + let source_files = [ + "prefix/nested/deep/file1.txt", + "prefix/other/path/file2.txt", + "prefix/file3.txt", + ]; + for key in &source_files { + create_object(&client, bucket, key).await?; + } + + let tempdir = TempDir::new()?; + + // Run s3glob with flatten flag + let mut cmd = run_s3glob( + port, + &[ + "dl", + "--flatten", + format!("s3://{}/prefix/**/*.txt", bucket).as_str(), + tempdir.path().to_str().unwrap(), + ], + )?; + + let _ = cmd.assert().success(); + + // Expected flattened filenames + let expected_files = ["nested-deep-file1.txt", "other-path-file2.txt", "file3.txt"]; + + // Verify that files exist with flattened names + for expected in &expected_files { + tempdir.child(expected).assert(predicate::path::exists()); + } + + // Verify original paths don't exist + for source in &source_files { + tempdir.child(source).assert(predicate::path::missing()); + } + + Ok(()) +} + // // Helpers // @@ -323,7 +482,7 @@ fn run_s3glob(port: u16, args: &[&str]) -> anyhow::Result { fn print_s3glob_output(cmd: &mut Command) { let output = cmd.output().unwrap(); println!( - "s3glob output:\n{}\n{}", + "==== s3glob stdout ====\n{}\n==== s3glob stderr ====\n{}\n==== end s3glob output ====\n", String::from_utf8(output.stdout).unwrap(), String::from_utf8(output.stderr).unwrap() );