From 59ddd855825364114579a2f225d29e7120745a9c Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 7 Dec 2024 08:51:24 +0000
Subject: [PATCH 1/6] fix: handle unrecognized log file patterns

- Modified list_log_files to properly handle unrecognized file types
- Added test coverage for unusual file patterns:
  - Hex decimal commit files
  - Bogus part numbering in checkpoints
  - V2 checkpoint files with UUIDs
  - Compacted log files
  - CRC files

The changes ensure that unrecognized files are silently ignored while
maintaining proper processing of valid log files.
---
 kernel/src/log_segment.rs       |  2 +-
 kernel/src/log_segment/tests.rs | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index df8f1813b..6a0292775 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -289,7 +289,7 @@ fn list_log_files(
 
     Ok(fs_client
         .list_from(&start_from)?
-        .map(|meta| ParsedLogPath::try_from(meta?))
+        .map(|meta| meta.map(|m| ParsedLogPath::try_from(m).ok().and_then(identity)))
         // TODO this filters out .crc files etc which start with "." - how do we want to use these kind of files?
         .filter_map_ok(identity)
         .take_while(move |path_res| match path_res {
diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index ed029b006..e4ca7ec9f 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -511,3 +511,44 @@ fn table_changes_fails_with_larger_start_version_than_end() {
     let log_segment_res = LogSegment::for_table_changes(client.as_ref(), log_root, 1, Some(0));
     assert!(log_segment_res.is_err());
 }
+
+#[test]
+fn test_list_files_with_unusual_patterns() {
+    // Create paths for all the unusual patterns
+    let paths = vec![
+        // hex instead of decimal
+        Path::from("_delta_log/00000000deadbeef.commit.json"),
+        // bogus part numbering
+        Path::from("_delta_log/00000000000000000000.checkpoint.0000000010.0000000000.parquet"),
+        // v2 checkpoint
+        Path::from("_delta_log/00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json"),
+        // compacted log file
+        Path::from("_delta_log/00000000000000000004.00000000000000000006.compacted.json"),
+        // CRC file
+        Path::from("_delta_log/00000000000000000001.crc"),
+        // Valid commit file for comparison
+        Path::from("_delta_log/00000000000000000002.json"),
+    ];
+
+    let (client, log_root) = build_log_with_paths_and_checkpoint(&paths, None);
+
+    // Test that list_log_files doesn't fail
+    let result = super::list_log_files(&*client, &log_root, None, None);
+    assert!(result.is_ok(), "list_log_files should not fail");
+
+    // Collect the results and verify
+    let paths: Vec<_> = result.unwrap().collect::<Result<Vec<_>, _>>().unwrap();
+
+    // We should find at least the valid commit file
+    assert!(!paths.is_empty(), "Should find at least one valid file");
+
+    // Verify that the valid commit file is included
+    let has_valid_commit = paths.iter().any(|p| p.version == 2 && p.is_commit());
+    assert!(has_valid_commit, "Should find the valid commit file");
+
+    // Verify that none of the paths resulted in errors
+    // The invalid patterns should have been filtered out by ParsedLogPath::try_from
+    for path in paths {
+        assert!(!path.is_unknown(), "Found unexpected unknown file type");
+    }
+}
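For context, a minimal standalone sketch of the listing pattern this patch adopts. It is not kernel code: `MockParsed` and `parse` are hypothetical stand-ins for `ParsedLogPath` and `ParsedLogPath::try_from`, and the `itertools` crate is assumed for `filter_map_ok`.

```rust
use std::convert::identity;

use itertools::Itertools;

#[derive(Debug)]
struct MockParsed {
    version: u64,
}

/// Stand-in parser: Ok(Some(_)) for a recognized commit file, Ok(None) for a
/// name that lists fine but is not a recognized log file.
fn parse(name: &str) -> Result<Option<MockParsed>, String> {
    match name.strip_suffix(".json").and_then(|stem| stem.parse::<u64>().ok()) {
        Some(version) => Ok(Some(MockParsed { version })),
        None => Ok(None), // hex names, .crc files, etc. are silently ignored
    }
}

fn main() {
    // Each listing entry is itself a Result, like the items a storage listing yields.
    let listing: Vec<Result<&str, String>> = vec![
        Ok("00000000000000000002.json"),
        Ok("00000000deadbeef.commit.json"),
        Ok("00000000000000000001.crc"),
        Err("transient listing error".to_string()),
    ];

    let parsed: Vec<Result<MockParsed, String>> = listing
        .into_iter()
        // Result<&str, _> -> Result<Option<MockParsed>, _>; unrecognized names become Ok(None).
        .map(|meta| meta.map(|m| parse(m).ok().and_then(identity)))
        // Drop Ok(None), unwrap Ok(Some(p)) to Ok(p), and pass Err through untouched.
        .filter_map_ok(identity)
        .collect();

    // Only the valid commit and the listing error remain.
    println!("{parsed:?}");
}
```

The point of the `Result<Option<_>>` shape is that a listing error is preserved for the caller, while a file that merely fails to parse as a log path becomes `None` and is quietly dropped.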
From 6fef42a090a3164ecd702e0a820f31e3a94f9181 Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 7 Dec 2024 18:56:49 +0000
Subject: [PATCH 2/6] test: update test assertions for unusual log patterns

- Changed assertions to verify that invalid patterns are filtered out
- Verify exactly one valid commit file remains
- Remove incorrect unknown file type check
---
 kernel/src/log_segment/tests.rs | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index e4ca7ec9f..4c8dcc434 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -521,7 +521,9 @@ fn test_list_files_with_unusual_patterns() {
         // bogus part numbering
         Path::from("_delta_log/00000000000000000000.checkpoint.0000000010.0000000000.parquet"),
         // v2 checkpoint
-        Path::from("_delta_log/00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json"),
+        Path::from(
+            "_delta_log/00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json",
+        ),
         // compacted log file
         Path::from("_delta_log/00000000000000000004.00000000000000000006.compacted.json"),
         // CRC file
@@ -539,16 +541,13 @@ fn test_list_files_with_unusual_patterns() {
     // Collect the results and verify
     let paths: Vec<_> = result.unwrap().collect::<Result<Vec<_>, _>>().unwrap();
 
-    // We should find at least the valid commit file
-    assert!(!paths.is_empty(), "Should find at least one valid file");
+    // We should find exactly one file (the valid commit)
+    assert_eq!(paths.len(), 1, "Should find exactly one valid file");
 
     // Verify that the valid commit file is included
     let has_valid_commit = paths.iter().any(|p| p.version == 2 && p.is_commit());
     assert!(has_valid_commit, "Should find the valid commit file");
 
-    // Verify that none of the paths resulted in errors
-    // The invalid patterns should have been filtered out by ParsedLogPath::try_from
-    for path in paths {
-        assert!(!path.is_unknown(), "Found unexpected unknown file type");
-    }
+    // All other files should have been filtered out by ParsedLogPath::try_from
+    // and the filter_map_ok(identity) call
 }
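The next patch keeps only commits and checkpoints while deliberately letting `Err` entries through, via `as_ref().map_or(true, ..)`. A tiny self-contained illustration of that idiom, with plain `Result<u64, String>` values and an arbitrary predicate standing in for `is_commit() || is_checkpoint()`:

```rust
fn main() {
    let results: Vec<Result<u64, String>> = vec![
        Ok(2),                             // something we want to keep
        Ok(7),                             // something the predicate rejects
        Err("listing failed".to_string()), // must not be silently discarded
    ];

    let kept: Vec<_> = results
        .into_iter()
        // map_or(true, ..): an Err is kept (so the caller still sees the error),
        // an Ok(v) is kept only when the predicate holds for v.
        .filter(|res| res.as_ref().map_or(true, |v| *v % 2 == 0))
        .collect();

    assert_eq!(kept.len(), 2);
    println!("{kept:?}"); // [Ok(2), Err("listing failed")]
}
```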
From 91f865042c770b17e57fe6cdd2ab6ee05e1ef942 Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 7 Dec 2024 18:58:22 +0000
Subject: [PATCH 3/6] fix: filter out unknown file types in list_log_files

- Add filter to only keep commits and checkpoints
- Remove outdated TODO comment about .crc files
- Ensures unrecognized file patterns are properly filtered out
---
 kernel/src/log_segment.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 6a0292775..6b26def90 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -290,8 +290,8 @@ fn list_log_files(
     Ok(fs_client
         .list_from(&start_from)?
         .map(|meta| meta.map(|m| ParsedLogPath::try_from(m).ok().and_then(identity)))
-        // TODO this filters out .crc files etc which start with "." - how do we want to use these kind of files?
         .filter_map_ok(identity)
+        .filter(|path_res| path_res.as_ref().map_or(true, |path| path.is_commit() || path.is_checkpoint()))
         .take_while(move |path_res| match path_res {
             Ok(path) => !end_version.is_some_and(|end_version| end_version < path.version),
             Err(_) => true,

From 8200c7bf8ab8e3957fdd74065a93385cc332c035 Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 7 Dec 2024 19:03:33 +0000
Subject: [PATCH 4/6] fix: add filter for unknown file types in list_log_files

- Add explicit filter to only keep commits and checkpoints
- Ensures unrecognized file patterns are properly filtered out
- Maintains existing error handling for other types of errors
---
 kernel/src/log_segment.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 6b26def90..c3bdceb79 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -291,7 +291,11 @@ fn list_log_files(
         .list_from(&start_from)?
         .map(|meta| meta.map(|m| ParsedLogPath::try_from(m).ok().and_then(identity)))
         .filter_map_ok(identity)
-        .filter(|path_res| path_res.as_ref().map_or(true, |path| path.is_commit() || path.is_checkpoint()))
+        .filter(|path_res| {
+            path_res
+                .as_ref()
+                .map_or(true, |path| path.is_commit() || path.is_checkpoint())
+        })
         .take_while(move |path_res| match path_res {
             Ok(path) => !end_version.is_some_and(|end_version| end_version < path.version),
             Err(_) => true,

From 0a0227d9a0215a3f4873feeb1f514e3deca1c77a Mon Sep 17 00:00:00 2001
From: Calvin Giroud
Date: Mon, 9 Dec 2024 11:56:37 -0500
Subject: [PATCH 5/6] address feedback, use try_collect

---
 kernel/src/log_segment/tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index 4c8dcc434..6f37a2e2b 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -539,7 +539,7 @@ fn test_list_files_with_unusual_patterns() {
     assert!(result.is_ok(), "list_log_files should not fail");
 
     // Collect the results and verify
-    let paths: Vec<_> = result.unwrap().collect::<Result<Vec<_>, _>>().unwrap();
+    let paths: Vec<_> = result.unwrap().try_collect().unwrap();
 
     // We should find exactly one file (the valid commit)
     assert_eq!(paths.len(), 1, "Should find exactly one valid file");

From e4ae2a8d3c1629432e9678227b155c58cc6aaef6 Mon Sep 17 00:00:00 2001
From: Calvin Giroud
Date: Mon, 9 Dec 2024 13:36:31 -0500
Subject: [PATCH 6/6] use flatten

---
 kernel/src/log_segment.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index c3bdceb79..3b26ed87f 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -289,7 +289,7 @@ fn list_log_files(
 
     Ok(fs_client
         .list_from(&start_from)?
-        .map(|meta| meta.map(|m| ParsedLogPath::try_from(m).ok().and_then(identity)))
+        .map(|meta| meta.map(|m| ParsedLogPath::try_from(m).ok().flatten()))
         .filter_map_ok(identity)
         .filter(|path_res| {
             path_res
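The last two patches are equivalences rather than behavior changes: `try_collect` is shorthand for collecting into a `Result`, and `Option::flatten` is shorthand for `and_then(identity)` on a nested `Option`. A small standalone sketch of both, assuming the `itertools` crate:

```rust
use std::convert::identity;

use itertools::Itertools;

fn main() {
    // Itertools::try_collect is shorthand for collect::<Result<_, _>>().
    let items: Vec<Result<u64, String>> = vec![Ok(1), Ok(2), Ok(3)];
    let a: Vec<u64> = items.clone().into_iter().collect::<Result<Vec<_>, _>>().unwrap();
    let b: Vec<u64> = items.into_iter().try_collect().unwrap();
    assert_eq!(a, b);

    // Option::flatten is shorthand for and_then(identity) on an Option<Option<T>>.
    let nested: Option<Option<u64>> = Some(Some(5));
    assert_eq!(nested.and_then(identity), nested.flatten());
}
```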