From 17cf63f071df27c0cc09a4aa1be843d8871f1927 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 4 Oct 2024 15:47:26 +0100 Subject: [PATCH] Working fix --- cpp/src/arrow/filesystem/azurefs.cc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d407b1654f5b5..ba5a56fb0c619 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1806,6 +1806,8 @@ class AzureFileSystem::Impl { // BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we can // distinguish between a directory and a file by checking if we received a // prefix or a blob. + // This strategy allows us to implement GetFileInfo with just 1 blob storage + // operation in almost every case. if (!list_response.BlobPrefixes.empty()) { // Ensure the returned BlobPrefixes[0] string doesn't contain more characters than // the requested Prefix. For instance, if we request with Prefix="dir/abra" and @@ -1821,12 +1823,32 @@ class AzureFileSystem::Impl { } if (!list_response.Blobs.empty()) { const auto& blob = list_response.Blobs[0]; + const auto next_character = blob.Name[options.Prefix.Value().length()]; if (blob.Name == location.path) { info.set_type(FileType::File); info.set_size(blob.BlobSize); info.set_mtime( std::chrono::system_clock::time_point{blob.Details.LastModified}); return info; + } else if (next_character < '/') { + // First list result did not indicate a directory and there is definitely no + // exactly matching blob. However, there may still be a directory that we + // initially missed because the first list result came before + // `options.Prefix + '/'` lexigraphically. + // For example the flat namespace storage account has the following blobs: + // - container/dir.txt + // - container/dir/file.txt + // GetFileInfo(container/dir) should return FileType::Directory but in this + // edge case `blob = "dir.txt"`, so without further checks we would incorrectly + // return FileType::NotFound. + // Therefore we make an extra list operation with the trailing slash to confirm + // whether the path is a directory. + options.Prefix = internal::EnsureTrailingSlash(location.path); + auto list_with_trailing_slash_response = container_client.ListBlobs(options); + if (!list_with_trailing_slash_response.Blobs.empty()) { + info.set_type(FileType::Directory); + return info; + } } } info.set_type(FileType::NotFound);