Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(clp-core): Add the write path for single-file archives. #646

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a5ef64f
first draft
davemarco Dec 28, 2024
615fafd
small changes
davemarco Dec 28, 2024
b84c2da
small changes
davemarco Dec 28, 2024
d1ec9fe
refactor
davemarco Dec 28, 2024
869f1b3
refactor
davemarco Dec 28, 2024
d84c002
refactor
davemarco Dec 28, 2024
f4136f8
refactor
davemarco Dec 28, 2024
6bbf12e
refactor
davemarco Dec 28, 2024
5c75147
remove extra line
davemarco Dec 28, 2024
c1f12df
fix lint
davemarco Dec 28, 2024
0ab6e0e
fix lint
davemarco Dec 28, 2024
d4ed4f6
fix lint
davemarco Dec 28, 2024
82b9802
fix lint?
davemarco Dec 29, 2024
393049b
fix lint?
davemarco Dec 29, 2024
5428403
fix lint
davemarco Dec 29, 2024
7e261f7
fix lint
davemarco Dec 29, 2024
0bd9b27
fix lint
davemarco Dec 29, 2024
d207630
haiqi review
davemarco Jan 13, 2025
c2fd37d
haiqi review
davemarco Jan 13, 2025
cdafb8d
haiqi review
davemarco Jan 16, 2025
dcadf04
haiqi review
davemarco Jan 16, 2025
265b4e4
add initializer for variable
davemarco Jan 16, 2025
c7361c2
attempt to fix lint
davemarco Jan 16, 2025
910e558
saving previous work
davemarco Jan 27, 2025
3ab46fd
merge
davemarco Feb 9, 2025
a12ac42
moving to new metadata, sfav2
davemarco Feb 9, 2025
4c26177
small changes
davemarco Feb 9, 2025
7916bb9
small changes
davemarco Feb 9, 2025
7fc5cb9
latest changes
davemarco Feb 9, 2025
0ae4822
small
davemarco Feb 9, 2025
1b7d73e
small changes
davemarco Feb 9, 2025
7b68544
small changes
davemarco Feb 9, 2025
05377b8
haiqi review
davemarco Feb 10, 2025
e0fcefe
small fix
davemarco Feb 10, 2025
d8a3c70
haiqi review remove offset function
davemarco Feb 11, 2025
5a0ec32
Apply suggestions from code review
davemarco Feb 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,9 @@ set(SOURCE_FILES_unitTest
src/clp/streaming_archive/reader/Segment.hpp
src/clp/streaming_archive/reader/SegmentManager.cpp
src/clp/streaming_archive/reader/SegmentManager.hpp
src/clp/streaming_archive/single_file_archive/Defs.hpp
src/clp/streaming_archive/single_file_archive/writer.cpp
src/clp/streaming_archive/single_file_archive/writer.hpp
src/clp/streaming_archive/writer/Archive.cpp
src/clp/streaming_archive/writer/Archive.hpp
src/clp/streaming_archive/writer/File.cpp
Expand Down
3 changes: 3 additions & 0 deletions components/core/src/clp/clp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ set(
../streaming_archive/reader/Segment.hpp
../streaming_archive/reader/SegmentManager.cpp
../streaming_archive/reader/SegmentManager.hpp
../streaming_archive/single_file_archive/Defs.hpp
../streaming_archive/single_file_archive/writer.cpp
../streaming_archive/single_file_archive/writer.hpp
../streaming_archive/writer/Archive.cpp
../streaming_archive/writer/Archive.hpp
../streaming_archive/writer/File.cpp
Expand Down
4 changes: 4 additions & 0 deletions components/core/src/clp/clp/CommandLineArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,10 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) {
->default_value(m_schema_file_path),
"Path to a schema file. If not specified, heuristics are used to determine "
"dictionary variables. See README-Schema.md for details."
)(
"single-file-archive",
po::bool_switch(&m_single_file_archive),
"Output archive as a single-file"
);

po::options_description all_compression_options;
Expand Down
4 changes: 4 additions & 0 deletions components/core/src/clp/clp/CommandLineArguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class CommandLineArguments : public CommandLineArgumentsBase {
explicit CommandLineArguments(std::string const& program_name)
: CommandLineArgumentsBase(program_name),
m_show_progress(false),
m_single_file_archive(false),
m_sort_input_files(true),
m_print_archive_stats_progress(false),
m_target_segment_uncompressed_size(1L * 1024 * 1024 * 1024),
Expand All @@ -45,6 +46,8 @@ class CommandLineArguments : public CommandLineArgumentsBase {

bool show_progress() const { return m_show_progress; }

[[nodiscard]] auto single_file_archive() const -> bool { return m_single_file_archive; }

bool sort_input_files() const { return m_sort_input_files; }

bool print_archive_stats_progress() const { return m_print_archive_stats_progress; }
Expand Down Expand Up @@ -92,6 +95,7 @@ class CommandLineArguments : public CommandLineArgumentsBase {
std::string m_output_dir;
std::string m_schema_file_path;
bool m_show_progress;
bool m_single_file_archive;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about m_use_single_file_archive?

bool m_print_archive_stats_progress;
size_t m_target_encoded_file_size;
size_t m_target_segment_uncompressed_size;
Expand Down
1 change: 1 addition & 0 deletions components/core/src/clp/clp/compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ bool compress(
archive_user_config.global_metadata_db = global_metadata_db.get();
archive_user_config.print_archive_stats_progress
= command_line_args.print_archive_stats_progress();
archive_user_config.use_single_file_archive = command_line_args.single_file_archive();

// Open Archive
streaming_archive::writer::Archive archive_writer;
Expand Down
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file shouldn't be called Defs.hpp since it's not a class/type. The existing Defs.hpp is a legacy that hasn't be refactored as yet.

I would suggest naming things as follows:

  • Namespace: single_file_archive_utils
  • writer.cpp/hpp -> writing.cpp/hpp
  • Defs.hpp -> Split into two files:
    • constants.hpp
    • types.hpp

Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP
#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP

#include <cstdint>
#include <string>

#include <msgpack.hpp>

#include "../ArchiveMetadata.hpp"
#include "../Constants.hpp"

namespace clp::streaming_archive::single_file_archive {
using single_file_archive_format_version_t = uint32_t;

// Single file archive version
constexpr uint8_t cVersionMajor{0};
constexpr uint8_t cVersionMinor{2};
constexpr uint16_t cVersionPatch{0};
Comment on lines +16 to +18
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this 0.2.0?

constexpr single_file_archive_format_version_t cVersion{
cVersionMajor << 24 | cVersionMinor << 16 | cVersionPatch
};

static constexpr size_t cNumMagicNumberChars{4};
static constexpr std::array<uint8_t, cNumMagicNumberChars>
cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'};
static constexpr std::string_view cUnstructuredSfaExtension{".clp"};
static constexpr std::string_view cFileInfoSentinelName{""};

static constexpr size_t cNumStaticFiles{6};
Comment on lines +23 to +29
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need static for constexpr values in header files. Technically, we need inline but that's for another PR.

constexpr std::array<char const*, cNumStaticFiles> cStaticArchiveFileNames{
cMetadataFileName,
cMetadataDBFileName,
cLogTypeDictFilename,
cLogTypeSegmentIndexFilename,
cVarDictFilename,
cVarSegmentIndexFilename
};

static constexpr size_t cNumUnused{6};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto


struct __attribute__((packed)) SingleFileArchiveHeader {
std::array<uint8_t, cNumMagicNumberChars> magic;
single_file_archive_format_version_t version;
uint64_t metadata_size;
std::array<uint64_t, cNumUnused> unused;
};

struct FileInfo {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This struct could do with a docstring (e.g. to explain that the offset is the offset within the SFA).

std::string name;
uint64_t offset;

// Variables are renamed when serialized to match single-file archive specification
MSGPACK_DEFINE_MAP(MSGPACK_NVP("n", name), MSGPACK_NVP("o", offset));
};

struct SingleFileArchiveMetadata {
std::vector<FileInfo> archive_files;
uint64_t num_segments;
MSGPACK_DEFINE_MAP(archive_files, num_segments);
};
} // namespace clp::streaming_archive::single_file_archive

#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
#include "writer.hpp"

#include <cstddef>
#include <cstring>
#include <filesystem>
#include <sstream>
#include <vector>

#include <fmt/core.h>
#include <msgpack.hpp>
#include <spdlog.h>

#include "../../Defs.h"
#include "../../ErrorCode.hpp"
#include "../../FileReader.hpp"
#include "../../FileWriter.hpp"
#include "../../TraceableException.hpp"
#include "../ArchiveMetadata.hpp"
#include "../Constants.hpp"
#include "Defs.hpp"

namespace clp::streaming_archive::single_file_archive {

namespace {
constexpr size_t cReadBlockSize{4096};

/**
* Generates metadata for the file section of a single-file archive. The metadata consists
* of a list of file names and their corresponding starting offsets.
Comment on lines +28 to +29
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Generates metadata for the file section of a single-file archive. The metadata consists
* of a list of file names and their corresponding starting offsets.
* Generates metadata for the files section of a single-file archive.

I think the FileInfo struct already indicates the second sentence.

*
* @param multi_file_archive_path
* @param num_segments
* @return Vector containing a `FileInfo` struct for every file in the multi-file archive.
* @throws `std::filesystem::filesystem_error` if `stat` on an archive file fails.
*/
[[nodiscard]] auto
get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments)
-> std::vector<FileInfo>;

/**
* Combines file section metadata and the number of segments into single-file archive metadata,
* then serializes the metadata into the MessagePack format.
*
* @param multi_file_archive_path
* @param num_segments
* @return Packed metadata.
*/
[[nodiscard]] auto pack_single_file_archive_metadata(
std::filesystem::path const& multi_file_archive_path,
size_t num_segments
) -> std::stringstream;

/**
* Writes single-file archive header.
*
* @param single_file_archive_writer
* @param packed_metadata_size
*/
auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size)
-> void;

/**
* Writes packed single-file archive metadata.
*
* @param single_file_archive_writer
* @param packed_metadata Packed metadata.
*/
auto write_packed_archive_metadata(
FileWriter& single_file_archive_writer,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Output parameters should be last in a method. See the last sentence in this part of Google's style guide.

std::stringstream const& packed_metadata
) -> void;

/**
* Reads the content of a file and writes it to the given file writer.
* @param file_path
* @param single_file_archive_writer
* @throws OperationFailed if reading the file fails.
*/
auto
write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this method seems more general than what write_archive_file would suggest:

  • how about copy_file_contents?
  • how about file_writer?

Copy link
Contributor Author

@davemarco davemarco Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was considering moving this into existing writer interface or new utility file. Like adding a new method copy(ReaderInterface) to WriterInterface.hpp? Or potentially a new file copy.cpp that has a method copy( ReaderInterface, WriterInterface).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving it into new io_utils.cpp/hpp files?

-> void;

/**
* Iterates over files in the multi-file archive and copies their contents to the single-file
* archive.
*
* @param single_file_archive_writer
* @param multi_file_archive_path
* @param num_segments
*/
auto write_archive_files(
FileWriter& single_file_archive_writer,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto

std::filesystem::path const& multi_file_archive_path,
size_t num_segments
) -> void;

auto
get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments)
-> std::vector<FileInfo> {
std::vector<FileInfo> files;

uint64_t offset{};
for (auto const& static_archive_file_name : cStaticArchiveFileNames) {
files.emplace_back(FileInfo{std::string(static_archive_file_name), offset});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
files.emplace_back(FileInfo{std::string(static_archive_file_name), offset});
files.emplace_back(std::string{static_archive_file_name}, offset);

emplace_back forwards the parameters to the object's constructor, right?

offset += std::filesystem::file_size(multi_file_archive_path / static_archive_file_name);
}

std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname;
auto segment_dir_path = multi_file_archive_path / cSegmentsDirname;

for (size_t i = 0; i < num_segments; ++i) {
auto const segment_id = std::to_string(i);
files.emplace_back(FileInfo{segment_id, offset});
offset += std::filesystem::file_size(segment_dir_path / segment_id);
}

// Add sentinel indicating total size of all files
files.emplace_back(FileInfo{std::string(cFileInfoSentinelName), offset});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
files.emplace_back(FileInfo{std::string(cFileInfoSentinelName), offset});
files.emplace_back(std::string{cFileInfoSentinelName}, offset);


return files;
}

auto pack_single_file_archive_metadata(
std::filesystem::path const& multi_file_archive_path,
size_t num_segments
) -> std::stringstream {
SingleFileArchiveMetadata single_file_archive{
.archive_files = get_archive_file_infos(multi_file_archive_path, num_segments),
.num_segments = num_segments,
Comment on lines +126 to +127
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since archive_files and num_segments both use num_segments, they are inherently related; this means SingleFileArchiveMetadata should be a class.

};

std::stringstream buf;
msgpack::pack(buf, single_file_archive);

return buf;
}

auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size)
-> void {
SingleFileArchiveHeader header{
.magic = cUnstructuredSfaMagicNumber,
.version = cVersion,
.metadata_size = packed_metadata_size,
.unused{}
};
single_file_archive_writer.write(reinterpret_cast<char const*>(&header), sizeof(header));
}

auto write_packed_archive_metadata(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need this function?

FileWriter& single_file_archive_writer,
std::stringstream const& packed_metadata
) -> void {
single_file_archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size());
}

auto
write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer)
-> void {
FileReader reader{file_path.string()};
std::array<char, cReadBlockSize> read_buffer{};
while (true) {
size_t num_bytes_read{};
auto const error_code = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read);
if (ErrorCode_EndOfFile == error_code) {
break;
}
if (ErrorCode_Success != error_code) {
throw OperationFailed(error_code, __FILENAME__, __LINE__);
}
single_file_archive_writer.write(read_buffer.data(), num_bytes_read);
}
}

auto write_archive_files(
FileWriter& single_file_archive_writer,
std::filesystem::path const& multi_file_archive_path,
size_t num_segments
) -> void {
for (auto const& static_archive_file_name : cStaticArchiveFileNames) {
auto const static_archive_file_path{multi_file_archive_path / static_archive_file_name};
write_archive_file(static_archive_file_path, single_file_archive_writer);
}

auto const segment_dir_path{multi_file_archive_path / cSegmentsDirname};
for (size_t i = 0; i < num_segments; ++i) {
auto const segment_path{segment_dir_path / std::to_string(i)};
write_archive_file(segment_path, single_file_archive_writer);
}
}
} // namespace

auto
write_single_file_archive(std::filesystem::path const& multi_file_archive_path, size_t num_segments)
-> void {
FileWriter single_file_archive_writer;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move closer to first use?

std::filesystem::path single_file_archive_path{
multi_file_archive_path.string()
+ std::string(single_file_archive::cUnstructuredSfaExtension)
};
if (std::filesystem::exists(single_file_archive_path)) {
throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__);
}

single_file_archive_writer.open(
single_file_archive_path.string(),
FileWriter::OpenMode::CREATE_FOR_WRITING
);

auto const packed_metadata
= pack_single_file_archive_metadata(multi_file_archive_path, num_segments);
write_archive_header(single_file_archive_writer, packed_metadata.str().size());
write_packed_archive_metadata(single_file_archive_writer, packed_metadata);
write_archive_files(single_file_archive_writer, multi_file_archive_path, num_segments);

single_file_archive_writer.close();

try {
std::filesystem::remove_all(multi_file_archive_path);
} catch (std::filesystem::filesystem_error& e) {
SPDLOG_WARN("Failed to delete multi-file archive: {}", e.what());
}
}
} // namespace clp::streaming_archive::single_file_archive
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP
#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP

#include <filesystem>
#include <string>

#include <msgpack.hpp>

#include "../../Defs.h"
#include "../../ErrorCode.hpp"
#include "../../TraceableException.hpp"
#include "../ArchiveMetadata.hpp"

namespace clp::streaming_archive::single_file_archive {

class OperationFailed : public TraceableException {
public:
// Constructors
OperationFailed(
ErrorCode error_code,
char const* const filename,
int line_number,
std::string message = "streaming_archive::single_file_archive operation failed"
)
: TraceableException{error_code, filename, line_number},
m_message{std::move(message)} {}

// Methods
[[nodiscard]] auto what() const noexcept -> char const* override { return m_message.c_str(); }

private:
std::string m_message;
};

/**
* Writes a single-file archive then deletes the multi-file archive.
*
* @param multi_file_archive_path
* @param num_segments
* @throws OperationFailed if single-file archive path already exists.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::filesystem::exists and std::filesystem::remove_all can also throw, right?

*/
auto
write_single_file_archive(std::filesystem::path const& multi_file_archive_path, size_t num_segments)
-> void;

} // namespace clp::streaming_archive::single_file_archive

#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP
Loading
Loading