From a5ef64f6b435a8da41d1b15e28f3b1780f6aa4a1 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 01:46:41 +0000 Subject: [PATCH 01/35] first draft --- .../core/src/clp/clp/CommandLineArguments.cpp | 4 + .../core/src/clp/clp/CommandLineArguments.hpp | 4 + .../core/src/clp/clp/FileCompressor.cpp | 12 +- components/core/src/clp/clp/compression.cpp | 9 +- .../clp/streaming_archive/ArchiveMetadata.hpp | 2 + .../single_file_archive/Defs.hpp | 87 ++++++++ .../single_file_archive/writer.cpp | 188 ++++++++++++++++++ .../single_file_archive/writer.hpp | 103 ++++++++++ .../clp/streaming_archive/writer/Archive.cpp | 161 ++++++++++++++- .../clp/streaming_archive/writer/Archive.hpp | 9 + 10 files changed, 572 insertions(+), 7 deletions(-) create mode 100644 components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp create mode 100644 components/core/src/clp/streaming_archive/single_file_archive/writer.cpp create mode 100644 components/core/src/clp/streaming_archive/single_file_archive/writer.hpp diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index cb44d96d8..3824b1107 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -373,6 +373,10 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { ->default_value(m_schema_file_path), "Path to a schema file. If not specified, heuristics are used to determine " "dictionary variables. See README-Schema.md for details." + )( + "single-file-archive", + po::bool_switch(&m_single_file_archive), + "Compress using single-file format." ); po::options_description all_compression_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index 6e14a4b3b..254373869 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -25,6 +25,7 @@ class CommandLineArguments : public CommandLineArgumentsBase { m_show_progress(false), m_sort_input_files(true), m_print_archive_stats_progress(false), + m_single_file_archive(false), m_target_segment_uncompressed_size(1L * 1024 * 1024 * 1024), m_target_encoded_file_size(512L * 1024 * 1024), m_target_data_size_of_dictionaries(100L * 1024 * 1024), @@ -49,6 +50,8 @@ class CommandLineArguments : public CommandLineArgumentsBase { bool print_archive_stats_progress() const { return m_print_archive_stats_progress; } + bool get_single_file_archive() const { return m_single_file_archive; } + size_t get_target_encoded_file_size() const { return m_target_encoded_file_size; } size_t get_target_segment_uncompressed_size() const { @@ -93,6 +96,7 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string m_schema_file_path; bool m_show_progress; bool m_print_archive_stats_progress; + bool m_single_file_archive; size_t m_target_encoded_file_size; size_t m_target_segment_uncompressed_size; size_t m_target_data_size_of_dictionaries; diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 9898602cc..d71f8ac6d 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -243,7 +243,9 @@ void FileCompressor::parse_and_encode_with_heuristic( // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive_writer.get_use_single_file_archive()) + { split_file_and_archive( archive_user_config, path_for_compression, @@ -337,7 +339,9 @@ bool FileCompressor::try_compressing_as_archive( parent_directories.emplace(file_parent_path); } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } @@ -537,7 +541,9 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( } // Split archive/encoded file if necessary before writing the new event - if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive.get_use_single_file_archive()) + { split_file_and_archive( archive_user_config, path, diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index a0d5bf276..7fb9d844e 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -107,6 +107,7 @@ bool compress( archive_user_config.global_metadata_db = global_metadata_db.get(); archive_user_config.print_archive_stats_progress = command_line_args.print_archive_stats_progress(); + archive_user_config.use_single_file_archive = command_line_args.get_single_file_archive(); // Open Archive streaming_archive::writer::Archive archive_writer; @@ -135,7 +136,9 @@ bool compress( ); } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } if (false @@ -163,7 +166,9 @@ bool compress( file_group_id_comparator); // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } if (false diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index 1a1edc894..420462165 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -91,6 +91,8 @@ class ArchiveMetadata { private: // Variables archive_format_version_t m_archive_format_version{cArchiveFormatVersion}; + std::string m_variable_encoding_methods_version{}; + std::string m_variables_schema_version{}; std::string m_creator_id; uint16_t m_creator_id_len{0}; uint64_t m_creation_idx{0}; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp new file mode 100644 index 000000000..99555d309 --- /dev/null +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -0,0 +1,87 @@ +#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP +#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP + +#include +#include + +#include "../clp/Defs.h" +#include "../Constants.hpp" +#include "msgpack.hpp" +#include "streaming_archive/ArchiveMetadata.hpp" + +namespace clp::streaming_archive::single_file_archive { + +using single_file_archive_format_version_t = uint32_t; + +// Single file archive version. +constexpr uint8_t cArchiveMajorVersion{0}; +constexpr uint8_t cArchiveMinorVersion{2}; +constexpr uint16_t cArchivePatchVersion{0}; +constexpr single_file_archive_format_version_t cArchiveVersion{ + cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion +}; + +static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; + +static constexpr std::string_view cUnstructuredSfaExtension = ".clp"; + +enum class CompressionType : uint16_t { + Passthrough = 0, + Zstd, + LZMA +}; + +static constexpr std::string_view cCompressionTypeZstd = "ZSTD"; + +struct __attribute__((packed)) SingleFileArchiveHeader { + uint8_t magic[4]; + single_file_archive_format_version_t version; + uint64_t metadata_size; + uint64_t unused[6]; +}; + +constexpr char const* static_archive_file_names[] + = {cMetadataDBFileName, + cLogTypeDictFilename, + cLogTypeSegmentIndexFilename, + cVarDictFilename, + cVarSegmentIndexFilename}; + +struct FileInfo { + std::string n; + uint64_t o; + MSGPACK_DEFINE_MAP(n, o); +}; + +struct ArchiveMetadata { + single_file_archive_format_version_t archive_format_version; + std::string variable_encoding_methods_version; + std::string variables_schema_version; + std::string compression_type; + std::string creator_id; + epochtime_t begin_timestamp; + epochtime_t end_timestamp; + uint64_t uncompressed_size; + uint64_t compressed_size; + MSGPACK_DEFINE_MAP( + archive_format_version, + variable_encoding_methods_version, + variables_schema_version, + compression_type, + creator_id, + begin_timestamp, + end_timestamp, + uncompressed_size, + compressed_size + ); +}; + +struct SingleFileArchiveMetadata { + std::vector archive_files; + ArchiveMetadata archive_metadata; + uint64_t num_segments; + MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments); +}; +} // namespace clp::streaming_archive::single_file_archive + +#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp new file mode 100644 index 000000000..212c367af --- /dev/null +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -0,0 +1,188 @@ +#include "writer.hpp" + +#include +#include +#include +#include + +#include +#include + +#include "../../ErrorCode.hpp" +#include "../../ffi/encoding_methods.hpp" +#include "../../TraceableException.hpp" +#include "../ArchiveMetadata.hpp" +#include "../Constants.hpp" +#include "Defs.hpp" + +namespace clp::streaming_archive::single_file_archive { + +namespace { + +constexpr size_t cReadBlockSize = 4096; + +/** + * Gets the size of a file specified by `file_path` and adds it to `offset`. + * @param file_path + * @param offset The current offset of the single file archive. + * @throws OperationFailed if error getting file size. + */ +void update_offset(std::filesystem::path const& file_path, uint64_t& offset); + +/** + * Reads the content of a file and writes it to the archive. + * @param file_path Path to the file to be read. + * @param archive_writer Writer to write the file content to the archive. + * @throws OperationFailed if reading the file fails. + */ +void write_archive_file(std::string const& file_path, FileWriter& archive_writer); + +void update_offset(std::filesystem::path const& file_path, uint64_t& offset) { + try { + auto size = std::filesystem::file_size(file_path); + offset += size; + } catch (std::filesystem::filesystem_error const& e) { + throw OperationFailed( + ErrorCode_Failure, + __FILENAME__, + __LINE__, + fmt::format("Failed to get file size: {}", e.what()) + ); + } +} + +auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) + -> void { + FileReader reader(file_path.string()); + std::array read_buffer{}; + while (true) { + size_t num_bytes_read{0}; + ErrorCode const error_code + = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); + if (ErrorCode_EndOfFile == error_code) { + break; + } + if (ErrorCode_Success != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + archive_writer.write(read_buffer.data(), num_bytes_read); + } +} +} // namespace + +auto get_segment_ids(segment_id_t last_segment_id) -> std::vector { + std::vector segment_ids; + + if (last_segment_id < 0) { + return segment_ids; + } + + for (size_t i = 0; i <= last_segment_id; ++i) { + segment_ids.emplace_back(std::to_string(i)); + } + + return segment_ids; +} + +auto get_file_infos( + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::vector { + std::vector files; + uint64_t offset = 0; + + for (auto const& static_archive_file_name : static_archive_file_names) { + files.emplace_back(static_archive_file_name, offset); + update_offset(multi_file_archive_path / static_archive_file_name, offset); + } + + std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; + for (auto const& segment_id : segment_ids) { + files.emplace_back(segment_id, offset); + update_offset(segment_dir_path / segment_id, offset); + } + + // Add sentinel indicating total size of all files. + files.emplace_back(FileInfo{"", offset}); + + // Print the file paths and offsets to ensure it works + // Remove this later + for (auto const& file : files) { + std::cout << "File: " << file.n << ", Offset: " << file.o << std::endl; + } + + return files; +} + +auto pack_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::vector const& file_infos, + uint64_t const num_segments +) -> std::stringstream { + ArchiveMetadata archive_metadata{ + .archive_format_version = multi_file_archive_metadata.get_archive_format_version(), + // TODO: The following fields are required for single-file archive metadata format; + // however, they are not yet implemented in multi-file archive metadata. Currently, + // they are set to default values, but should reference the actual values from the + // multi-file archive metadata once implemented. + // - variable_encoding_methods_version + // - variables_schema_version + // - compression_type + .variable_encoding_methods_version + = static_cast(ffi::cVariableEncodingMethodsVersion), + .variables_schema_version = static_cast(ffi::cVariablesSchemaVersion), + .compression_type = std::string(cCompressionTypeZstd), + .creator_id = multi_file_archive_metadata.get_creator_id(), + .begin_timestamp = multi_file_archive_metadata.get_begin_timestamp(), + .end_timestamp = multi_file_archive_metadata.get_end_timestamp(), + .uncompressed_size = multi_file_archive_metadata.get_uncompressed_size_bytes(), + .compressed_size = multi_file_archive_metadata.get_compressed_size_bytes(), + }; + + SingleFileArchiveMetadata single_file_archive{ + .archive_files = file_infos, + .archive_metadata = archive_metadata, + .num_segments = num_segments, + }; + + std::stringstream buf; + msgpack::pack(buf, single_file_archive); + + return buf; +} + +auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void { + SingleFileArchiveHeader header{ + .magic{}, + .version = cArchiveVersion, + .metadata_size = packed_metadata_size, + .unused{} + }; + std::memcpy(&header.magic, cUnstructuredSfaMagicNumber.data(), sizeof(header.magic)); + + auto char_buffer = std::bit_cast>(header); + archive_writer.write(char_buffer.data(), char_buffer.size()); +} + +auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) + -> void { + archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); +} + +auto write_archive_files( + FileWriter& archive_writer, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> void { + for (auto const& file_name : static_archive_file_names) { + std::filesystem::path static_archive_file_path = multi_file_archive_path / file_name; + write_archive_file(static_archive_file_path, archive_writer); + } + + std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; + for (auto const& segment_id : segment_ids) { + std::filesystem::path segment_path = segment_dir_path / segment_id; + write_archive_file(segment_path, archive_writer); + } +} +} // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp new file mode 100644 index 000000000..e302c872f --- /dev/null +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -0,0 +1,103 @@ +#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP +#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP + +#include +#include +#include + +#include + +#include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../ArchiveMetadata.hpp" +#include "Defs.hpp" + +namespace clp::streaming_archive::single_file_archive { + +class OperationFailed : public TraceableException { +public: + // Constructors + OperationFailed( + ErrorCode error_code, + char const* const filename, + int line_number, + std::string message + = "streaming_archive::writer::single_file_archive::utils operation failed" + ) + : TraceableException{error_code, filename, line_number}, + m_message{std::move(message)} {} + + // Methods + [[nodiscard]] auto what() const noexcept -> char const* override { return m_message.c_str(); } + +private: + std::string m_message; +}; + +/** + * @param last_segment_id ID of last written segment. + * @return Vector of segment IDs. + */ +auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; + +/** + * @param multi_file_archive_path Path to the multi-file archive. + * @param segment_ids Vector of segment IDs. + * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. + * @throws OperationFailed if error getting file size. + */ +auto get_file_infos( + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::vector; + +/** + * Serializes single-file archive metadata into MsgPack. + * + * @param multi_file_archive_metadata Multi-file archive metadata. + * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. + * @param num_segments + * @return Packed metadata. + */ +auto pack_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::vector const& file_infos, + uint64_t num_segments +) -> std::stringstream; + +/** + * Writes single-file archive header. + * + * @param archive_writer + * @param metadata_size + */ +auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void; + +/** + * Writes single-file archive metadata. + * + * @param archive_writer + * @param packed_metadata Packed metadata. + */ +auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) + -> void; + +/** + * Iterates over files in the multi-file archive copying their contents to the single-file archive. + * Skips metadata since already written in `write_archive_metadata`. + * + * @param archive_writer + * @param multi_file_archive_path + * @param segment_ids Vector of segment IDs. + * @throws OperationFailed if reading a file fails. + */ +auto write_archive_files( + FileWriter& archive_writer, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> void; + +} // namespace clp::streaming_archive::single_file_archive + +#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 6804fac7a..cfe26594b 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -13,12 +13,15 @@ #include #include #include +#include #include "../../EncodedVariableInterpreter.hpp" #include "../../ir/types.hpp" #include "../../spdlog_with_specializations.hpp" #include "../../Utils.hpp" #include "../Constants.hpp" +#include "../single_file_archive/Defs.hpp" +#include "../single_file_archive/writer.hpp" #include "utils.hpp" using clp::ir::eight_byte_encoded_variable_t; @@ -56,6 +59,7 @@ void Archive::open(UserConfig const& user_config) { m_creator_id_as_string = boost::uuids::to_string(m_creator_id); m_creation_num = user_config.creation_num; m_print_archive_stats_progress = user_config.print_archive_stats_progress; + m_use_single_file_archive = user_config.use_single_file_archive; std::error_code std_error_code; @@ -225,6 +229,7 @@ void Archive::close() { // Persist all metadata including dictionaries write_dir_snapshot(); + // I dont understand this code we already closed in write dir snapshot? m_logtype_dict.close(); m_logtype_dict_entry.clear(); m_var_dict.close(); @@ -242,6 +247,10 @@ void Archive::close() { m_metadata_db.close(); + if (m_use_single_file_archive) { + write_single_file_archive(); + } + m_creator_id_as_string.clear(); m_id_as_string.clear(); m_path.clear(); @@ -330,7 +339,9 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { m_old_ts_pattern = timestamp_pattern; } } - if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { + if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts + && false == m_use_single_file_archive) + { split_file_and_archive( m_archive_user_config, m_path_for_compression, @@ -560,7 +571,9 @@ void Archive::persist_file_metadata(vector const& files) { m_metadata_db.update_files(files); - m_global_metadata_db->update_metadata_for_files(m_id_as_string, files); + if (false == m_use_single_file_archive) { + m_global_metadata_db->update_metadata_for_files(m_id_as_string, files); + } // Mark files' metadata as clean for (auto file : files) { @@ -650,6 +663,150 @@ void Archive::update_metadata() { } } +void print_msgpack_object(msgpack::object const& obj, int depth = 0) { + // Indentation for better readability + std::string indent(depth * 2, ' '); + + switch (obj.type) { + case msgpack::type::NIL: + std::cout << indent << "NIL" << std::endl; + break; + case msgpack::type::BOOLEAN: + std::cout << indent << "BOOLEAN: " << (obj.as() ? "true" : "false") << std::endl; + break; + case msgpack::type::POSITIVE_INTEGER: + case msgpack::type::NEGATIVE_INTEGER: + std::cout << indent << "Integer: " << obj.as() << std::endl; + break; + case msgpack::type::FLOAT32: + case msgpack::type::FLOAT64: + std::cout << indent << "Float: " << obj.as() << std::endl; + break; + case msgpack::type::STR: + std::cout << indent << "String: " << obj.as() << std::endl; + break; + case msgpack::type::BIN: + std::cout << indent << "Binary (size " << obj.via.bin.size << "): "; + for (size_t i = 0; i < obj.via.bin.size; ++i) { + std::cout << std::hex << static_cast(obj.via.bin.ptr[i]) << " "; + } + std::cout << std::dec << std::endl; + break; + case msgpack::type::ARRAY: + std::cout << indent << "Array (size " << obj.via.array.size << "):" << std::endl; + for (size_t i = 0; i < obj.via.array.size; ++i) { + print_msgpack_object( + obj.via.array.ptr[i], + depth + 1 + ); // Recursively print array elements + } + break; + case msgpack::type::MAP: + std::cout << indent << "Map (size " << obj.via.map.size << "):" << std::endl; + for (size_t i = 0; i < obj.via.map.size; ++i) { + // Print key + std::cout << indent << " Key " << i + 1 << ": "; + print_msgpack_object(obj.via.map.ptr[i].key, depth + 1); + + // Print value + std::cout << indent << " Value " << i + 1 << ": "; + print_msgpack_object(obj.via.map.ptr[i].val, depth + 1); + } + break; + default: + std::cout << indent << "Unknown type" << std::endl; + break; + } +} + +void unpackMetadata(std::string const& packed_data) { + msgpack::object_handle oh = msgpack::unpack(packed_data.data(), packed_data.size()); + msgpack::object obj = oh.get(); + print_msgpack_object(obj); + + single_file_archive::SingleFileArchiveMetadata metadata; + obj.convert(metadata); + + // Print archive files + std::cout << "\nArchive Files:" << std::endl; + for (auto const& file : metadata.archive_files) { + std::cout << " Name: " << file.n << ", Offset: " << file.o << std::endl; + } + + // Print archive metadata + std::cout << "\nArchive Metadata:" << std::endl; + std::cout << " Archive Format Version: " << metadata.archive_metadata.archive_format_version + << std::endl; + std::cout << " Begin Timestamp: " << metadata.archive_metadata.begin_timestamp << std::endl; + std::cout << " Compressed Size: " << metadata.archive_metadata.compressed_size << std::endl; + std::cout << " Compression Type: " << metadata.archive_metadata.compression_type << std::endl; + std::cout << " Creator ID: " << metadata.archive_metadata.creator_id << std::endl; + std::cout << " End Timestamp: " << metadata.archive_metadata.end_timestamp << std::endl; + std::cout << " Uncompressed Size: " << metadata.archive_metadata.uncompressed_size + << std::endl; + std::cout << " Variable Encoding Methods Version: " + << metadata.archive_metadata.variable_encoding_methods_version << std::endl; + std::cout << " Variables Schema Version: " + << metadata.archive_metadata.variables_schema_version << std::endl; + + // Print num segments + std::cout << "\nNum Segments: " << metadata.num_segments << std::endl; +} + +void Archive::write_single_file_archive() { + std::filesystem::path multi_file_archive_path = m_path; + + auto segment_ids = single_file_archive::get_segment_ids(m_next_segment_id - 1); + + // Put this all together + auto file_infos = single_file_archive::get_file_infos(multi_file_archive_path, segment_ids); + + if (false == m_local_metadata.has_value()) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + auto& multi_file_archive_metadata = m_local_metadata.value(); + + auto packed_metadata = single_file_archive::pack_single_file_archive_metadata( + multi_file_archive_metadata, + file_infos, + m_next_segment_id + ); + + msgpack::object_handle oh + = msgpack::unpack(packed_metadata.str().data(), packed_metadata.str().size()); + msgpack::object obj = oh.get(); + print_msgpack_object(obj); + + // Return packed metadata + + // Put this all together + FileWriter archive_writer; + std::filesystem::path single_file_archive_path + = multi_file_archive_path.string() + + std::string(single_file_archive::cUnstructuredSfaExtension); + + if (std::filesystem::exists(single_file_archive_path)) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + archive_writer.open( + single_file_archive_path.string(), + FileWriter::OpenMode::CREATE_FOR_WRITING + ); + + single_file_archive::write_archive_header(archive_writer, packed_metadata.str().size()); + single_file_archive::write_archive_metadata(archive_writer, packed_metadata); + single_file_archive::write_archive_files(archive_writer, multi_file_archive_path, segment_ids); + + archive_writer.close(); + try { + // std::filesystem::remove_all(multi_file_archive_path); + } catch (std::filesystem::filesystem_error& e) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + // Here +} + // Explicitly declare template specializations so that we can define the template methods in this // file template void Archive::write_log_event_ir( diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index cd5c5d99f..b5ba0cafd 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -48,6 +48,7 @@ class Archive { std::string output_dir; GlobalMetadataDB* global_metadata_db; bool print_archive_stats_progress; + bool use_single_file_archive; }; class OperationFailed : public TraceableException { @@ -193,6 +194,8 @@ class Archive { return m_logtype_dict.get_data_size() + m_var_dict.get_data_size(); } + bool get_use_single_file_archive() const { return m_use_single_file_archive; } + private: // Types /** @@ -279,6 +282,11 @@ class Archive { */ void update_metadata(); + /** + * Writes the archive to a single file. + */ + void write_single_file_archive(); + // Variables boost::uuids::uuid m_id; std::string m_id_as_string; @@ -341,6 +349,7 @@ class Archive { GlobalMetadataDB* m_global_metadata_db; bool m_print_archive_stats_progress; + bool m_use_single_file_archive; }; } // namespace clp::streaming_archive::writer From 615fafdf68281c918957caa2a1416747850844d7 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 02:21:38 +0000 Subject: [PATCH 02/35] small changes --- components/core/src/clp/clp/CommandLineArguments.cpp | 2 +- components/core/src/clp/clp/CommandLineArguments.hpp | 8 ++++---- components/core/src/clp/clp/FileCompressor.cpp | 8 ++------ components/core/src/clp/clp/compression.cpp | 2 +- .../core/src/clp/streaming_archive/writer/Archive.cpp | 4 +--- .../core/src/clp/streaming_archive/writer/utils.cpp | 4 ++++ 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index 3824b1107..679734b11 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -376,7 +376,7 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { )( "single-file-archive", po::bool_switch(&m_single_file_archive), - "Compress using single-file format." + "Output archive as a single-file" ); po::options_description all_compression_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index 254373869..32e19432c 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -23,9 +23,9 @@ class CommandLineArguments : public CommandLineArgumentsBase { explicit CommandLineArguments(std::string const& program_name) : CommandLineArgumentsBase(program_name), m_show_progress(false), + m_single_file_archive(false), m_sort_input_files(true), m_print_archive_stats_progress(false), - m_single_file_archive(false), m_target_segment_uncompressed_size(1L * 1024 * 1024 * 1024), m_target_encoded_file_size(512L * 1024 * 1024), m_target_data_size_of_dictionaries(100L * 1024 * 1024), @@ -46,12 +46,12 @@ class CommandLineArguments : public CommandLineArgumentsBase { bool show_progress() const { return m_show_progress; } + bool get_use_single_file_archive() const { return m_single_file_archive; } + bool sort_input_files() const { return m_sort_input_files; } bool print_archive_stats_progress() const { return m_print_archive_stats_progress; } - bool get_single_file_archive() const { return m_single_file_archive; } - size_t get_target_encoded_file_size() const { return m_target_encoded_file_size; } size_t get_target_segment_uncompressed_size() const { @@ -95,8 +95,8 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string m_output_dir; std::string m_schema_file_path; bool m_show_progress; - bool m_print_archive_stats_progress; bool m_single_file_archive; + bool m_print_archive_stats_progress; size_t m_target_encoded_file_size; size_t m_target_segment_uncompressed_size; size_t m_target_data_size_of_dictionaries; diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index d71f8ac6d..952bd4e74 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -243,9 +243,7 @@ void FileCompressor::parse_and_encode_with_heuristic( // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_file_and_archive( archive_user_config, path_for_compression, @@ -541,9 +539,7 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( } // Split archive/encoded file if necessary before writing the new event - if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive.get_use_single_file_archive()) - { + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_file_and_archive( archive_user_config, path, diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index 7fb9d844e..cd4619eaa 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -107,7 +107,7 @@ bool compress( archive_user_config.global_metadata_db = global_metadata_db.get(); archive_user_config.print_archive_stats_progress = command_line_args.print_archive_stats_progress(); - archive_user_config.use_single_file_archive = command_line_args.get_single_file_archive(); + archive_user_config.use_single_file_archive = command_line_args.get_use_single_file_archive(); // Open Archive streaming_archive::writer::Archive archive_writer; diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index cfe26594b..9ea195760 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -339,9 +339,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { m_old_ts_pattern = timestamp_pattern; } } - if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts - && false == m_use_single_file_archive) - { + if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { split_file_and_archive( m_archive_user_config, m_path_for_compression, diff --git a/components/core/src/clp/streaming_archive/writer/utils.cpp b/components/core/src/clp/streaming_archive/writer/utils.cpp index 60a7c2448..726175e65 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.cpp +++ b/components/core/src/clp/streaming_archive/writer/utils.cpp @@ -12,6 +12,10 @@ using std::string; namespace clp::streaming_archive::writer { auto split_archive(Archive::UserConfig& archive_user_config, Archive& archive_writer) -> void { + if (archive_writer.get_use_single_file_archive()) { + SPDLOG_WARN("Single-file archive cannot be split."); + return; + } archive_writer.close(); archive_user_config.id = boost::uuids::random_generator()(); ++archive_user_config.creation_num; From b84c2daa9a9617db5f12c37987775b273309bdb1 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 02:25:21 +0000 Subject: [PATCH 03/35] small changes --- components/core/src/clp/clp/FileCompressor.cpp | 4 +--- components/core/src/clp/clp/compression.cpp | 8 ++------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 952bd4e74..9898602cc 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -337,9 +337,7 @@ bool FileCompressor::try_compressing_as_archive( parent_directories.emplace(file_parent_path); } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_archive(archive_user_config, archive_writer); } diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index cd4619eaa..88bec8784 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -136,9 +136,7 @@ bool compress( ); } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } if (false @@ -166,9 +164,7 @@ bool compress( file_group_id_comparator); // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } if (false From d1ec9fe541a4131df0de3d2bcfcc07afcb14fddd Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 03:55:10 +0000 Subject: [PATCH 04/35] refactor --- .../clp/streaming_archive/ArchiveMetadata.hpp | 15 +++++++- .../single_file_archive/Defs.hpp | 23 +++++------- .../single_file_archive/writer.cpp | 37 ++++++++----------- .../single_file_archive/writer.hpp | 10 ++++- .../clp/streaming_archive/writer/Archive.cpp | 9 ++--- 5 files changed, 49 insertions(+), 45 deletions(-) diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index 420462165..c31b8318e 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -7,6 +7,7 @@ #include "../FileReader.hpp" #include "../FileWriter.hpp" #include "Constants.hpp" +#include "../ffi/encoding_methods.hpp" namespace clp::streaming_archive { /** @@ -79,6 +80,12 @@ class ArchiveMetadata { [[nodiscard]] auto get_end_timestamp() const { return m_end_timestamp; } + [[nodiscard]] auto get_variable_encoding_methods_version() const -> std::string const& { return m_variable_encoding_methods_version; } + + [[nodiscard]] auto get_variables_schema_version() const -> std::string const& { return m_variables_schema_version; } + + [[nodiscard]] auto get_compression_type() const -> std::string const& { return m_compression_type; } + /** * Expands the archive's time range based to encompass the given time range * @param begin_timestamp @@ -91,8 +98,6 @@ class ArchiveMetadata { private: // Variables archive_format_version_t m_archive_format_version{cArchiveFormatVersion}; - std::string m_variable_encoding_methods_version{}; - std::string m_variables_schema_version{}; std::string m_creator_id; uint16_t m_creator_id_len{0}; uint64_t m_creation_idx{0}; @@ -104,6 +109,12 @@ class ArchiveMetadata { // The size of the archive uint64_t m_compressed_size{0}; uint64_t m_dynamic_compressed_size{0}; + // TODO: The following fields are used in single-file archive; however, they are not + // currently part of multi-file archive metadata. Modifying multi-file archive metadata + // disk format is potentially a breaking change and not currently required. + std::string m_variable_encoding_methods_version{ffi::cVariableEncodingMethodsVersion}; + std::string m_variables_schema_version{ffi::cVariablesSchemaVersion}; + std::string m_compression_type{"ZSTD"}; }; } // namespace clp::streaming_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 99555d309..7ac7d16a6 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -4,10 +4,10 @@ #include #include +#include "../ArchiveMetadata.hpp" #include "../clp/Defs.h" #include "../Constants.hpp" #include "msgpack.hpp" -#include "streaming_archive/ArchiveMetadata.hpp" namespace clp::streaming_archive::single_file_archive { @@ -15,32 +15,27 @@ using single_file_archive_format_version_t = uint32_t; // Single file archive version. constexpr uint8_t cArchiveMajorVersion{0}; -constexpr uint8_t cArchiveMinorVersion{2}; -constexpr uint16_t cArchivePatchVersion{0}; +constexpr uint8_t cArchiveMinorVersion{1}; +constexpr uint16_t cArchivePatchVersion{1}; constexpr single_file_archive_format_version_t cArchiveVersion{ cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion }; static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; - static constexpr std::string_view cUnstructuredSfaExtension = ".clp"; - -enum class CompressionType : uint16_t { - Passthrough = 0, - Zstd, - LZMA -}; - static constexpr std::string_view cCompressionTypeZstd = "ZSTD"; +static constexpr size_t cMagicNumberSize = 4; +static constexpr size_t cUnusedSize = 4; + struct __attribute__((packed)) SingleFileArchiveHeader { - uint8_t magic[4]; + std::array magic; single_file_archive_format_version_t version; uint64_t metadata_size; - uint64_t unused[6]; + std::array unused; }; -constexpr char const* static_archive_file_names[] +constexpr std::array cStaticArchiveFileNames = {cMetadataDBFileName, cLogTypeDictFilename, cLogTypeSegmentIndexFilename, diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 212c367af..1c9d5e51a 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -9,7 +9,6 @@ #include #include "../../ErrorCode.hpp" -#include "../../ffi/encoding_methods.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" #include "../Constants.hpp" @@ -91,7 +90,7 @@ auto get_file_infos( std::vector files; uint64_t offset = 0; - for (auto const& static_archive_file_name : static_archive_file_names) { + for (auto const& static_archive_file_name : cStaticArchiveFileNames) { files.emplace_back(static_archive_file_name, offset); update_offset(multi_file_archive_path / static_archive_file_name, offset); } @@ -105,33 +104,20 @@ auto get_file_infos( // Add sentinel indicating total size of all files. files.emplace_back(FileInfo{"", offset}); - // Print the file paths and offsets to ensure it works - // Remove this later - for (auto const& file : files) { - std::cout << "File: " << file.n << ", Offset: " << file.o << std::endl; - } - return files; } auto pack_single_file_archive_metadata( clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, - uint64_t const num_segments + std::vector const& segment_ids ) -> std::stringstream { ArchiveMetadata archive_metadata{ .archive_format_version = multi_file_archive_metadata.get_archive_format_version(), - // TODO: The following fields are required for single-file archive metadata format; - // however, they are not yet implemented in multi-file archive metadata. Currently, - // they are set to default values, but should reference the actual values from the - // multi-file archive metadata once implemented. - // - variable_encoding_methods_version - // - variables_schema_version - // - compression_type .variable_encoding_methods_version - = static_cast(ffi::cVariableEncodingMethodsVersion), - .variables_schema_version = static_cast(ffi::cVariablesSchemaVersion), - .compression_type = std::string(cCompressionTypeZstd), + = multi_file_archive_metadata.get_variable_encoding_methods_version(), + .variables_schema_version = multi_file_archive_metadata.get_variables_schema_version(), + .compression_type = multi_file_archive_metadata.get_compression_type(), .creator_id = multi_file_archive_metadata.get_creator_id(), .begin_timestamp = multi_file_archive_metadata.get_begin_timestamp(), .end_timestamp = multi_file_archive_metadata.get_end_timestamp(), @@ -142,7 +128,7 @@ auto pack_single_file_archive_metadata( SingleFileArchiveMetadata single_file_archive{ .archive_files = file_infos, .archive_metadata = archive_metadata, - .num_segments = num_segments, + .num_segments = segment_ids.size(), }; std::stringstream buf; @@ -151,6 +137,15 @@ auto pack_single_file_archive_metadata( return buf; } +auto create_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::stringstream { + auto file_infos = get_file_infos(multi_file_archive_path, segment_ids); + return pack_single_file_archive_metadata(multi_file_archive_metadata, file_infos, segment_ids); +} + auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void { SingleFileArchiveHeader header{ .magic{}, @@ -174,7 +169,7 @@ auto write_archive_files( std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> void { - for (auto const& file_name : static_archive_file_names) { + for (auto const& file_name : cStaticArchiveFileNames) { std::filesystem::path static_archive_file_path = multi_file_archive_path / file_name; write_archive_file(static_archive_file_path, archive_writer); } diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index e302c872f..b837b01be 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -41,6 +41,12 @@ class OperationFailed : public TraceableException { */ auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; +auto create_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::stringstream; + /** * @param multi_file_archive_path Path to the multi-file archive. * @param segment_ids Vector of segment IDs. @@ -57,13 +63,13 @@ auto get_file_infos( * * @param multi_file_archive_metadata Multi-file archive metadata. * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @param num_segments + * @param segment_ids Vector of segment IDs. * @return Packed metadata. */ auto pack_single_file_archive_metadata( clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, - uint64_t num_segments + std::vector const& segment_ids ) -> std::stringstream; /** diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 9ea195760..6cb04e6be 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -756,18 +756,15 @@ void Archive::write_single_file_archive() { auto segment_ids = single_file_archive::get_segment_ids(m_next_segment_id - 1); - // Put this all together - auto file_infos = single_file_archive::get_file_infos(multi_file_archive_path, segment_ids); - if (false == m_local_metadata.has_value()) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } auto& multi_file_archive_metadata = m_local_metadata.value(); - auto packed_metadata = single_file_archive::pack_single_file_archive_metadata( + auto packed_metadata = single_file_archive::create_single_file_archive_metadata( multi_file_archive_metadata, - file_infos, - m_next_segment_id + multi_file_archive_path, + segment_ids ); msgpack::object_handle oh From 869f1b3e6426bfb4c004c086f9398a9122255745 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 16:44:13 +0000 Subject: [PATCH 05/35] refactor --- components/core/src/clp/clp/CMakeLists.txt | 3 + .../single_file_archive/writer.cpp | 168 +++++++++++++----- .../single_file_archive/writer.hpp | 61 ++----- .../clp/streaming_archive/writer/Archive.cpp | 128 +------------ .../clp/streaming_archive/writer/Archive.hpp | 4 +- 5 files changed, 149 insertions(+), 215 deletions(-) diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt index eff32ce46..c241e3bed 100644 --- a/components/core/src/clp/clp/CMakeLists.txt +++ b/components/core/src/clp/clp/CMakeLists.txt @@ -108,6 +108,9 @@ set( ../streaming_archive/reader/Segment.hpp ../streaming_archive/reader/SegmentManager.cpp ../streaming_archive/reader/SegmentManager.hpp + ../streaming_archive/single_file_archive/Defs.hpp + ../streaming_archive/single_file_archive/writer.cpp + ../streaming_archive/single_file_archive/writer.hpp ../streaming_archive/writer/Archive.cpp ../streaming_archive/writer/Archive.hpp ../streaming_archive/writer/File.cpp diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 1c9d5e51a..385bab0c9 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -28,6 +28,31 @@ constexpr size_t cReadBlockSize = 4096; */ void update_offset(std::filesystem::path const& file_path, uint64_t& offset); +/** + * @param multi_file_archive_path Path to the multi-file archive. + * @param segment_ids Vector of segment IDs. + * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. + * @throws OperationFailed if error getting file size. + */ +auto get_file_infos( + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::vector; + +/** + * Generates single-file archive metadata then serializes into MsgPack. + * + * @param multi_file_archive_metadata + * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. + * @param segment_ids + * @return Packed metadata. + */ +auto pack_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::vector const& file_infos, + std::vector const& segment_ids +) -> std::stringstream; + /** * Reads the content of a file and writes it to the archive. * @param file_path Path to the file to be read. @@ -36,6 +61,38 @@ void update_offset(std::filesystem::path const& file_path, uint64_t& offset); */ void write_archive_file(std::string const& file_path, FileWriter& archive_writer); +/** + * Writes single-file archive header. + * + * @param archive_writer + * @param metadata_size + */ +auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void; + +/** + * Writes single-file archive metadata. + * + * @param archive_writer + * @param packed_metadata Packed metadata. + */ +auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) + -> void; + +/** + * Iterates over files in the multi-file archive copying their contents to the single-file archive. + * Skips metadata since already written in `write_archive_metadata`. + * + * @param archive_writer + * @param multi_file_archive_path + * @param segment_ids + * @throws OperationFailed if reading a file fails. + */ +auto write_archive_files( + FileWriter& archive_writer, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> void; + void update_offset(std::filesystem::path const& file_path, uint64_t& offset) { try { auto size = std::filesystem::file_size(file_path); @@ -50,39 +107,6 @@ void update_offset(std::filesystem::path const& file_path, uint64_t& offset) { } } -auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) - -> void { - FileReader reader(file_path.string()); - std::array read_buffer{}; - while (true) { - size_t num_bytes_read{0}; - ErrorCode const error_code - = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); - if (ErrorCode_EndOfFile == error_code) { - break; - } - if (ErrorCode_Success != error_code) { - throw OperationFailed(error_code, __FILENAME__, __LINE__); - } - archive_writer.write(read_buffer.data(), num_bytes_read); - } -} -} // namespace - -auto get_segment_ids(segment_id_t last_segment_id) -> std::vector { - std::vector segment_ids; - - if (last_segment_id < 0) { - return segment_ids; - } - - for (size_t i = 0; i <= last_segment_id; ++i) { - segment_ids.emplace_back(std::to_string(i)); - } - - return segment_ids; -} - auto get_file_infos( std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids @@ -137,13 +161,22 @@ auto pack_single_file_archive_metadata( return buf; } -auto create_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, - std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids -) -> std::stringstream { - auto file_infos = get_file_infos(multi_file_archive_path, segment_ids); - return pack_single_file_archive_metadata(multi_file_archive_metadata, file_infos, segment_ids); +auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) + -> void { + FileReader reader(file_path.string()); + std::array read_buffer{}; + while (true) { + size_t num_bytes_read{0}; + ErrorCode const error_code + = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); + if (ErrorCode_EndOfFile == error_code) { + break; + } + if (ErrorCode_Success != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + archive_writer.write(read_buffer.data(), num_bytes_read); + } } auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void { @@ -180,4 +213,59 @@ auto write_archive_files( write_archive_file(segment_path, archive_writer); } } +} // namespace + +auto get_segment_ids(segment_id_t last_segment_id) -> std::vector { + std::vector segment_ids; + + if (last_segment_id < 0) { + return segment_ids; + } + + for (size_t i = 0; i <= last_segment_id; ++i) { + segment_ids.emplace_back(std::to_string(i)); + } + + return segment_ids; +} + +auto create_single_file_archive_metadata( + clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + std::filesystem::path const& multi_file_archive_path, + std::vector const& segment_ids +) -> std::stringstream { + auto file_infos = get_file_infos(multi_file_archive_path, segment_ids); + return pack_single_file_archive_metadata(multi_file_archive_metadata, file_infos, segment_ids); +} + +void write_single_file_archive( + std::filesystem::path const& multi_file_archive_path, + std::stringstream const& packed_metadata, + std::vector const& segment_ids +) { + FileWriter archive_writer; + std::filesystem::path single_file_archive_path + = multi_file_archive_path.string() + + std::string(single_file_archive::cUnstructuredSfaExtension); + + if (std::filesystem::exists(single_file_archive_path)) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + archive_writer.open( + single_file_archive_path.string(), + FileWriter::OpenMode::CREATE_FOR_WRITING + ); + + write_archive_header(archive_writer, packed_metadata.str().size()); + write_archive_metadata(archive_writer, packed_metadata); + write_archive_files(archive_writer, multi_file_archive_path, segment_ids); + + archive_writer.close(); + try { + std::filesystem::remove_all(multi_file_archive_path); + } catch (std::filesystem::filesystem_error& e) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } +} } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index b837b01be..7333d43d6 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -41,68 +41,33 @@ class OperationFailed : public TraceableException { */ auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; -auto create_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, - std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids -) -> std::stringstream; - -/** - * @param multi_file_archive_path Path to the multi-file archive. - * @param segment_ids Vector of segment IDs. - * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @throws OperationFailed if error getting file size. - */ -auto get_file_infos( - std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids -) -> std::vector; - /** - * Serializes single-file archive metadata into MsgPack. + * Generates single-file archive metadata then serializes into MsgPack. * - * @param multi_file_archive_metadata Multi-file archive metadata. - * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @param segment_ids Vector of segment IDs. + * @param multi_file_archive_metadata + * @param multi_file_archive_path + * @param segment_ids * @return Packed metadata. */ -auto pack_single_file_archive_metadata( +auto create_single_file_archive_metadata( clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, - std::vector const& file_infos, + std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> std::stringstream; /** - * Writes single-file archive header. - * - * @param archive_writer - * @param metadata_size - */ -auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void; - -/** - * Writes single-file archive metadata. - * - * @param archive_writer - * @param packed_metadata Packed metadata. - */ -auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) - -> void; - -/** - * Iterates over files in the multi-file archive copying their contents to the single-file archive. - * Skips metadata since already written in `write_archive_metadata`. + * Writes the single-file archive to disk. * - * @param archive_writer * @param multi_file_archive_path - * @param segment_ids Vector of segment IDs. - * @throws OperationFailed if reading a file fails. + * @param packed_metadata + * @param segment_ids + * @throws OperationFailed if writing the archive fails. */ -auto write_archive_files( - FileWriter& archive_writer, +void write_single_file_archive( std::filesystem::path const& multi_file_archive_path, + std::stringstream const& packed_metadata, std::vector const& segment_ids -) -> void; +); } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 6cb04e6be..72ea2ae8f 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -248,7 +248,7 @@ void Archive::close() { m_metadata_db.close(); if (m_use_single_file_archive) { - write_single_file_archive(); + create_single_file_archive(); } m_creator_id_as_string.clear(); @@ -661,97 +661,7 @@ void Archive::update_metadata() { } } -void print_msgpack_object(msgpack::object const& obj, int depth = 0) { - // Indentation for better readability - std::string indent(depth * 2, ' '); - - switch (obj.type) { - case msgpack::type::NIL: - std::cout << indent << "NIL" << std::endl; - break; - case msgpack::type::BOOLEAN: - std::cout << indent << "BOOLEAN: " << (obj.as() ? "true" : "false") << std::endl; - break; - case msgpack::type::POSITIVE_INTEGER: - case msgpack::type::NEGATIVE_INTEGER: - std::cout << indent << "Integer: " << obj.as() << std::endl; - break; - case msgpack::type::FLOAT32: - case msgpack::type::FLOAT64: - std::cout << indent << "Float: " << obj.as() << std::endl; - break; - case msgpack::type::STR: - std::cout << indent << "String: " << obj.as() << std::endl; - break; - case msgpack::type::BIN: - std::cout << indent << "Binary (size " << obj.via.bin.size << "): "; - for (size_t i = 0; i < obj.via.bin.size; ++i) { - std::cout << std::hex << static_cast(obj.via.bin.ptr[i]) << " "; - } - std::cout << std::dec << std::endl; - break; - case msgpack::type::ARRAY: - std::cout << indent << "Array (size " << obj.via.array.size << "):" << std::endl; - for (size_t i = 0; i < obj.via.array.size; ++i) { - print_msgpack_object( - obj.via.array.ptr[i], - depth + 1 - ); // Recursively print array elements - } - break; - case msgpack::type::MAP: - std::cout << indent << "Map (size " << obj.via.map.size << "):" << std::endl; - for (size_t i = 0; i < obj.via.map.size; ++i) { - // Print key - std::cout << indent << " Key " << i + 1 << ": "; - print_msgpack_object(obj.via.map.ptr[i].key, depth + 1); - - // Print value - std::cout << indent << " Value " << i + 1 << ": "; - print_msgpack_object(obj.via.map.ptr[i].val, depth + 1); - } - break; - default: - std::cout << indent << "Unknown type" << std::endl; - break; - } -} - -void unpackMetadata(std::string const& packed_data) { - msgpack::object_handle oh = msgpack::unpack(packed_data.data(), packed_data.size()); - msgpack::object obj = oh.get(); - print_msgpack_object(obj); - - single_file_archive::SingleFileArchiveMetadata metadata; - obj.convert(metadata); - - // Print archive files - std::cout << "\nArchive Files:" << std::endl; - for (auto const& file : metadata.archive_files) { - std::cout << " Name: " << file.n << ", Offset: " << file.o << std::endl; - } - - // Print archive metadata - std::cout << "\nArchive Metadata:" << std::endl; - std::cout << " Archive Format Version: " << metadata.archive_metadata.archive_format_version - << std::endl; - std::cout << " Begin Timestamp: " << metadata.archive_metadata.begin_timestamp << std::endl; - std::cout << " Compressed Size: " << metadata.archive_metadata.compressed_size << std::endl; - std::cout << " Compression Type: " << metadata.archive_metadata.compression_type << std::endl; - std::cout << " Creator ID: " << metadata.archive_metadata.creator_id << std::endl; - std::cout << " End Timestamp: " << metadata.archive_metadata.end_timestamp << std::endl; - std::cout << " Uncompressed Size: " << metadata.archive_metadata.uncompressed_size - << std::endl; - std::cout << " Variable Encoding Methods Version: " - << metadata.archive_metadata.variable_encoding_methods_version << std::endl; - std::cout << " Variables Schema Version: " - << metadata.archive_metadata.variables_schema_version << std::endl; - - // Print num segments - std::cout << "\nNum Segments: " << metadata.num_segments << std::endl; -} - -void Archive::write_single_file_archive() { +void Archive::create_single_file_archive() { std::filesystem::path multi_file_archive_path = m_path; auto segment_ids = single_file_archive::get_segment_ids(m_next_segment_id - 1); @@ -767,39 +677,7 @@ void Archive::write_single_file_archive() { segment_ids ); - msgpack::object_handle oh - = msgpack::unpack(packed_metadata.str().data(), packed_metadata.str().size()); - msgpack::object obj = oh.get(); - print_msgpack_object(obj); - - // Return packed metadata - - // Put this all together - FileWriter archive_writer; - std::filesystem::path single_file_archive_path - = multi_file_archive_path.string() - + std::string(single_file_archive::cUnstructuredSfaExtension); - - if (std::filesystem::exists(single_file_archive_path)) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - archive_writer.open( - single_file_archive_path.string(), - FileWriter::OpenMode::CREATE_FOR_WRITING - ); - - single_file_archive::write_archive_header(archive_writer, packed_metadata.str().size()); - single_file_archive::write_archive_metadata(archive_writer, packed_metadata); - single_file_archive::write_archive_files(archive_writer, multi_file_archive_path, segment_ids); - - archive_writer.close(); - try { - // std::filesystem::remove_all(multi_file_archive_path); - } catch (std::filesystem::filesystem_error& e) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - // Here + single_file_archive::write_single_file_archive(multi_file_archive_path, packed_metadata, segment_ids); } // Explicitly declare template specializations so that we can define the template methods in this diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index b5ba0cafd..b4a08f3a2 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -283,9 +283,9 @@ class Archive { void update_metadata(); /** - * Writes the archive to a single file. + * Writes archive to disk in single-file format then removes existing multi-file archive. */ - void write_single_file_archive(); + void create_single_file_archive(); // Variables boost::uuids::uuid m_id; From d84c0020083365c7f596aa3f1ee0b2a169e193ad Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 19:26:06 +0000 Subject: [PATCH 06/35] refactor --- .../single_file_archive/Defs.hpp | 29 ++++++++--------- .../single_file_archive/writer.cpp | 31 ++++++++++--------- .../single_file_archive/writer.hpp | 17 +++++----- .../clp/streaming_archive/writer/Archive.cpp | 7 ++--- .../clp/streaming_archive/writer/utils.cpp | 5 ++- .../clp/streaming_archive/writer/utils.hpp | 3 ++ 6 files changed, 49 insertions(+), 43 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 7ac7d16a6..af605249e 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -4,10 +4,10 @@ #include #include -#include "../ArchiveMetadata.hpp" #include "../clp/Defs.h" #include "../Constants.hpp" #include "msgpack.hpp" +#include "../ArchiveMetadata.hpp" namespace clp::streaming_archive::single_file_archive { @@ -21,26 +21,27 @@ constexpr single_file_archive_format_version_t cArchiveVersion{ cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion }; -static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; +static constexpr size_t cNumMagicNumberChars = 4; +static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; static constexpr std::string_view cUnstructuredSfaExtension = ".clp"; static constexpr std::string_view cCompressionTypeZstd = "ZSTD"; - -static constexpr size_t cMagicNumberSize = 4; -static constexpr size_t cUnusedSize = 4; +static constexpr size_t cNumUnused = 4; struct __attribute__((packed)) SingleFileArchiveHeader { - std::array magic; + std::array magic; single_file_archive_format_version_t version; uint64_t metadata_size; - std::array unused; + std::array unused; }; -constexpr std::array cStaticArchiveFileNames - = {cMetadataDBFileName, - cLogTypeDictFilename, - cLogTypeSegmentIndexFilename, - cVarDictFilename, - cVarSegmentIndexFilename}; +static constexpr size_t cNumStaticFiles = 5; +constexpr std::array cStaticArchiveFileNames = { + cMetadataDBFileName, + cLogTypeDictFilename, + cLogTypeSegmentIndexFilename, + cVarDictFilename, + cVarSegmentIndexFilename +}; struct FileInfo { std::string n; @@ -49,7 +50,7 @@ struct FileInfo { }; struct ArchiveMetadata { - single_file_archive_format_version_t archive_format_version; + archive_format_version_t archive_format_version; std::string variable_encoding_methods_version; std::string variables_schema_version; std::string compression_type; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 385bab0c9..c776f108f 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include @@ -17,22 +16,25 @@ namespace clp::streaming_archive::single_file_archive { namespace { - constexpr size_t cReadBlockSize = 4096; /** * Gets the size of a file specified by `file_path` and adds it to `offset`. * @param file_path - * @param offset The current offset of the single file archive. + * @param[out] offset File section offset for the single-file archive. The returned offset + * represents the starting position of the next file in single-file archive. * @throws OperationFailed if error getting file size. */ void update_offset(std::filesystem::path const& file_path, uint64_t& offset); /** - * @param multi_file_archive_path Path to the multi-file archive. - * @param segment_ids Vector of segment IDs. + * Generates metadata for the file section of a single-file archive. The metadata consists + * of a list of file names and their corresponding starting offsets. + * + * @param multi_file_archive_path + * @param segment_ids * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @throws OperationFailed if error getting file size. + * @throws Propagates `update_offset`'s exceptions. */ auto get_file_infos( std::filesystem::path const& multi_file_archive_path, @@ -40,7 +42,8 @@ auto get_file_infos( ) -> std::vector; /** - * Generates single-file archive metadata then serializes into MsgPack. +* Combines file section metadata, multi-file archive metadata, and the number of segments into single-file archive metadata. +* Once combined, serializes the metadata into MsgPack format. * * @param multi_file_archive_metadata * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. @@ -54,9 +57,9 @@ auto pack_single_file_archive_metadata( ) -> std::stringstream; /** - * Reads the content of a file and writes it to the archive. - * @param file_path Path to the file to be read. - * @param archive_writer Writer to write the file content to the archive. + * Reads the content of a file and writes it to the single-file archive. + * @param file_path + * @param archive_writer * @throws OperationFailed if reading the file fails. */ void write_archive_file(std::string const& file_path, FileWriter& archive_writer); @@ -65,7 +68,7 @@ void write_archive_file(std::string const& file_path, FileWriter& archive_writer * Writes single-file archive header. * * @param archive_writer - * @param metadata_size + * @param packed_metadata_size */ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void; @@ -76,16 +79,16 @@ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_siz * @param packed_metadata Packed metadata. */ auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) - -> void; + -> void; /** * Iterates over files in the multi-file archive copying their contents to the single-file archive. - * Skips metadata since already written in `write_archive_metadata`. + * Skips metadata file since already written in `write_archive_metadata`. * * @param archive_writer * @param multi_file_archive_path * @param segment_ids - * @throws OperationFailed if reading a file fails. + * @throws Propagates `update_offset`'s exceptions. */ auto write_archive_files( FileWriter& archive_writer, diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 7333d43d6..38aeef31e 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -1,5 +1,5 @@ -#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP -#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP +#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP +#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP #include #include @@ -8,10 +8,8 @@ #include #include "../../ErrorCode.hpp" -#include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" -#include "Defs.hpp" namespace clp::streaming_archive::single_file_archive { @@ -36,13 +34,13 @@ class OperationFailed : public TraceableException { }; /** - * @param last_segment_id ID of last written segment. + * @param last_segment_id ID of last written segment in archive. * @return Vector of segment IDs. */ auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; /** - * Generates single-file archive metadata then serializes into MsgPack. + * Generates packed single-file archive metadata. * * @param multi_file_archive_metadata * @param multi_file_archive_path @@ -56,12 +54,13 @@ auto create_single_file_archive_metadata( ) -> std::stringstream; /** - * Writes the single-file archive to disk. + * Writes header, metadata and archive files in single-file format then + * removes existing multi-file archive. * * @param multi_file_archive_path * @param packed_metadata * @param segment_ids - * @throws OperationFailed if writing the archive fails. + * @throws OperationFailed if single-file archive path already exists. */ void write_single_file_archive( std::filesystem::path const& multi_file_archive_path, @@ -71,4 +70,4 @@ void write_single_file_archive( } // namespace clp::streaming_archive::single_file_archive -#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_UTILS_HPP +#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_WRITER_HPP diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 72ea2ae8f..5cbadbe00 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -229,7 +229,6 @@ void Archive::close() { // Persist all metadata including dictionaries write_dir_snapshot(); - // I dont understand this code we already closed in write dir snapshot? m_logtype_dict.close(); m_logtype_dict_entry.clear(); m_var_dict.close(); @@ -569,9 +568,7 @@ void Archive::persist_file_metadata(vector const& files) { m_metadata_db.update_files(files); - if (false == m_use_single_file_archive) { - m_global_metadata_db->update_metadata_for_files(m_id_as_string, files); - } + m_global_metadata_db->update_metadata_for_files(m_id_as_string, files); // Mark files' metadata as clean for (auto file : files) { @@ -669,8 +666,8 @@ void Archive::create_single_file_archive() { if (false == m_local_metadata.has_value()) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - auto& multi_file_archive_metadata = m_local_metadata.value(); + auto& multi_file_archive_metadata = m_local_metadata.value(); auto packed_metadata = single_file_archive::create_single_file_archive_metadata( multi_file_archive_metadata, multi_file_archive_path, diff --git a/components/core/src/clp/streaming_archive/writer/utils.cpp b/components/core/src/clp/streaming_archive/writer/utils.cpp index 726175e65..983f61b24 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.cpp +++ b/components/core/src/clp/streaming_archive/writer/utils.cpp @@ -13,7 +13,10 @@ using std::string; namespace clp::streaming_archive::writer { auto split_archive(Archive::UserConfig& archive_user_config, Archive& archive_writer) -> void { if (archive_writer.get_use_single_file_archive()) { - SPDLOG_WARN("Single-file archive cannot be split."); + SPDLOG_WARN( + "Target size of dictionaries exceeded. " + "Single-file archive cannot be split into multiple archives." + ); return; } archive_writer.close(); diff --git a/components/core/src/clp/streaming_archive/writer/utils.hpp b/components/core/src/clp/streaming_archive/writer/utils.hpp index be56d79da..606c72a1a 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.hpp +++ b/components/core/src/clp/streaming_archive/writer/utils.hpp @@ -10,6 +10,9 @@ namespace clp::streaming_archive::writer { /** * Closes the current archive and starts a new one + * + * NOTE: This function has no effect on single-file archives + * * @param archive_user_config * @param archive_writer */ From f4136f8fe1d365a6b6df2e4e2090db995ab581d7 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 20:14:56 +0000 Subject: [PATCH 07/35] refactor --- .../clp/streaming_archive/ArchiveMetadata.hpp | 5 +- .../single_file_archive/Defs.hpp | 22 +++--- .../single_file_archive/writer.cpp | 70 +++++++++---------- .../single_file_archive/writer.hpp | 4 +- 4 files changed, 51 insertions(+), 50 deletions(-) diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index c31b8318e..e81aa88d3 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -10,6 +10,9 @@ #include "../ffi/encoding_methods.hpp" namespace clp::streaming_archive { + +static constexpr std::string_view cCompressionTypeZstd = "ZSTD"; + /** * A class to encapsulate metadata directly relating to an archive. */ @@ -114,7 +117,7 @@ class ArchiveMetadata { // disk format is potentially a breaking change and not currently required. std::string m_variable_encoding_methods_version{ffi::cVariableEncodingMethodsVersion}; std::string m_variables_schema_version{ffi::cVariablesSchemaVersion}; - std::string m_compression_type{"ZSTD"}; + std::string m_compression_type{cCompressionTypeZstd}; }; } // namespace clp::streaming_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index af605249e..b44118b98 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -7,7 +7,6 @@ #include "../clp/Defs.h" #include "../Constants.hpp" #include "msgpack.hpp" -#include "../ArchiveMetadata.hpp" namespace clp::streaming_archive::single_file_archive { @@ -24,15 +23,6 @@ constexpr single_file_archive_format_version_t cArchiveVersion{ static constexpr size_t cNumMagicNumberChars = 4; static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; static constexpr std::string_view cUnstructuredSfaExtension = ".clp"; -static constexpr std::string_view cCompressionTypeZstd = "ZSTD"; -static constexpr size_t cNumUnused = 4; - -struct __attribute__((packed)) SingleFileArchiveHeader { - std::array magic; - single_file_archive_format_version_t version; - uint64_t metadata_size; - std::array unused; -}; static constexpr size_t cNumStaticFiles = 5; constexpr std::array cStaticArchiveFileNames = { @@ -43,13 +33,21 @@ constexpr std::array cStaticArchiveFileNames = { cVarSegmentIndexFilename }; +static constexpr size_t cNumUnused = 4; +struct __attribute__((packed)) SingleFileArchiveHeader { + std::array magic; + single_file_archive_format_version_t version; + uint64_t metadata_size; + std::array unused; +}; + struct FileInfo { std::string n; uint64_t o; MSGPACK_DEFINE_MAP(n, o); }; -struct ArchiveMetadata { +struct MultiFileArchiveMetadata { archive_format_version_t archive_format_version; std::string variable_encoding_methods_version; std::string variables_schema_version; @@ -74,7 +72,7 @@ struct ArchiveMetadata { struct SingleFileArchiveMetadata { std::vector archive_files; - ArchiveMetadata archive_metadata; + MultiFileArchiveMetadata archive_metadata; uint64_t num_segments; MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments); }; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index c776f108f..e4d32b94f 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -19,7 +19,7 @@ namespace { constexpr size_t cReadBlockSize = 4096; /** - * Gets the size of a file specified by `file_path` and adds it to `offset`. + * Gets the size of a file specified by `file_path` and adds it to file section `offset`. * @param file_path * @param[out] offset File section offset for the single-file archive. The returned offset * represents the starting position of the next file in single-file archive. @@ -42,8 +42,8 @@ auto get_file_infos( ) -> std::vector; /** -* Combines file section metadata, multi-file archive metadata, and the number of segments into single-file archive metadata. -* Once combined, serializes the metadata into MsgPack format. + * Combines file section metadata, multi-file archive metadata, and the number of segments into single-file archive metadata. + * Once combined, serializes the metadata into MsgPack format. * * @param multi_file_archive_metadata * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. @@ -51,19 +51,11 @@ auto get_file_infos( * @return Packed metadata. */ auto pack_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, std::vector const& segment_ids ) -> std::stringstream; -/** - * Reads the content of a file and writes it to the single-file archive. - * @param file_path - * @param archive_writer - * @throws OperationFailed if reading the file fails. - */ -void write_archive_file(std::string const& file_path, FileWriter& archive_writer); - /** * Writes single-file archive header. * @@ -81,6 +73,14 @@ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_siz auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) -> void; +/** + * Reads the content of a file and writes it to the single-file archive. + * @param file_path + * @param archive_writer + * @throws OperationFailed if reading the file fails. + */ +void write_archive_file(std::string const& file_path, FileWriter& archive_writer); + /** * Iterates over files in the multi-file archive copying their contents to the single-file archive. * Skips metadata file since already written in `write_archive_metadata`. @@ -135,11 +135,11 @@ auto get_file_infos( } auto pack_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, std::vector const& segment_ids ) -> std::stringstream { - ArchiveMetadata archive_metadata{ + MultiFileArchiveMetadata archive_metadata{ .archive_format_version = multi_file_archive_metadata.get_archive_format_version(), .variable_encoding_methods_version = multi_file_archive_metadata.get_variable_encoding_methods_version(), @@ -164,24 +164,6 @@ auto pack_single_file_archive_metadata( return buf; } -auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) - -> void { - FileReader reader(file_path.string()); - std::array read_buffer{}; - while (true) { - size_t num_bytes_read{0}; - ErrorCode const error_code - = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); - if (ErrorCode_EndOfFile == error_code) { - break; - } - if (ErrorCode_Success != error_code) { - throw OperationFailed(error_code, __FILENAME__, __LINE__); - } - archive_writer.write(read_buffer.data(), num_bytes_read); - } -} - auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void { SingleFileArchiveHeader header{ .magic{}, @@ -200,13 +182,31 @@ auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); } +auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) + -> void { + FileReader reader(file_path.string()); + std::array read_buffer{}; + while (true) { + size_t num_bytes_read{}; + ErrorCode const error_code + = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); + if (ErrorCode_EndOfFile == error_code) { + break; + } + if (ErrorCode_Success != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + archive_writer.write(read_buffer.data(), num_bytes_read); + } +} + auto write_archive_files( FileWriter& archive_writer, std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> void { - for (auto const& file_name : cStaticArchiveFileNames) { - std::filesystem::path static_archive_file_path = multi_file_archive_path / file_name; + for (auto const& static_archive_file_name : cStaticArchiveFileNames) { + std::filesystem::path static_archive_file_path = multi_file_archive_path / static_archive_file_name; write_archive_file(static_archive_file_path, archive_writer); } @@ -233,7 +233,7 @@ auto get_segment_ids(segment_id_t last_segment_id) -> std::vector { } auto create_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> std::stringstream { diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 38aeef31e..b9ad3b17b 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -21,7 +21,7 @@ class OperationFailed : public TraceableException { char const* const filename, int line_number, std::string message - = "streaming_archive::writer::single_file_archive::utils operation failed" + = "streaming_archive::single_file_archive operation failed" ) : TraceableException{error_code, filename, line_number}, m_message{std::move(message)} {} @@ -48,7 +48,7 @@ auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; * @return Packed metadata. */ auto create_single_file_archive_metadata( - clp::streaming_archive::ArchiveMetadata const& multi_file_archive_metadata, + ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> std::stringstream; From 6bbf12e6bf78d9b64e7fe1855d04179752c754e9 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 23:09:13 +0000 Subject: [PATCH 08/35] refactor --- components/core/src/clp/clp/FileCompressor.cpp | 9 ++++++--- components/core/src/clp/clp/compression.cpp | 6 ++++-- .../streaming_archive/single_file_archive/Defs.hpp | 13 +++++++------ .../single_file_archive/writer.cpp | 12 ++++++++++++ .../src/clp/streaming_archive/writer/Archive.cpp | 3 ++- .../core/src/clp/streaming_archive/writer/utils.cpp | 7 ------- .../core/src/clp/streaming_archive/writer/utils.hpp | 2 -- 7 files changed, 31 insertions(+), 21 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 9898602cc..6a1b02b24 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -243,7 +243,8 @@ void FileCompressor::parse_and_encode_with_heuristic( // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive_writer.get_use_single_file_archive()) { split_file_and_archive( archive_user_config, path_for_compression, @@ -337,7 +338,8 @@ bool FileCompressor::try_compressing_as_archive( parent_directories.emplace(file_parent_path); } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive_writer.get_use_single_file_archive()) { split_archive(archive_user_config, archive_writer); } @@ -537,7 +539,8 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( } // Split archive/encoded file if necessary before writing the new event - if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts + && false == archive.get_use_single_file_archive()) { split_file_and_archive( archive_user_config, path, diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index 88bec8784..a2a22dac1 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -136,7 +136,8 @@ bool compress( ); } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries + && false == archive_writer.get_use_single_file_archive()) { split_archive(archive_user_config, archive_writer); } if (false @@ -164,7 +165,8 @@ bool compress( file_group_id_comparator); // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries + && false == archive_writer.get_use_single_file_archive()) { split_archive(archive_user_config, archive_writer); } if (false diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index b44118b98..d7ad86cf9 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -20,12 +20,13 @@ constexpr single_file_archive_format_version_t cArchiveVersion{ cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion }; -static constexpr size_t cNumMagicNumberChars = 4; -static constexpr std::array cUnstructuredSfaMagicNumber = {'Y', 'C', 'L', 'P'}; -static constexpr std::string_view cUnstructuredSfaExtension = ".clp"; +static constexpr size_t cNumMagicNumberChars{4}; +static constexpr std::array cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; +static constexpr std::string_view cUnstructuredSfaExtension{".clp"}; +static constexpr size_t cFileSizeWarningThreshold{100L * 1024 * 1024}; -static constexpr size_t cNumStaticFiles = 5; -constexpr std::array cStaticArchiveFileNames = { +static constexpr size_t cNumStaticFiles{5}; +constexpr std::array cStaticArchiveFileNames{ cMetadataDBFileName, cLogTypeDictFilename, cLogTypeSegmentIndexFilename, @@ -33,7 +34,7 @@ constexpr std::array cStaticArchiveFileNames = { cVarSegmentIndexFilename }; -static constexpr size_t cNumUnused = 4; +static constexpr size_t cNumUnused{6}; struct __attribute__((packed)) SingleFileArchiveHeader { std::array magic; single_file_archive_format_version_t version; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index e4d32b94f..f685d7622 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -131,6 +132,17 @@ auto get_file_infos( // Add sentinel indicating total size of all files. files.emplace_back(FileInfo{"", offset}); + // Decompression of large single-file archives will consume excessive memory since + // single-file archives are not split. + if (offset > cFileSizeWarningThreshold) { + SPDLOG_WARN( + "Single file archive size exceeded {}. " + "The single-file archive format is not intended for large archives, " + " consider using multi-file archive format instead.", + cFileSizeWarningThreshold + ); + } + return files; } diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 5cbadbe00..604826756 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -338,7 +338,8 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { m_old_ts_pattern = timestamp_pattern; } } - if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { + if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts + && false == m_use_single_file_archive) { split_file_and_archive( m_archive_user_config, m_path_for_compression, diff --git a/components/core/src/clp/streaming_archive/writer/utils.cpp b/components/core/src/clp/streaming_archive/writer/utils.cpp index 983f61b24..60a7c2448 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.cpp +++ b/components/core/src/clp/streaming_archive/writer/utils.cpp @@ -12,13 +12,6 @@ using std::string; namespace clp::streaming_archive::writer { auto split_archive(Archive::UserConfig& archive_user_config, Archive& archive_writer) -> void { - if (archive_writer.get_use_single_file_archive()) { - SPDLOG_WARN( - "Target size of dictionaries exceeded. " - "Single-file archive cannot be split into multiple archives." - ); - return; - } archive_writer.close(); archive_user_config.id = boost::uuids::random_generator()(); ++archive_user_config.creation_num; diff --git a/components/core/src/clp/streaming_archive/writer/utils.hpp b/components/core/src/clp/streaming_archive/writer/utils.hpp index 606c72a1a..5c6685408 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.hpp +++ b/components/core/src/clp/streaming_archive/writer/utils.hpp @@ -11,8 +11,6 @@ namespace clp::streaming_archive::writer { /** * Closes the current archive and starts a new one * - * NOTE: This function has no effect on single-file archives - * * @param archive_user_config * @param archive_writer */ From 5c75147246ea26ad97befee01679798a3675039a Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 23:22:40 +0000 Subject: [PATCH 09/35] remove extra line --- components/core/src/clp/streaming_archive/writer/utils.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/writer/utils.hpp b/components/core/src/clp/streaming_archive/writer/utils.hpp index 5c6685408..be56d79da 100644 --- a/components/core/src/clp/streaming_archive/writer/utils.hpp +++ b/components/core/src/clp/streaming_archive/writer/utils.hpp @@ -10,7 +10,6 @@ namespace clp::streaming_archive::writer { /** * Closes the current archive and starts a new one - * * @param archive_user_config * @param archive_writer */ From c1f12df0a13b1c1ada67109a933f0ebcc9145166 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 23:26:18 +0000 Subject: [PATCH 10/35] fix lint --- .../core/src/clp/clp/FileCompressor.cpp | 9 ++++++--- components/core/src/clp/clp/compression.cpp | 6 ++++-- .../clp/streaming_archive/ArchiveMetadata.hpp | 14 ++++++++++---- .../single_file_archive/Defs.hpp | 6 ++++-- .../single_file_archive/writer.cpp | 19 ++++++++++--------- .../single_file_archive/writer.hpp | 3 +-- .../clp/streaming_archive/writer/Archive.cpp | 9 +++++++-- 7 files changed, 42 insertions(+), 24 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 6a1b02b24..d71f8ac6d 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -244,7 +244,8 @@ void FileCompressor::parse_and_encode_with_heuristic( // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) { + && false == archive_writer.get_use_single_file_archive()) + { split_file_and_archive( archive_user_config, path_for_compression, @@ -339,7 +340,8 @@ bool FileCompressor::try_compressing_as_archive( } if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) { + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } @@ -540,7 +542,8 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( // Split archive/encoded file if necessary before writing the new event if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive.get_use_single_file_archive()) { + && false == archive.get_use_single_file_archive()) + { split_file_and_archive( archive_user_config, path, diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index a2a22dac1..cd4619eaa 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -137,7 +137,8 @@ bool compress( } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) { + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } if (false @@ -166,7 +167,8 @@ bool compress( // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) { + && false == archive_writer.get_use_single_file_archive()) + { split_archive(archive_user_config, archive_writer); } if (false diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index e81aa88d3..79df0ac7a 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -4,10 +4,10 @@ #include #include "../Defs.h" +#include "../ffi/encoding_methods.hpp" #include "../FileReader.hpp" #include "../FileWriter.hpp" #include "Constants.hpp" -#include "../ffi/encoding_methods.hpp" namespace clp::streaming_archive { @@ -83,11 +83,17 @@ class ArchiveMetadata { [[nodiscard]] auto get_end_timestamp() const { return m_end_timestamp; } - [[nodiscard]] auto get_variable_encoding_methods_version() const -> std::string const& { return m_variable_encoding_methods_version; } + [[nodiscard]] auto get_variable_encoding_methods_version() const -> std::string const& { + return m_variable_encoding_methods_version; + } - [[nodiscard]] auto get_variables_schema_version() const -> std::string const& { return m_variables_schema_version; } + [[nodiscard]] auto get_variables_schema_version() const -> std::string const& { + return m_variables_schema_version; + } - [[nodiscard]] auto get_compression_type() const -> std::string const& { return m_compression_type; } + [[nodiscard]] auto get_compression_type() const -> std::string const& { + return m_compression_type; + } /** * Expands the archive's time range based to encompass the given time range diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index d7ad86cf9..c04888d6f 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -21,12 +21,13 @@ constexpr single_file_archive_format_version_t cArchiveVersion{ }; static constexpr size_t cNumMagicNumberChars{4}; -static constexpr std::array cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; +static constexpr std::array + cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; static constexpr std::string_view cUnstructuredSfaExtension{".clp"}; static constexpr size_t cFileSizeWarningThreshold{100L * 1024 * 1024}; static constexpr size_t cNumStaticFiles{5}; -constexpr std::array cStaticArchiveFileNames{ +constexpr std::array cStaticArchiveFileNames{ cMetadataDBFileName, cLogTypeDictFilename, cLogTypeSegmentIndexFilename, @@ -35,6 +36,7 @@ constexpr std::array cStaticArchiveFileNames{ }; static constexpr size_t cNumUnused{6}; + struct __attribute__((packed)) SingleFileArchiveHeader { std::array magic; single_file_archive_format_version_t version; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index f685d7622..5a242a81d 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -2,11 +2,11 @@ #include #include -#include #include #include #include +#include #include "../../ErrorCode.hpp" #include "../../TraceableException.hpp" @@ -43,8 +43,8 @@ auto get_file_infos( ) -> std::vector; /** - * Combines file section metadata, multi-file archive metadata, and the number of segments into single-file archive metadata. - * Once combined, serializes the metadata into MsgPack format. + * Combines file section metadata, multi-file archive metadata, and the number of segments into + * single-file archive metadata. Once combined, serializes the metadata into MsgPack format. * * @param multi_file_archive_metadata * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. @@ -72,7 +72,7 @@ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_siz * @param packed_metadata Packed metadata. */ auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) - -> void; + -> void; /** * Reads the content of a file and writes it to the single-file archive. @@ -136,10 +136,10 @@ auto get_file_infos( // single-file archives are not split. if (offset > cFileSizeWarningThreshold) { SPDLOG_WARN( - "Single file archive size exceeded {}. " - "The single-file archive format is not intended for large archives, " - " consider using multi-file archive format instead.", - cFileSizeWarningThreshold + "Single file archive size exceeded {}. " + "The single-file archive format is not intended for large archives, " + " consider using multi-file archive format instead.", + cFileSizeWarningThreshold ); } @@ -218,7 +218,8 @@ auto write_archive_files( std::vector const& segment_ids ) -> void { for (auto const& static_archive_file_name : cStaticArchiveFileNames) { - std::filesystem::path static_archive_file_path = multi_file_archive_path / static_archive_file_name; + std::filesystem::path static_archive_file_path + = multi_file_archive_path / static_archive_file_name; write_archive_file(static_archive_file_path, archive_writer); } diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index b9ad3b17b..02b3e8328 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -20,8 +20,7 @@ class OperationFailed : public TraceableException { ErrorCode error_code, char const* const filename, int line_number, - std::string message - = "streaming_archive::single_file_archive operation failed" + std::string message = "streaming_archive::single_file_archive operation failed" ) : TraceableException{error_code, filename, line_number}, m_message{std::move(message)} {} diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 604826756..5cd685551 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -339,7 +339,8 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { } } if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts - && false == m_use_single_file_archive) { + && false == m_use_single_file_archive) + { split_file_and_archive( m_archive_user_config, m_path_for_compression, @@ -675,7 +676,11 @@ void Archive::create_single_file_archive() { segment_ids ); - single_file_archive::write_single_file_archive(multi_file_archive_path, packed_metadata, segment_ids); + single_file_archive::write_single_file_archive( + multi_file_archive_path, + packed_metadata, + segment_ids + ); } // Explicitly declare template specializations so that we can define the template methods in this From 0ab6e0e3badf5ac7bd8e4b96cdd38ed1da533eee Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 23:37:32 +0000 Subject: [PATCH 11/35] fix lint --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 5a242a81d..66db722ec 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -119,7 +119,7 @@ auto get_file_infos( uint64_t offset = 0; for (auto const& static_archive_file_name : cStaticArchiveFileNames) { - files.emplace_back(static_archive_file_name, offset); + files.emplace_back(std::string(static_archive_file_name), offset); update_offset(multi_file_archive_path / static_archive_file_name, offset); } From d4ed4f6a0f11e24803f3a715222e35cf7e802d0e Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sat, 28 Dec 2024 23:47:39 +0000 Subject: [PATCH 12/35] fix lint --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 66db722ec..246f4ea01 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -119,13 +119,13 @@ auto get_file_infos( uint64_t offset = 0; for (auto const& static_archive_file_name : cStaticArchiveFileNames) { - files.emplace_back(std::string(static_archive_file_name), offset); + files.emplace_back(FileInfo{std::string(static_archive_file_name), offset}); update_offset(multi_file_archive_path / static_archive_file_name, offset); } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; for (auto const& segment_id : segment_ids) { - files.emplace_back(segment_id, offset); + files.emplace_back(FileInfo{segment_id, offset}); update_offset(segment_dir_path / segment_id, offset); } From 82b98029c2601c74f83fbe71cfe29986d0daad83 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 29 Dec 2024 00:05:27 +0000 Subject: [PATCH 13/35] fix lint? --- .../clp/streaming_archive/writer/Archive.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 5cd685551..dec65c506 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -20,7 +20,6 @@ #include "../../spdlog_with_specializations.hpp" #include "../../Utils.hpp" #include "../Constants.hpp" -#include "../single_file_archive/Defs.hpp" #include "../single_file_archive/writer.hpp" #include "utils.hpp" @@ -663,20 +662,22 @@ void Archive::update_metadata() { void Archive::create_single_file_archive() { std::filesystem::path multi_file_archive_path = m_path; - auto segment_ids = single_file_archive::get_segment_ids(m_next_segment_id - 1); + auto segment_ids + = clp::streaming_archive::single_file_archive::get_segment_ids(m_next_segment_id - 1); if (false == m_local_metadata.has_value()) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } auto& multi_file_archive_metadata = m_local_metadata.value(); - auto packed_metadata = single_file_archive::create_single_file_archive_metadata( - multi_file_archive_metadata, - multi_file_archive_path, - segment_ids - ); + auto packed_metadata + = clp::streaming_archive::single_file_archive::create_single_file_archive_metadata( + multi_file_archive_metadata, + multi_file_archive_path, + segment_ids + ); - single_file_archive::write_single_file_archive( + clp::streaming_archive::single_file_archive::write_single_file_archive( multi_file_archive_path, packed_metadata, segment_ids From 393049bf071047007247a4fa35bfc2b3cdd95d65 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 29 Dec 2024 00:32:25 +0000 Subject: [PATCH 14/35] fix lint? --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 246f4ea01..91932ff65 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -184,9 +184,7 @@ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_siz .unused{} }; std::memcpy(&header.magic, cUnstructuredSfaMagicNumber.data(), sizeof(header.magic)); - - auto char_buffer = std::bit_cast>(header); - archive_writer.write(char_buffer.data(), char_buffer.size()); + archive_writer.write(reinterpret_cast(&header), sizeof(header)); } auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) From 54284034155d956976800a41dcc908ba2dfe8947 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 29 Dec 2024 00:51:37 +0000 Subject: [PATCH 15/35] fix lint --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 1 + .../src/clp/streaming_archive/single_file_archive/writer.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 91932ff65..72f836891 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -8,6 +8,7 @@ #include #include +#include "../../Defs.h" #include "../../ErrorCode.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 02b3e8328..0dee9c4fe 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -7,6 +7,7 @@ #include +#include "../../Defs.h" #include "../../ErrorCode.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" From 7e261f7142a8e012522032460db9c4e7221ccc3e Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 29 Dec 2024 02:09:14 +0000 Subject: [PATCH 16/35] fix lint --- .../core/src/clp/streaming_archive/single_file_archive/Defs.hpp | 2 +- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index c04888d6f..5b501c1bb 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -4,7 +4,7 @@ #include #include -#include "../clp/Defs.h" +#include "../../Defs.h" #include "../Constants.hpp" #include "msgpack.hpp" diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 72f836891..4231ee941 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -10,6 +10,7 @@ #include "../../Defs.h" #include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" #include "../Constants.hpp" From 0bd9b27585c0366f9d495e9c021ac5b1ac41d30f Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 29 Dec 2024 02:54:46 +0000 Subject: [PATCH 17/35] fix lint --- components/core/CMakeLists.txt | 3 +++ .../clp/streaming_archive/single_file_archive/writer.cpp | 6 ++++-- .../clp/streaming_archive/single_file_archive/writer.hpp | 4 ++-- .../core/src/clp/streaming_archive/writer/Archive.cpp | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index ce74f04cc..ae9ed4620 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -505,6 +505,9 @@ set(SOURCE_FILES_unitTest src/clp/streaming_archive/reader/Segment.hpp src/clp/streaming_archive/reader/SegmentManager.cpp src/clp/streaming_archive/reader/SegmentManager.hpp + src/clp/streaming_archive/single_file_archive/Defs.hpp + src/clp/streaming_archive/single_file_archive/writer.cpp + src/clp/streaming_archive/single_file_archive/writer.hpp src/clp/streaming_archive/writer/Archive.cpp src/clp/streaming_archive/writer/Archive.hpp src/clp/streaming_archive/writer/File.cpp diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 4231ee941..ad7160018 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -10,6 +11,7 @@ #include "../../Defs.h" #include "../../ErrorCode.hpp" +#include "../../FileReader.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../ArchiveMetadata.hpp" @@ -254,11 +256,11 @@ auto create_single_file_archive_metadata( return pack_single_file_archive_metadata(multi_file_archive_metadata, file_infos, segment_ids); } -void write_single_file_archive( +auto write_single_file_archive( std::filesystem::path const& multi_file_archive_path, std::stringstream const& packed_metadata, std::vector const& segment_ids -) { +) -> void { FileWriter archive_writer; std::filesystem::path single_file_archive_path = multi_file_archive_path.string() diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 0dee9c4fe..7a5947b02 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -62,11 +62,11 @@ auto create_single_file_archive_metadata( * @param segment_ids * @throws OperationFailed if single-file archive path already exists. */ -void write_single_file_archive( +auto write_single_file_archive( std::filesystem::path const& multi_file_archive_path, std::stringstream const& packed_metadata, std::vector const& segment_ids -); +) -> void; } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index dec65c506..94cfba5c5 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -15,6 +15,7 @@ #include #include +#include "../../Defs.h" #include "../../EncodedVariableInterpreter.hpp" #include "../../ir/types.hpp" #include "../../spdlog_with_specializations.hpp" From d20763059f1ef864f6596335c4f24fcc0803e383 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Mon, 13 Jan 2025 21:32:04 +0000 Subject: [PATCH 18/35] haiqi review --- .../core/src/clp/clp/CommandLineArguments.cpp | 2 +- .../core/src/clp/clp/CommandLineArguments.hpp | 2 +- components/core/src/clp/clp/compression.cpp | 2 +- .../single_file_archive/writer.cpp | 80 ++++++++++--------- .../single_file_archive/writer.hpp | 28 +------ .../clp/streaming_archive/writer/Archive.cpp | 19 +---- .../clp/streaming_archive/writer/Archive.hpp | 4 +- 7 files changed, 57 insertions(+), 80 deletions(-) diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index 679734b11..266d1be62 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -376,7 +376,7 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { )( "single-file-archive", po::bool_switch(&m_single_file_archive), - "Output archive as a single-file" + "Output archive as a single-file archive" ); po::options_description all_compression_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index 32e19432c..307427210 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -46,7 +46,7 @@ class CommandLineArguments : public CommandLineArgumentsBase { bool show_progress() const { return m_show_progress; } - bool get_use_single_file_archive() const { return m_single_file_archive; } + [[nodiscard]] auto single_file_archive() const -> bool { return m_single_file_archive; } bool sort_input_files() const { return m_sort_input_files; } diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index cd4619eaa..05a001851 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -107,7 +107,7 @@ bool compress( archive_user_config.global_metadata_db = global_metadata_db.get(); archive_user_config.print_archive_stats_progress = command_line_args.print_archive_stats_progress(); - archive_user_config.use_single_file_archive = command_line_args.get_use_single_file_archive(); + archive_user_config.use_single_file_archive = command_line_args.single_file_archive(); // Open Archive streaming_archive::writer::Archive archive_writer; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index ad7160018..086edcd65 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -1,5 +1,6 @@ #include "writer.hpp" +#include #include #include #include @@ -23,6 +24,12 @@ namespace clp::streaming_archive::single_file_archive { namespace { constexpr size_t cReadBlockSize = 4096; +/** + * @param next_segment_id ID of the next segment to be created in the archive. + * @return Vector of segment IDs. + */ +[[nodiscard]] auto get_segment_ids(segment_id_t next_segment_id) -> std::vector; + /** * Gets the size of a file specified by `file_path` and adds it to file section `offset`. * @param file_path @@ -30,7 +37,7 @@ constexpr size_t cReadBlockSize = 4096; * represents the starting position of the next file in single-file archive. * @throws OperationFailed if error getting file size. */ -void update_offset(std::filesystem::path const& file_path, uint64_t& offset); +auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void; /** * Generates metadata for the file section of a single-file archive. The metadata consists @@ -41,7 +48,7 @@ void update_offset(std::filesystem::path const& file_path, uint64_t& offset); * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. * @throws Propagates `update_offset`'s exceptions. */ -auto get_file_infos( +[[nodiscard]] auto get_file_infos( std::filesystem::path const& multi_file_archive_path, std::vector const& segment_ids ) -> std::vector; @@ -52,13 +59,13 @@ auto get_file_infos( * * @param multi_file_archive_metadata * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @param segment_ids + * @param num_segments * @return Packed metadata. */ -auto pack_single_file_archive_metadata( +[[nodiscard]] auto pack_single_file_archive_metadata( ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, - std::vector const& segment_ids + size_t num_segments ) -> std::stringstream; /** @@ -84,7 +91,7 @@ auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& * @param archive_writer * @throws OperationFailed if reading the file fails. */ -void write_archive_file(std::string const& file_path, FileWriter& archive_writer); +auto write_archive_file(std::string const& file_path, FileWriter& archive_writer) -> void; /** * Iterates over files in the multi-file archive copying their contents to the single-file archive. @@ -101,7 +108,21 @@ auto write_archive_files( std::vector const& segment_ids ) -> void; -void update_offset(std::filesystem::path const& file_path, uint64_t& offset) { +auto get_segment_ids(segment_id_t next_segment_id) -> std::vector { + std::vector segment_ids; + + if (next_segment_id == 0) { + return segment_ids; + } + + for (size_t i = 0; i < next_segment_id; ++i) { + segment_ids.emplace_back(std::to_string(i)); + } + + return segment_ids; +} + +auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void { try { auto size = std::filesystem::file_size(file_path); offset += size; @@ -124,13 +145,13 @@ auto get_file_infos( for (auto const& static_archive_file_name : cStaticArchiveFileNames) { files.emplace_back(FileInfo{std::string(static_archive_file_name), offset}); - update_offset(multi_file_archive_path / static_archive_file_name, offset); + get_file_size_and_update_offset(multi_file_archive_path / static_archive_file_name, offset); } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; for (auto const& segment_id : segment_ids) { files.emplace_back(FileInfo{segment_id, offset}); - update_offset(segment_dir_path / segment_id, offset); + get_file_size_and_update_offset(segment_dir_path / segment_id, offset); } // Add sentinel indicating total size of all files. @@ -153,7 +174,7 @@ auto get_file_infos( auto pack_single_file_archive_metadata( ArchiveMetadata const& multi_file_archive_metadata, std::vector const& file_infos, - std::vector const& segment_ids + size_t num_segments ) -> std::stringstream { MultiFileArchiveMetadata archive_metadata{ .archive_format_version = multi_file_archive_metadata.get_archive_format_version(), @@ -171,7 +192,7 @@ auto pack_single_file_archive_metadata( SingleFileArchiveMetadata single_file_archive{ .archive_files = file_infos, .archive_metadata = archive_metadata, - .num_segments = segment_ids.size(), + .num_segments = num_segments, }; std::stringstream buf; @@ -233,34 +254,12 @@ auto write_archive_files( } } // namespace -auto get_segment_ids(segment_id_t last_segment_id) -> std::vector { - std::vector segment_ids; - - if (last_segment_id < 0) { - return segment_ids; - } - - for (size_t i = 0; i <= last_segment_id; ++i) { - segment_ids.emplace_back(std::to_string(i)); - } - - return segment_ids; -} - -auto create_single_file_archive_metadata( - ArchiveMetadata const& multi_file_archive_metadata, - std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids -) -> std::stringstream { - auto file_infos = get_file_infos(multi_file_archive_path, segment_ids); - return pack_single_file_archive_metadata(multi_file_archive_metadata, file_infos, segment_ids); -} - auto write_single_file_archive( + ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, - std::stringstream const& packed_metadata, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> void { + FileWriter archive_writer; std::filesystem::path single_file_archive_path = multi_file_archive_path.string() @@ -275,6 +274,15 @@ auto write_single_file_archive( FileWriter::OpenMode::CREATE_FOR_WRITING ); + auto const segment_ids + = clp::streaming_archive::single_file_archive::get_segment_ids(next_segment_id); + + auto const packed_metadata = pack_single_file_archive_metadata( + multi_file_archive_metadata, + get_file_infos(multi_file_archive_path, segment_ids), + segment_ids.size() + ); + write_archive_header(archive_writer, packed_metadata.str().size()); write_archive_metadata(archive_writer, packed_metadata); write_archive_files(archive_writer, multi_file_archive_path, segment_ids); diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 7a5947b02..7bd145ad5 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -33,39 +33,19 @@ class OperationFailed : public TraceableException { std::string m_message; }; -/** - * @param last_segment_id ID of last written segment in archive. - * @return Vector of segment IDs. - */ -auto get_segment_ids(segment_id_t last_segment_id) -> std::vector; - -/** - * Generates packed single-file archive metadata. - * - * @param multi_file_archive_metadata - * @param multi_file_archive_path - * @param segment_ids - * @return Packed metadata. - */ -auto create_single_file_archive_metadata( - ArchiveMetadata const& multi_file_archive_metadata, - std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids -) -> std::stringstream; - /** * Writes header, metadata and archive files in single-file format then * removes existing multi-file archive. * + * @param multi_file_archive_metadata * @param multi_file_archive_path - * @param packed_metadata - * @param segment_ids + * @param next_segment_id * @throws OperationFailed if single-file archive path already exists. */ auto write_single_file_archive( + ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, - std::stringstream const& packed_metadata, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> void; } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 94cfba5c5..0b2ea0072 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -661,27 +661,16 @@ void Archive::update_metadata() { } void Archive::create_single_file_archive() { - std::filesystem::path multi_file_archive_path = m_path; - - auto segment_ids - = clp::streaming_archive::single_file_archive::get_segment_ids(m_next_segment_id - 1); - if (false == m_local_metadata.has_value()) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - auto& multi_file_archive_metadata = m_local_metadata.value(); - auto packed_metadata - = clp::streaming_archive::single_file_archive::create_single_file_archive_metadata( - multi_file_archive_metadata, - multi_file_archive_path, - segment_ids - ); + auto const& multi_file_archive_metadata = m_local_metadata.value(); clp::streaming_archive::single_file_archive::write_single_file_archive( - multi_file_archive_path, - packed_metadata, - segment_ids + multi_file_archive_metadata, + m_path, + m_next_segment_id ); } diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index b4a08f3a2..672b048d4 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -194,7 +194,7 @@ class Archive { return m_logtype_dict.get_data_size() + m_var_dict.get_data_size(); } - bool get_use_single_file_archive() const { return m_use_single_file_archive; } + [[nodiscard]] auto get_use_single_file_archive() const -> bool { return m_use_single_file_archive; } private: // Types @@ -285,7 +285,7 @@ class Archive { /** * Writes archive to disk in single-file format then removes existing multi-file archive. */ - void create_single_file_archive(); + auto create_single_file_archive() -> void; // Variables boost::uuids::uuid m_id; From c2fd37d896ecbb4613e49326072ce78bdeb8335a Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Mon, 13 Jan 2025 21:40:33 +0000 Subject: [PATCH 19/35] haiqi review --- .../single_file_archive/writer.cpp | 16 ++++++++-------- .../single_file_archive/writer.hpp | 1 - .../src/clp/streaming_archive/writer/Archive.hpp | 4 +++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 086edcd65..fd7214a4c 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -37,7 +37,8 @@ constexpr size_t cReadBlockSize = 4096; * represents the starting position of the next file in single-file archive. * @throws OperationFailed if error getting file size. */ -auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void; +auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) + -> void; /** * Generates metadata for the file section of a single-file archive. The metadata consists @@ -122,7 +123,8 @@ auto get_segment_ids(segment_id_t next_segment_id) -> std::vector { return segment_ids; } -auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void { +auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) + -> void { try { auto size = std::filesystem::file_size(file_path); offset += size; @@ -259,7 +261,6 @@ auto write_single_file_archive( std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void { - FileWriter archive_writer; std::filesystem::path single_file_archive_path = multi_file_archive_path.string() @@ -274,13 +275,12 @@ auto write_single_file_archive( FileWriter::OpenMode::CREATE_FOR_WRITING ); - auto const segment_ids - = clp::streaming_archive::single_file_archive::get_segment_ids(next_segment_id); + auto const segment_ids = get_segment_ids(next_segment_id); auto const packed_metadata = pack_single_file_archive_metadata( - multi_file_archive_metadata, - get_file_infos(multi_file_archive_path, segment_ids), - segment_ids.size() + multi_file_archive_metadata, + get_file_infos(multi_file_archive_path, segment_ids), + segment_ids.size() ); write_archive_header(archive_writer, packed_metadata.str().size()); diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 7bd145ad5..5f4874fb9 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -3,7 +3,6 @@ #include #include -#include #include diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index 672b048d4..197ec50a5 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -194,7 +194,9 @@ class Archive { return m_logtype_dict.get_data_size() + m_var_dict.get_data_size(); } - [[nodiscard]] auto get_use_single_file_archive() const -> bool { return m_use_single_file_archive; } + [[nodiscard]] auto get_use_single_file_archive() const -> bool { + return m_use_single_file_archive; + } private: // Types From cdafb8d4b57b4232528d5aa381bb5f167dfe3e77 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Thu, 16 Jan 2025 18:37:25 +0000 Subject: [PATCH 20/35] haiqi review --- .../clp/streaming_archive/ArchiveMetadata.hpp | 15 ++++ .../single_file_archive/Defs.hpp | 33 ++----- .../single_file_archive/writer.cpp | 87 +++++++------------ .../single_file_archive/writer.hpp | 2 +- 4 files changed, 52 insertions(+), 85 deletions(-) diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index 79df0ac7a..747b59e6a 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -8,6 +8,7 @@ #include "../FileReader.hpp" #include "../FileWriter.hpp" #include "Constants.hpp" +#include "msgpack.hpp" namespace clp::streaming_archive { @@ -104,6 +105,20 @@ class ArchiveMetadata { void write_to_file(FileWriter& file_writer) const; + // MsgPack serialization used for single-file archive format. Variables are renamed when + // serialized to match single-file archive specification. + MSGPACK_DEFINE_MAP( + MSGPACK_NVP("archive_format_version", m_archive_format_version), + MSGPACK_NVP("variable_encoding_methods_version", m_variable_encoding_methods_version), + MSGPACK_NVP("variables_schema_version", m_variables_schema_version), + MSGPACK_NVP("compression_type", m_compression_type), + MSGPACK_NVP("creator_id", m_creator_id), + MSGPACK_NVP("begin_timestamp", m_begin_timestamp), + MSGPACK_NVP("end_timestamp", m_end_timestamp), + MSGPACK_NVP("uncompressed_size", m_uncompressed_size), + MSGPACK_NVP("compressed_size", m_compressed_size) + ); + private: // Variables archive_format_version_t m_archive_format_version{cArchiveFormatVersion}; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 5b501c1bb..661968781 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -7,6 +7,7 @@ #include "../../Defs.h" #include "../Constants.hpp" #include "msgpack.hpp" +#include "streaming_archive/ArchiveMetadata.hpp" namespace clp::streaming_archive::single_file_archive { @@ -45,37 +46,15 @@ struct __attribute__((packed)) SingleFileArchiveHeader { }; struct FileInfo { - std::string n; - uint64_t o; - MSGPACK_DEFINE_MAP(n, o); -}; - -struct MultiFileArchiveMetadata { - archive_format_version_t archive_format_version; - std::string variable_encoding_methods_version; - std::string variables_schema_version; - std::string compression_type; - std::string creator_id; - epochtime_t begin_timestamp; - epochtime_t end_timestamp; - uint64_t uncompressed_size; - uint64_t compressed_size; - MSGPACK_DEFINE_MAP( - archive_format_version, - variable_encoding_methods_version, - variables_schema_version, - compression_type, - creator_id, - begin_timestamp, - end_timestamp, - uncompressed_size, - compressed_size - ); + std::string name; + uint64_t offset; + // Variables are renamed when serialized to match single-file archive specification. + MSGPACK_DEFINE_MAP(MSGPACK_NVP("n", name), MSGPACK_NVP("o", offset)); }; struct SingleFileArchiveMetadata { std::vector archive_files; - MultiFileArchiveMetadata archive_metadata; + ArchiveMetadata archive_metadata; uint64_t num_segments; MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments); }; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index fd7214a4c..d3a093e79 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -24,12 +24,6 @@ namespace clp::streaming_archive::single_file_archive { namespace { constexpr size_t cReadBlockSize = 4096; -/** - * @param next_segment_id ID of the next segment to be created in the archive. - * @return Vector of segment IDs. - */ -[[nodiscard]] auto get_segment_ids(segment_id_t next_segment_id) -> std::vector; - /** * Gets the size of a file specified by `file_path` and adds it to file section `offset`. * @param file_path @@ -45,13 +39,13 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin * of a list of file names and their corresponding starting offsets. * * @param multi_file_archive_path - * @param segment_ids + * @param next_segment_id * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. * @throws Propagates `update_offset`'s exceptions. */ [[nodiscard]] auto get_file_infos( std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> std::vector; /** @@ -59,14 +53,14 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin * single-file archive metadata. Once combined, serializes the metadata into MsgPack format. * * @param multi_file_archive_metadata - * @param file_infos Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @param num_segments + * @param multi_file_archive_path + * @param next_segment_id * @return Packed metadata. */ [[nodiscard]] auto pack_single_file_archive_metadata( ArchiveMetadata const& multi_file_archive_metadata, - std::vector const& file_infos, - size_t num_segments + std::filesystem::path const& multi_file_archive_path, + segment_id_t next_segment_id ) -> std::stringstream; /** @@ -100,29 +94,15 @@ auto write_archive_file(std::string const& file_path, FileWriter& archive_writer * * @param archive_writer * @param multi_file_archive_path - * @param segment_ids + * @param next_segment_id * @throws Propagates `update_offset`'s exceptions. */ auto write_archive_files( FileWriter& archive_writer, std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> void; -auto get_segment_ids(segment_id_t next_segment_id) -> std::vector { - std::vector segment_ids; - - if (next_segment_id == 0) { - return segment_ids; - } - - for (size_t i = 0; i < next_segment_id; ++i) { - segment_ids.emplace_back(std::to_string(i)); - } - - return segment_ids; -} - auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void { try { @@ -140,7 +120,7 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin auto get_file_infos( std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> std::vector { std::vector files; uint64_t offset = 0; @@ -151,7 +131,9 @@ auto get_file_infos( } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; - for (auto const& segment_id : segment_ids) { + + for (size_t i = 0; i < next_segment_id; ++i) { + auto const segment_id = std::to_string(i); files.emplace_back(FileInfo{segment_id, offset}); get_file_size_and_update_offset(segment_dir_path / segment_id, offset); } @@ -175,26 +157,13 @@ auto get_file_infos( auto pack_single_file_archive_metadata( ArchiveMetadata const& multi_file_archive_metadata, - std::vector const& file_infos, - size_t num_segments + std::filesystem::path const& multi_file_archive_path, + segment_id_t next_segment_id ) -> std::stringstream { - MultiFileArchiveMetadata archive_metadata{ - .archive_format_version = multi_file_archive_metadata.get_archive_format_version(), - .variable_encoding_methods_version - = multi_file_archive_metadata.get_variable_encoding_methods_version(), - .variables_schema_version = multi_file_archive_metadata.get_variables_schema_version(), - .compression_type = multi_file_archive_metadata.get_compression_type(), - .creator_id = multi_file_archive_metadata.get_creator_id(), - .begin_timestamp = multi_file_archive_metadata.get_begin_timestamp(), - .end_timestamp = multi_file_archive_metadata.get_end_timestamp(), - .uncompressed_size = multi_file_archive_metadata.get_uncompressed_size_bytes(), - .compressed_size = multi_file_archive_metadata.get_compressed_size_bytes(), - }; - SingleFileArchiveMetadata single_file_archive{ - .archive_files = file_infos, - .archive_metadata = archive_metadata, - .num_segments = num_segments, + .archive_files = get_file_infos(multi_file_archive_path, next_segment_id), + .archive_metadata = multi_file_archive_metadata, + .num_segments = next_segment_id, }; std::stringstream buf; @@ -210,7 +179,13 @@ auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_siz .metadata_size = packed_metadata_size, .unused{} }; - std::memcpy(&header.magic, cUnstructuredSfaMagicNumber.data(), sizeof(header.magic)); + + static_assert(cUnstructuredSfaMagicNumber.size() == header.magic.size()); + std::memcpy( + &header.magic, + cUnstructuredSfaMagicNumber.data(), + cUnstructuredSfaMagicNumber.size() + ); archive_writer.write(reinterpret_cast(&header), sizeof(header)); } @@ -240,7 +215,7 @@ auto write_archive_file(std::filesystem::path const& file_path, FileWriter& arch auto write_archive_files( FileWriter& archive_writer, std::filesystem::path const& multi_file_archive_path, - std::vector const& segment_ids + segment_id_t next_segment_id ) -> void { for (auto const& static_archive_file_name : cStaticArchiveFileNames) { std::filesystem::path static_archive_file_path @@ -249,8 +224,8 @@ auto write_archive_files( } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; - for (auto const& segment_id : segment_ids) { - std::filesystem::path segment_path = segment_dir_path / segment_id; + for (size_t i = 0; i < next_segment_id; ++i) { + std::filesystem::path segment_path = segment_dir_path / std::to_string(i); write_archive_file(segment_path, archive_writer); } } @@ -275,17 +250,15 @@ auto write_single_file_archive( FileWriter::OpenMode::CREATE_FOR_WRITING ); - auto const segment_ids = get_segment_ids(next_segment_id); - auto const packed_metadata = pack_single_file_archive_metadata( multi_file_archive_metadata, - get_file_infos(multi_file_archive_path, segment_ids), - segment_ids.size() + multi_file_archive_path, + next_segment_id ); write_archive_header(archive_writer, packed_metadata.str().size()); write_archive_metadata(archive_writer, packed_metadata); - write_archive_files(archive_writer, multi_file_archive_path, segment_ids); + write_archive_files(archive_writer, multi_file_archive_path, next_segment_id); archive_writer.close(); try { diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 5f4874fb9..25ccb0175 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -38,7 +38,7 @@ class OperationFailed : public TraceableException { * * @param multi_file_archive_metadata * @param multi_file_archive_path - * @param next_segment_id + * @param next_segment_id ID of the next segment to be created in the archive. * @throws OperationFailed if single-file archive path already exists. */ auto write_single_file_archive( From dcadf0419ad233efe2da01c35b205637b5ab81e0 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Thu, 16 Jan 2025 18:48:33 +0000 Subject: [PATCH 21/35] haiqi review --- .../core/src/clp/streaming_archive/single_file_archive/Defs.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 661968781..9d44e64f7 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -5,9 +5,9 @@ #include #include "../../Defs.h" +#include "../ArchiveMetadata.hpp" #include "../Constants.hpp" #include "msgpack.hpp" -#include "streaming_archive/ArchiveMetadata.hpp" namespace clp::streaming_archive::single_file_archive { From 265b4e421fcb7adfb3bd2a28068e3f4043d01aee Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Thu, 16 Jan 2025 19:22:13 +0000 Subject: [PATCH 22/35] add initializer for variable --- components/core/src/clp/streaming_archive/writer/Archive.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index 197ec50a5..a0be67ce0 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -351,7 +351,7 @@ class Archive { GlobalMetadataDB* m_global_metadata_db; bool m_print_archive_stats_progress; - bool m_use_single_file_archive; + bool m_use_single_file_archive{false}; }; } // namespace clp::streaming_archive::writer From c7361c2c9557b5999eeb497be61db488d10d26e1 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Thu, 16 Jan 2025 19:36:31 +0000 Subject: [PATCH 23/35] attempt to fix lint --- .../single_file_archive/writer.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index d3a093e79..561cd8aa5 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -43,10 +43,9 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. * @throws Propagates `update_offset`'s exceptions. */ -[[nodiscard]] auto get_file_infos( - std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id -) -> std::vector; +[[nodiscard]] auto +get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id) + -> std::vector; /** * Combines file section metadata, multi-file archive metadata, and the number of segments into @@ -118,10 +117,9 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin } } -auto get_file_infos( - std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id -) -> std::vector { +auto +get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id) + -> std::vector { std::vector files; uint64_t offset = 0; From 910e5587c88255345f2a6f296f7f2c9994f6e997 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Mon, 27 Jan 2025 16:11:59 +0000 Subject: [PATCH 24/35] saving previous work --- .../single_file_archive/writer.cpp | 58 +++++++++---------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 561cd8aa5..8bf6f34f0 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -65,39 +65,39 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ /** * Writes single-file archive header. * - * @param archive_writer + * @param single_file_archive_writer * @param packed_metadata_size */ -auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void; +auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void; /** * Writes single-file archive metadata. * - * @param archive_writer + * @param single_file_archive_writer * @param packed_metadata Packed metadata. */ -auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) +auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) -> void; /** * Reads the content of a file and writes it to the single-file archive. * @param file_path - * @param archive_writer + * @param single_file_archive_writer * @throws OperationFailed if reading the file fails. */ -auto write_archive_file(std::string const& file_path, FileWriter& archive_writer) -> void; +auto write_archive_file(std::string const& file_path, FileWriter& single_file_archive_writer) -> void; /** - * Iterates over files in the multi-file archive copying their contents to the single-file archive. - * Skips metadata file since already written in `write_archive_metadata`. + * Iterates over files in the multi-file archive and copies their contents to the single-file archive. + * Skips metadata file, which should be handled by `write_archive_metadata` method. * - * @param archive_writer + * @param single_file_archive_writer * @param multi_file_archive_path * @param next_segment_id * @throws Propagates `update_offset`'s exceptions. */ auto write_archive_files( - FileWriter& archive_writer, + FileWriter& single_file_archive_writer, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void; @@ -170,29 +170,23 @@ auto pack_single_file_archive_metadata( return buf; } -auto write_archive_header(FileWriter& archive_writer, size_t packed_metadata_size) -> void { +auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void { SingleFileArchiveHeader header{ - .magic{}, + .magic = cUnstructuredSfaMagicNumber, .version = cArchiveVersion, .metadata_size = packed_metadata_size, .unused{} }; - static_assert(cUnstructuredSfaMagicNumber.size() == header.magic.size()); - std::memcpy( - &header.magic, - cUnstructuredSfaMagicNumber.data(), - cUnstructuredSfaMagicNumber.size() - ); - archive_writer.write(reinterpret_cast(&header), sizeof(header)); + single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } -auto write_archive_metadata(FileWriter& archive_writer, std::stringstream const& packed_metadata) +auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) -> void { - archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); + single_file_archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); } -auto write_archive_file(std::filesystem::path const& file_path, FileWriter& archive_writer) +auto write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void { FileReader reader(file_path.string()); std::array read_buffer{}; @@ -206,25 +200,25 @@ auto write_archive_file(std::filesystem::path const& file_path, FileWriter& arch if (ErrorCode_Success != error_code) { throw OperationFailed(error_code, __FILENAME__, __LINE__); } - archive_writer.write(read_buffer.data(), num_bytes_read); + single_file_archive_writer.write(read_buffer.data(), num_bytes_read); } } auto write_archive_files( - FileWriter& archive_writer, + FileWriter& single_file_archive_writer, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void { for (auto const& static_archive_file_name : cStaticArchiveFileNames) { std::filesystem::path static_archive_file_path = multi_file_archive_path / static_archive_file_name; - write_archive_file(static_archive_file_path, archive_writer); + write_archive_file(static_archive_file_path, single_file_archive_writer); } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; for (size_t i = 0; i < next_segment_id; ++i) { std::filesystem::path segment_path = segment_dir_path / std::to_string(i); - write_archive_file(segment_path, archive_writer); + write_archive_file(segment_path, single_file_archive_writer); } } } // namespace @@ -234,7 +228,7 @@ auto write_single_file_archive( std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void { - FileWriter archive_writer; + FileWriter single_file_archive_writer; std::filesystem::path single_file_archive_path = multi_file_archive_path.string() + std::string(single_file_archive::cUnstructuredSfaExtension); @@ -243,7 +237,7 @@ auto write_single_file_archive( throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - archive_writer.open( + single_file_archive_writer.open( single_file_archive_path.string(), FileWriter::OpenMode::CREATE_FOR_WRITING ); @@ -254,11 +248,11 @@ auto write_single_file_archive( next_segment_id ); - write_archive_header(archive_writer, packed_metadata.str().size()); - write_archive_metadata(archive_writer, packed_metadata); - write_archive_files(archive_writer, multi_file_archive_path, next_segment_id); + write_archive_header(single_file_archive_writer, packed_metadata.str().size()); + write_archive_metadata(single_file_archive_writer, packed_metadata); + write_archive_files(single_file_archive_writer, multi_file_archive_path, next_segment_id); - archive_writer.close(); + single_file_archive_writer.close(); try { std::filesystem::remove_all(multi_file_archive_path); } catch (std::filesystem::filesystem_error& e) { From a12ac428d17b7335fd54dc430a1c37c975373f93 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 21:00:38 +0000 Subject: [PATCH 25/35] moving to new metadata, sfav2 --- .../core/src/clp/clp/FileCompressor.cpp | 12 ++------ components/core/src/clp/clp/compression.cpp | 6 ++-- .../clp/streaming_archive/ArchiveMetadata.hpp | 1 - .../single_file_archive/Defs.hpp | 14 ++++------ .../single_file_archive/writer.cpp | 28 +------------------ 5 files changed, 12 insertions(+), 49 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index d71f8ac6d..9898602cc 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -243,9 +243,7 @@ void FileCompressor::parse_and_encode_with_heuristic( // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_file_and_archive( archive_user_config, path_for_compression, @@ -339,9 +337,7 @@ bool FileCompressor::try_compressing_as_archive( parent_directories.emplace(file_parent_path); } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive_writer.get_use_single_file_archive()) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_archive(archive_user_config, archive_writer); } @@ -541,9 +537,7 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( } // Split archive/encoded file if necessary before writing the new event - if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts - && false == archive.get_use_single_file_archive()) - { + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_file_and_archive( archive_user_config, path, diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index 05a001851..391432456 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -136,8 +136,7 @@ bool compress( ); } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } @@ -166,8 +165,7 @@ bool compress( file_group_id_comparator); // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries - && false == archive_writer.get_use_single_file_archive()) + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp index 59f993750..dc417432c 100644 --- a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp +++ b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp @@ -11,7 +11,6 @@ #include "Constants.hpp" namespace clp::streaming_archive { - /** * A class to encapsulate metadata directly relating to an archive. */ diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 9d44e64f7..238fa0b63 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -14,21 +14,19 @@ namespace clp::streaming_archive::single_file_archive { using single_file_archive_format_version_t = uint32_t; // Single file archive version. -constexpr uint8_t cArchiveMajorVersion{0}; -constexpr uint8_t cArchiveMinorVersion{1}; -constexpr uint16_t cArchivePatchVersion{1}; -constexpr single_file_archive_format_version_t cArchiveVersion{ - cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion -}; +constexpr uint8_t cVersionMajor{0}; +constexpr uint8_t cVersionMinor{1}; +constexpr uint16_t cVersionPatch{1}; +constexpr single_file_archive_format_version_t cVersion{cVersionMajor << 24 | cVersionMinor << 16 | cVersionPatch}; static constexpr size_t cNumMagicNumberChars{4}; static constexpr std::array cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; static constexpr std::string_view cUnstructuredSfaExtension{".clp"}; -static constexpr size_t cFileSizeWarningThreshold{100L * 1024 * 1024}; -static constexpr size_t cNumStaticFiles{5}; +static constexpr size_t cNumStaticFiles{6}; constexpr std::array cStaticArchiveFileNames{ + cMetadataFileName, cMetadataDBFileName, cLogTypeDictFilename, cLogTypeSegmentIndexFilename, diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 8bf6f34f0..eb2071d59 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -70,15 +70,6 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ */ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void; -/** - * Writes single-file archive metadata. - * - * @param single_file_archive_writer - * @param packed_metadata Packed metadata. - */ -auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) - -> void; - /** * Reads the content of a file and writes it to the single-file archive. * @param file_path @@ -139,17 +130,6 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ // Add sentinel indicating total size of all files. files.emplace_back(FileInfo{"", offset}); - // Decompression of large single-file archives will consume excessive memory since - // single-file archives are not split. - if (offset > cFileSizeWarningThreshold) { - SPDLOG_WARN( - "Single file archive size exceeded {}. " - "The single-file archive format is not intended for large archives, " - " consider using multi-file archive format instead.", - cFileSizeWarningThreshold - ); - } - return files; } @@ -173,7 +153,7 @@ auto pack_single_file_archive_metadata( auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void { SingleFileArchiveHeader header{ .magic = cUnstructuredSfaMagicNumber, - .version = cArchiveVersion, + .version = cVersion, .metadata_size = packed_metadata_size, .unused{} }; @@ -181,11 +161,6 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } -auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) - -> void { - single_file_archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); -} - auto write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void { FileReader reader(file_path.string()); @@ -249,7 +224,6 @@ auto write_single_file_archive( ); write_archive_header(single_file_archive_writer, packed_metadata.str().size()); - write_archive_metadata(single_file_archive_writer, packed_metadata); write_archive_files(single_file_archive_writer, multi_file_archive_path, next_segment_id); single_file_archive_writer.close(); From 4c26177ab021c5bfbe0bcc8140152154c8437970 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 21:25:18 +0000 Subject: [PATCH 26/35] small changes --- components/core/src/clp/clp/compression.cpp | 6 ++---- .../single_file_archive/Defs.hpp | 12 +++++++----- .../single_file_archive/writer.cpp | 18 +++++++++++------- .../single_file_archive/writer.hpp | 4 ++-- tools/yscope-dev-utils | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp index 391432456..32c7d25ae 100644 --- a/components/core/src/clp/clp/compression.cpp +++ b/components/core/src/clp/clp/compression.cpp @@ -136,8 +136,7 @@ bool compress( ); } for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } if (false @@ -165,8 +164,7 @@ bool compress( file_group_id_comparator); // Compress grouped files for (auto const& file_to_compress : grouped_files_to_compress) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) - { + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { split_archive(archive_user_config, archive_writer); } if (false diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 238fa0b63..b817347c6 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -4,10 +4,10 @@ #include #include -#include "../../Defs.h" +#include + #include "../ArchiveMetadata.hpp" #include "../Constants.hpp" -#include "msgpack.hpp" namespace clp::streaming_archive::single_file_archive { @@ -15,9 +15,11 @@ using single_file_archive_format_version_t = uint32_t; // Single file archive version. constexpr uint8_t cVersionMajor{0}; -constexpr uint8_t cVersionMinor{1}; -constexpr uint16_t cVersionPatch{1}; -constexpr single_file_archive_format_version_t cVersion{cVersionMajor << 24 | cVersionMinor << 16 | cVersionPatch}; +constexpr uint8_t cVersionMinor{2}; +constexpr uint16_t cVersionPatch{0}; +constexpr single_file_archive_format_version_t cVersion{ + cVersionMajor << 24 | cVersionMinor << 16 | cVersionPatch +}; static constexpr size_t cNumMagicNumberChars{4}; static constexpr std::array diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index eb2071d59..d93db3475 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -49,7 +49,7 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ /** * Combines file section metadata, multi-file archive metadata, and the number of segments into - * single-file archive metadata. Once combined, serializes the metadata into MsgPack format. + * single-file archive metadata, then serializes the metadata into MsgPack format. * * @param multi_file_archive_metadata * @param multi_file_archive_path @@ -68,7 +68,8 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ * @param single_file_archive_writer * @param packed_metadata_size */ -auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void; +auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) + -> void; /** * Reads the content of a file and writes it to the single-file archive. @@ -76,11 +77,12 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ * @param single_file_archive_writer * @throws OperationFailed if reading the file fails. */ -auto write_archive_file(std::string const& file_path, FileWriter& single_file_archive_writer) -> void; +auto write_archive_file(std::string const& file_path, FileWriter& single_file_archive_writer) + -> void; /** - * Iterates over files in the multi-file archive and copies their contents to the single-file archive. - * Skips metadata file, which should be handled by `write_archive_metadata` method. + * Iterates over files in the multi-file archive and copies their contents to the single-file + * archive. Skips metadata file, which should be handled by `write_archive_metadata` method. * * @param single_file_archive_writer * @param multi_file_archive_path @@ -150,7 +152,8 @@ auto pack_single_file_archive_metadata( return buf; } -auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void { +auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) + -> void { SingleFileArchiveHeader header{ .magic = cUnstructuredSfaMagicNumber, .version = cVersion, @@ -161,7 +164,8 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } -auto write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) +auto +write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void { FileReader reader(file_path.string()); std::array read_buffer{}; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 25ccb0175..276fe99a8 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -33,8 +33,8 @@ class OperationFailed : public TraceableException { }; /** - * Writes header, metadata and archive files in single-file format then - * removes existing multi-file archive. + * Writes the header and archive files into a single-file format then + * removes deletes the multi-file archive. * * @param multi_file_archive_metadata * @param multi_file_archive_path diff --git a/tools/yscope-dev-utils b/tools/yscope-dev-utils index c8b5d5c26..c29580c5d 160000 --- a/tools/yscope-dev-utils +++ b/tools/yscope-dev-utils @@ -1 +1 @@ -Subproject commit c8b5d5c26506a1db8d5ec0f183f4e5fa7417fb66 +Subproject commit c29580c5dac55d46f1457b7ca8a58fe1517c1cd2 From 7916bb93b3713dbb4c41c162752ea9009d7a3719 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 21:34:42 +0000 Subject: [PATCH 27/35] small changes --- .../single_file_archive/Defs.hpp | 3 +-- .../single_file_archive/writer.cpp | 16 ++++++++++++++++ .../single_file_archive/writer.hpp | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index b817347c6..752e04560 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -54,9 +54,8 @@ struct FileInfo { struct SingleFileArchiveMetadata { std::vector archive_files; - ArchiveMetadata archive_metadata; uint64_t num_segments; - MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments); + MSGPACK_DEFINE_MAP(archive_files, num_segments); }; } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index d93db3475..327fee093 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -71,6 +71,15 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_metadata_size) -> void; +/** + * Writes single-file archive metadata. + * + * @param single_file_archive_writer + * @param packed_metadata Packed metadata. + */ +auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) + -> void; + /** * Reads the content of a file and writes it to the single-file archive. * @param file_path @@ -164,6 +173,11 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } +auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) + -> void { + single_file_archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); +} + auto write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void { @@ -228,11 +242,13 @@ auto write_single_file_archive( ); write_archive_header(single_file_archive_writer, packed_metadata.str().size()); + write_archive_metadata(single_file_archive_writer, packed_metadata); write_archive_files(single_file_archive_writer, multi_file_archive_path, next_segment_id); single_file_archive_writer.close(); try { std::filesystem::remove_all(multi_file_archive_path); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } catch (std::filesystem::filesystem_error& e) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 276fe99a8..64e258092 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -33,7 +33,7 @@ class OperationFailed : public TraceableException { }; /** - * Writes the header and archive files into a single-file format then + * Writes the header, metadata and archive files into a single-file format then * removes deletes the multi-file archive. * * @param multi_file_archive_metadata From 7fc5cb94221457483c5b314ad4bcce007e65f966 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 21:39:24 +0000 Subject: [PATCH 28/35] latest changes --- .../single_file_archive/writer.cpp | 8 ++------ .../single_file_archive/writer.hpp | 2 -- .../clp/streaming_archive/writer/Archive.cpp | 18 +++--------------- .../clp/streaming_archive/writer/Archive.hpp | 5 ----- 4 files changed, 5 insertions(+), 28 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 327fee093..1b74ceb95 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -48,16 +48,14 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ -> std::vector; /** - * Combines file section metadata, multi-file archive metadata, and the number of segments into - * single-file archive metadata, then serializes the metadata into MsgPack format. + * Combines file section metadata and the number of segments into single-file archive metadata, + * then serializes the metadata into MsgPack format. * - * @param multi_file_archive_metadata * @param multi_file_archive_path * @param next_segment_id * @return Packed metadata. */ [[nodiscard]] auto pack_single_file_archive_metadata( - ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> std::stringstream; @@ -145,13 +143,11 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ } auto pack_single_file_archive_metadata( - ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> std::stringstream { SingleFileArchiveMetadata single_file_archive{ .archive_files = get_file_infos(multi_file_archive_path, next_segment_id), - .archive_metadata = multi_file_archive_metadata, .num_segments = next_segment_id, }; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 64e258092..8d4176606 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -36,13 +36,11 @@ class OperationFailed : public TraceableException { * Writes the header, metadata and archive files into a single-file format then * removes deletes the multi-file archive. * - * @param multi_file_archive_metadata * @param multi_file_archive_path * @param next_segment_id ID of the next segment to be created in the archive. * @throws OperationFailed if single-file archive path already exists. */ auto write_single_file_archive( - ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void; diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index f88569574..669878af0 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -247,7 +247,9 @@ void Archive::close() { m_metadata_db.close(); if (m_use_single_file_archive) { - create_single_file_archive(); + single_file_archive::write_single_file_archive( + m_path, + m_next_segment_id ); } m_creator_id_as_string.clear(); @@ -657,20 +659,6 @@ void Archive::update_metadata() { } } -void Archive::create_single_file_archive() { - if (false == m_local_metadata.has_value()) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - auto const& multi_file_archive_metadata = m_local_metadata.value(); - - clp::streaming_archive::single_file_archive::write_single_file_archive( - multi_file_archive_metadata, - m_path, - m_next_segment_id - ); -} - // Explicitly declare template specializations so that we can define the template methods in this // file template void Archive::write_log_event_ir( diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index a0be67ce0..082a6bf12 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -284,11 +284,6 @@ class Archive { */ void update_metadata(); - /** - * Writes archive to disk in single-file format then removes existing multi-file archive. - */ - auto create_single_file_archive() -> void; - // Variables boost::uuids::uuid m_id; std::string m_id_as_string; From 0ae482262f3f3d3e00ef596eae72c88b36d6f60b Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 21:45:35 +0000 Subject: [PATCH 29/35] small --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 1b74ceb95..77864f544 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -232,7 +232,6 @@ auto write_single_file_archive( ); auto const packed_metadata = pack_single_file_archive_metadata( - multi_file_archive_metadata, multi_file_archive_path, next_segment_id ); From 1b7d73e526e2242c50a97a5da945166176883de5 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 22:02:06 +0000 Subject: [PATCH 30/35] small changes --- .../single_file_archive/writer.cpp | 22 +++++++++---------- .../single_file_archive/writer.hpp | 3 +-- .../clp/streaming_archive/writer/Archive.cpp | 4 +--- tools/yscope-dev-utils | 2 +- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 77864f544..823f9acdd 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -75,8 +75,10 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ * @param single_file_archive_writer * @param packed_metadata Packed metadata. */ -auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) - -> void; +auto write_archive_metadata( + FileWriter& single_file_archive_writer, + std::stringstream const& packed_metadata +) -> void; /** * Reads the content of a file and writes it to the single-file archive. @@ -89,7 +91,7 @@ auto write_archive_file(std::string const& file_path, FileWriter& single_file_ar /** * Iterates over files in the multi-file archive and copies their contents to the single-file - * archive. Skips metadata file, which should be handled by `write_archive_metadata` method. + * archive. * * @param single_file_archive_writer * @param multi_file_archive_path @@ -169,8 +171,10 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } -auto write_archive_metadata(FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata) - -> void { +auto write_archive_metadata( + FileWriter& single_file_archive_writer, + std::stringstream const& packed_metadata +) -> void { single_file_archive_writer.write(packed_metadata.str().data(), packed_metadata.str().size()); } @@ -213,7 +217,6 @@ auto write_archive_files( } // namespace auto write_single_file_archive( - ArchiveMetadata const& multi_file_archive_metadata, std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id ) -> void { @@ -231,10 +234,8 @@ auto write_single_file_archive( FileWriter::OpenMode::CREATE_FOR_WRITING ); - auto const packed_metadata = pack_single_file_archive_metadata( - multi_file_archive_path, - next_segment_id - ); + auto const packed_metadata + = pack_single_file_archive_metadata(multi_file_archive_path, next_segment_id); write_archive_header(single_file_archive_writer, packed_metadata.str().size()); write_archive_metadata(single_file_archive_writer, packed_metadata); @@ -243,7 +244,6 @@ auto write_single_file_archive( single_file_archive_writer.close(); try { std::filesystem::remove_all(multi_file_archive_path); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } catch (std::filesystem::filesystem_error& e) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 8d4176606..fefb61cd4 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -33,8 +33,7 @@ class OperationFailed : public TraceableException { }; /** - * Writes the header, metadata and archive files into a single-file format then - * removes deletes the multi-file archive. + * Writes a single-file archive then removes deletes the multi-file archive. * * @param multi_file_archive_path * @param next_segment_id ID of the next segment to be created in the archive. diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 669878af0..dc67c32f5 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -247,9 +247,7 @@ void Archive::close() { m_metadata_db.close(); if (m_use_single_file_archive) { - single_file_archive::write_single_file_archive( - m_path, - m_next_segment_id ); + single_file_archive::write_single_file_archive(m_path, m_next_segment_id); } m_creator_id_as_string.clear(); diff --git a/tools/yscope-dev-utils b/tools/yscope-dev-utils index c29580c5d..c8b5d5c26 160000 --- a/tools/yscope-dev-utils +++ b/tools/yscope-dev-utils @@ -1 +1 @@ -Subproject commit c29580c5dac55d46f1457b7ca8a58fe1517c1cd2 +Subproject commit c8b5d5c26506a1db8d5ec0f183f4e5fa7417fb66 From 7b685449cf743cdd1bb9ad1873392850622247ae Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Sun, 9 Feb 2025 22:07:08 +0000 Subject: [PATCH 31/35] small changes --- .../clp/streaming_archive/single_file_archive/writer.hpp | 2 +- .../core/src/clp/streaming_archive/writer/Archive.cpp | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index fefb61cd4..1f723cb0c 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -33,7 +33,7 @@ class OperationFailed : public TraceableException { }; /** - * Writes a single-file archive then removes deletes the multi-file archive. + * Writes a single-file archive then deletes the multi-file archive. * * @param multi_file_archive_path * @param next_segment_id ID of the next segment to be created in the archive. diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index dc67c32f5..49fe364a9 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -13,9 +13,7 @@ #include #include #include -#include -#include "../../Defs.h" #include "../../EncodedVariableInterpreter.hpp" #include "../../ir/types.hpp" #include "../../spdlog_with_specializations.hpp" @@ -335,9 +333,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { m_old_ts_pattern = timestamp_pattern; } } - if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts - && false == m_use_single_file_archive) - { + if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { split_file_and_archive( m_archive_user_config, m_path_for_compression, From 05377b85a6582bb9d16dde2046442393d4b74720 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Mon, 10 Feb 2025 22:28:19 +0000 Subject: [PATCH 32/35] haiqi review --- .../single_file_archive/Defs.hpp | 1 + .../single_file_archive/writer.cpp | 77 +++++++++---------- .../single_file_archive/writer.hpp | 9 +-- .../clp/streaming_archive/writer/Archive.cpp | 5 +- 4 files changed, 46 insertions(+), 46 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index 752e04560..ec803f826 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -25,6 +25,7 @@ static constexpr size_t cNumMagicNumberChars{4}; static constexpr std::array cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; static constexpr std::string_view cUnstructuredSfaExtension{".clp"}; +static constexpr std::string_view cFileInfoSentinelName{""}; static constexpr size_t cNumStaticFiles{6}; constexpr std::array cStaticArchiveFileNames{ diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 823f9acdd..b3ff846a1 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -39,12 +39,12 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin * of a list of file names and their corresponding starting offsets. * * @param multi_file_archive_path - * @param next_segment_id + * @param num_segments * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @throws Propagates `update_offset`'s exceptions. + * @throws Propagates `get_file_size_and_update_offset`'s exceptions. */ [[nodiscard]] auto -get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id) +get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) -> std::vector; /** @@ -52,12 +52,12 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ * then serializes the metadata into MsgPack format. * * @param multi_file_archive_path - * @param next_segment_id + * @param num_segments * @return Packed metadata. */ [[nodiscard]] auto pack_single_file_archive_metadata( std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id + size_t num_segments ) -> std::stringstream; /** @@ -70,12 +70,12 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ -> void; /** - * Writes single-file archive metadata. + * Writes packed single-file archive metadata. * * @param single_file_archive_writer * @param packed_metadata Packed metadata. */ -auto write_archive_metadata( +auto write_packed_archive_metadata( FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata ) -> void; @@ -86,7 +86,8 @@ auto write_archive_metadata( * @param single_file_archive_writer * @throws OperationFailed if reading the file fails. */ -auto write_archive_file(std::string const& file_path, FileWriter& single_file_archive_writer) +auto +write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void; /** @@ -95,20 +96,18 @@ auto write_archive_file(std::string const& file_path, FileWriter& single_file_ar * * @param single_file_archive_writer * @param multi_file_archive_path - * @param next_segment_id - * @throws Propagates `update_offset`'s exceptions. + * @param num_segments */ auto write_archive_files( FileWriter& single_file_archive_writer, std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id + size_t num_segments ) -> void; auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) -> void { try { - auto size = std::filesystem::file_size(file_path); - offset += size; + offset += std::filesystem::file_size(file_path); } catch (std::filesystem::filesystem_error const& e) { throw OperationFailed( ErrorCode_Failure, @@ -120,10 +119,10 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin } auto -get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_t next_segment_id) +get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) -> std::vector { std::vector files; - uint64_t offset = 0; + uint64_t offset{}; for (auto const& static_archive_file_name : cStaticArchiveFileNames) { files.emplace_back(FileInfo{std::string(static_archive_file_name), offset}); @@ -132,25 +131,25 @@ get_file_infos(std::filesystem::path const& multi_file_archive_path, segment_id_ std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; - for (size_t i = 0; i < next_segment_id; ++i) { + for (size_t i = 0; i < num_segments; ++i) { auto const segment_id = std::to_string(i); files.emplace_back(FileInfo{segment_id, offset}); get_file_size_and_update_offset(segment_dir_path / segment_id, offset); } // Add sentinel indicating total size of all files. - files.emplace_back(FileInfo{"", offset}); + files.emplace_back(FileInfo{std::string(cFileInfoSentinelName), offset}); return files; } auto pack_single_file_archive_metadata( std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id + size_t num_segments ) -> std::stringstream { SingleFileArchiveMetadata single_file_archive{ - .archive_files = get_file_infos(multi_file_archive_path, next_segment_id), - .num_segments = next_segment_id, + .archive_files = get_archive_file_infos(multi_file_archive_path, num_segments), + .num_segments = num_segments, }; std::stringstream buf; @@ -171,7 +170,7 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } -auto write_archive_metadata( +auto write_packed_archive_metadata( FileWriter& single_file_archive_writer, std::stringstream const& packed_metadata ) -> void { @@ -185,8 +184,7 @@ write_archive_file(std::filesystem::path const& file_path, FileWriter& single_fi std::array read_buffer{}; while (true) { size_t num_bytes_read{}; - ErrorCode const error_code - = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); + auto const error_code = reader.try_read(read_buffer.data(), cReadBlockSize, num_bytes_read); if (ErrorCode_EndOfFile == error_code) { break; } @@ -200,30 +198,29 @@ write_archive_file(std::filesystem::path const& file_path, FileWriter& single_fi auto write_archive_files( FileWriter& single_file_archive_writer, std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id + segment_id_t num_segments ) -> void { for (auto const& static_archive_file_name : cStaticArchiveFileNames) { - std::filesystem::path static_archive_file_path - = multi_file_archive_path / static_archive_file_name; + auto const static_archive_file_path{multi_file_archive_path / static_archive_file_name}; write_archive_file(static_archive_file_path, single_file_archive_writer); } - std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; - for (size_t i = 0; i < next_segment_id; ++i) { - std::filesystem::path segment_path = segment_dir_path / std::to_string(i); + auto const segment_dir_path{multi_file_archive_path / cSegmentsDirname}; + for (size_t i = 0; i < num_segments; ++i) { + auto const segment_path{segment_dir_path / std::to_string(i)}; write_archive_file(segment_path, single_file_archive_writer); } } } // namespace -auto write_single_file_archive( - std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id -) -> void { +auto +write_single_file_archive(std::filesystem::path const& multi_file_archive_path, size_t num_segments) + -> void { FileWriter single_file_archive_writer; - std::filesystem::path single_file_archive_path - = multi_file_archive_path.string() - + std::string(single_file_archive::cUnstructuredSfaExtension); + std::filesystem::path single_file_archive_path{ + multi_file_archive_path.string() + + std::string(single_file_archive::cUnstructuredSfaExtension) + }; if (std::filesystem::exists(single_file_archive_path)) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); @@ -235,17 +232,17 @@ auto write_single_file_archive( ); auto const packed_metadata - = pack_single_file_archive_metadata(multi_file_archive_path, next_segment_id); + = pack_single_file_archive_metadata(multi_file_archive_path, num_segments); write_archive_header(single_file_archive_writer, packed_metadata.str().size()); - write_archive_metadata(single_file_archive_writer, packed_metadata); - write_archive_files(single_file_archive_writer, multi_file_archive_path, next_segment_id); + write_packed_archive_metadata(single_file_archive_writer, packed_metadata); + write_archive_files(single_file_archive_writer, multi_file_archive_path, num_segments); single_file_archive_writer.close(); try { std::filesystem::remove_all(multi_file_archive_path); } catch (std::filesystem::filesystem_error& e) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + SPDLOG_WARN("Failed to delete multi-file archive: {}", e.what()); } } } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp index 1f723cb0c..85f234ca4 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.hpp @@ -36,13 +36,12 @@ class OperationFailed : public TraceableException { * Writes a single-file archive then deletes the multi-file archive. * * @param multi_file_archive_path - * @param next_segment_id ID of the next segment to be created in the archive. + * @param num_segments * @throws OperationFailed if single-file archive path already exists. */ -auto write_single_file_archive( - std::filesystem::path const& multi_file_archive_path, - segment_id_t next_segment_id -) -> void; +auto +write_single_file_archive(std::filesystem::path const& multi_file_archive_path, size_t num_segments) + -> void; } // namespace clp::streaming_archive::single_file_archive diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 49fe364a9..34b8f9b48 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -245,7 +245,10 @@ void Archive::close() { m_metadata_db.close(); if (m_use_single_file_archive) { - single_file_archive::write_single_file_archive(m_path, m_next_segment_id); + single_file_archive::write_single_file_archive( + m_path, + static_cast(m_next_segment_id) + ); } m_creator_id_as_string.clear(); From e0fcefec1e020c57ee66d480cde171fde2d050ff Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Mon, 10 Feb 2025 22:39:38 +0000 Subject: [PATCH 33/35] small fix --- .../src/clp/streaming_archive/single_file_archive/writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index b3ff846a1..8fc4bd0ee 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -198,7 +198,7 @@ write_archive_file(std::filesystem::path const& file_path, FileWriter& single_fi auto write_archive_files( FileWriter& single_file_archive_writer, std::filesystem::path const& multi_file_archive_path, - segment_id_t num_segments + size_t num_segments ) -> void { for (auto const& static_archive_file_name : cStaticArchiveFileNames) { auto const static_archive_file_path{multi_file_archive_path / static_archive_file_name}; From d8a3c706120cf5d8ce1e3eaaa757a0c9fcd06587 Mon Sep 17 00:00:00 2001 From: Dave Marco Date: Tue, 11 Feb 2025 15:28:11 +0000 Subject: [PATCH 34/35] haiqi review remove offset function --- .../single_file_archive/writer.cpp | 30 ++----------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 8fc4bd0ee..11de02b37 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -24,16 +24,6 @@ namespace clp::streaming_archive::single_file_archive { namespace { constexpr size_t cReadBlockSize = 4096; -/** - * Gets the size of a file specified by `file_path` and adds it to file section `offset`. - * @param file_path - * @param[out] offset File section offset for the single-file archive. The returned offset - * represents the starting position of the next file in single-file archive. - * @throws OperationFailed if error getting file size. - */ -auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) - -> void; - /** * Generates metadata for the file section of a single-file archive. The metadata consists * of a list of file names and their corresponding starting offsets. @@ -41,7 +31,7 @@ auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uin * @param multi_file_archive_path * @param num_segments * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @throws Propagates `get_file_size_and_update_offset`'s exceptions. + * @throws `std::filesystem::filesystem_error` if `stat` on archive file fails. */ [[nodiscard]] auto get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) @@ -104,20 +94,6 @@ auto write_archive_files( size_t num_segments ) -> void; -auto get_file_size_and_update_offset(std::filesystem::path const& file_path, uint64_t& offset) - -> void { - try { - offset += std::filesystem::file_size(file_path); - } catch (std::filesystem::filesystem_error const& e) { - throw OperationFailed( - ErrorCode_Failure, - __FILENAME__, - __LINE__, - fmt::format("Failed to get file size: {}", e.what()) - ); - } -} - auto get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) -> std::vector { @@ -126,7 +102,7 @@ get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, siz for (auto const& static_archive_file_name : cStaticArchiveFileNames) { files.emplace_back(FileInfo{std::string(static_archive_file_name), offset}); - get_file_size_and_update_offset(multi_file_archive_path / static_archive_file_name, offset); + offset += std::filesystem::file_size(multi_file_archive_path / static_archive_file_name); } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; @@ -134,7 +110,7 @@ get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, siz for (size_t i = 0; i < num_segments; ++i) { auto const segment_id = std::to_string(i); files.emplace_back(FileInfo{segment_id, offset}); - get_file_size_and_update_offset(segment_dir_path / segment_id, offset); + offset += std::filesystem::file_size(segment_dir_path / segment_id); } // Add sentinel indicating total size of all files. From 5a0ec32be1fcefc7f6b7def6c59a5f75464e849c Mon Sep 17 00:00:00 2001 From: davemarco <83603688+davemarco@users.noreply.github.com> Date: Tue, 18 Feb 2025 10:40:21 -0500 Subject: [PATCH 35/35] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../core/src/clp/clp/CommandLineArguments.cpp | 2 +- .../single_file_archive/Defs.hpp | 6 +++--- .../single_file_archive/writer.cpp | 19 ++++++++----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index 266d1be62..679734b11 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -376,7 +376,7 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { )( "single-file-archive", po::bool_switch(&m_single_file_archive), - "Output archive as a single-file archive" + "Output archive as a single-file" ); po::options_description all_compression_options; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp index ec803f826..6866bf676 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp @@ -10,10 +10,9 @@ #include "../Constants.hpp" namespace clp::streaming_archive::single_file_archive { - using single_file_archive_format_version_t = uint32_t; -// Single file archive version. +// Single file archive version constexpr uint8_t cVersionMajor{0}; constexpr uint8_t cVersionMinor{2}; constexpr uint16_t cVersionPatch{0}; @@ -49,7 +48,8 @@ struct __attribute__((packed)) SingleFileArchiveHeader { struct FileInfo { std::string name; uint64_t offset; - // Variables are renamed when serialized to match single-file archive specification. + + // Variables are renamed when serialized to match single-file archive specification MSGPACK_DEFINE_MAP(MSGPACK_NVP("n", name), MSGPACK_NVP("o", offset)); }; diff --git a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp index 11de02b37..b37b978f7 100644 --- a/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp +++ b/components/core/src/clp/streaming_archive/single_file_archive/writer.cpp @@ -22,7 +22,7 @@ namespace clp::streaming_archive::single_file_archive { namespace { -constexpr size_t cReadBlockSize = 4096; +constexpr size_t cReadBlockSize{4096}; /** * Generates metadata for the file section of a single-file archive. The metadata consists @@ -31,7 +31,7 @@ constexpr size_t cReadBlockSize = 4096; * @param multi_file_archive_path * @param num_segments * @return Vector containing a `FileInfo` struct for every file in the multi-file archive. - * @throws `std::filesystem::filesystem_error` if `stat` on archive file fails. + * @throws `std::filesystem::filesystem_error` if `stat` on an archive file fails. */ [[nodiscard]] auto get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) @@ -39,7 +39,7 @@ get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, siz /** * Combines file section metadata and the number of segments into single-file archive metadata, - * then serializes the metadata into MsgPack format. + * then serializes the metadata into the MessagePack format. * * @param multi_file_archive_path * @param num_segments @@ -71,7 +71,7 @@ auto write_packed_archive_metadata( ) -> void; /** - * Reads the content of a file and writes it to the single-file archive. + * Reads the content of a file and writes it to the given file writer. * @param file_path * @param single_file_archive_writer * @throws OperationFailed if reading the file fails. @@ -98,22 +98,21 @@ auto get_archive_file_infos(std::filesystem::path const& multi_file_archive_path, size_t num_segments) -> std::vector { std::vector files; - uint64_t offset{}; + uint64_t offset{}; for (auto const& static_archive_file_name : cStaticArchiveFileNames) { files.emplace_back(FileInfo{std::string(static_archive_file_name), offset}); offset += std::filesystem::file_size(multi_file_archive_path / static_archive_file_name); } std::filesystem::path segment_dir_path = multi_file_archive_path / cSegmentsDirname; - for (size_t i = 0; i < num_segments; ++i) { auto const segment_id = std::to_string(i); files.emplace_back(FileInfo{segment_id, offset}); offset += std::filesystem::file_size(segment_dir_path / segment_id); } - // Add sentinel indicating total size of all files. + // Add sentinel indicating total size of all files files.emplace_back(FileInfo{std::string(cFileInfoSentinelName), offset}); return files; @@ -142,7 +141,6 @@ auto write_archive_header(FileWriter& single_file_archive_writer, size_t packed_ .metadata_size = packed_metadata_size, .unused{} }; - single_file_archive_writer.write(reinterpret_cast(&header), sizeof(header)); } @@ -156,7 +154,7 @@ auto write_packed_archive_metadata( auto write_archive_file(std::filesystem::path const& file_path, FileWriter& single_file_archive_writer) -> void { - FileReader reader(file_path.string()); + FileReader reader{file_path.string()}; std::array read_buffer{}; while (true) { size_t num_bytes_read{}; @@ -197,7 +195,6 @@ write_single_file_archive(std::filesystem::path const& multi_file_archive_path, multi_file_archive_path.string() + std::string(single_file_archive::cUnstructuredSfaExtension) }; - if (std::filesystem::exists(single_file_archive_path)) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } @@ -209,12 +206,12 @@ write_single_file_archive(std::filesystem::path const& multi_file_archive_path, auto const packed_metadata = pack_single_file_archive_metadata(multi_file_archive_path, num_segments); - write_archive_header(single_file_archive_writer, packed_metadata.str().size()); write_packed_archive_metadata(single_file_archive_writer, packed_metadata); write_archive_files(single_file_archive_writer, multi_file_archive_path, num_segments); single_file_archive_writer.close(); + try { std::filesystem::remove_all(multi_file_archive_path); } catch (std::filesystem::filesystem_error& e) {