From 11443e62dea2a1820a63d05075bfde6f4c235778 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 20 Jun 2024 20:53:09 -0400 Subject: [PATCH 1/5] Fix libarchive file reader --- .../core/src/clp/LibarchiveFileReader.cpp | 44 ++++++++++--------- .../core/src/clp/LibarchiveFileReader.hpp | 38 +++++++--------- .../core/src/clp/clp/FileCompressor.cpp | 16 ++++--- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/components/core/src/clp/LibarchiveFileReader.cpp b/components/core/src/clp/LibarchiveFileReader.cpp index 6ba0e980d..e71db237e 100644 --- a/components/core/src/clp/LibarchiveFileReader.cpp +++ b/components/core/src/clp/LibarchiveFileReader.cpp @@ -38,7 +38,7 @@ LibarchiveFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_ while (true) { // Read a data block if necessary if (nullptr == m_data_block) { - auto error_code = read_next_data_block(); + auto error_code = read_next_nonempty_data_block(); if (ErrorCode_Success != error_code) { if (ErrorCode_EndOfFile == error_code && num_bytes_read > 0) { return ErrorCode_Success; @@ -111,7 +111,7 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter( while (true) { // Read a data block if necessary if (nullptr == m_data_block) { - auto error_code = read_next_data_block(); + auto error_code = read_next_nonempty_data_block(); if (ErrorCode_Success != error_code) { if (ErrorCode_EndOfFile == error_code && str.length() > original_str_length) { // NOTE: At this point, we haven't found delim, so return directly without @@ -206,7 +206,7 @@ void LibarchiveFileReader::close() { m_pos_in_file = 0; } -ErrorCode LibarchiveFileReader::try_load_data_block() { +ErrorCode LibarchiveFileReader::try_load_nonempty_data_block() { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -217,10 +217,12 @@ ErrorCode LibarchiveFileReader::try_load_data_block() { if (m_data_block != nullptr) { return ErrorCode_Success; } - return read_next_data_block(); + return read_next_nonempty_data_block(); } -void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size) const { +auto LibarchiveFileReader::peek_buffered_data() const -> std::span { + char const* buf{nullptr}; + size_t buf_size{}; if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -245,21 +247,25 @@ void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size buf_size = m_data_block_length - m_pos_in_data_block; buf = static_cast(m_data_block); } + return {buf, buf_size}; } -ErrorCode LibarchiveFileReader::read_next_data_block() { - auto return_value = archive_read_data_block( - m_archive, - &m_data_block, - &m_data_block_length, - &m_data_block_pos_in_file - ); - if (ARCHIVE_OK != return_value) { - m_data_block = nullptr; - if (ARCHIVE_EOF == return_value) { - m_reached_eof = true; - return ErrorCode_EndOfFile; - } else { +ErrorCode LibarchiveFileReader::read_next_nonempty_data_block() { + m_data_block_length = 0; + m_pos_in_data_block = 0; + while (0 == m_data_block_length) { + auto return_value = archive_read_data_block( + m_archive, + &m_data_block, + &m_data_block_length, + &m_data_block_pos_in_file + ); + if (ARCHIVE_OK != return_value) { + m_data_block = nullptr; + if (ARCHIVE_EOF == return_value) { + m_reached_eof = true; + return ErrorCode_EndOfFile; + } SPDLOG_DEBUG( "Failed to read data block from libarchive - {}", archive_error_string(m_archive) @@ -268,8 +274,6 @@ ErrorCode LibarchiveFileReader::read_next_data_block() { } } - m_pos_in_data_block = 0; - return ErrorCode_Success; } } // namespace clp diff --git a/components/core/src/clp/LibarchiveFileReader.hpp b/components/core/src/clp/LibarchiveFileReader.hpp index 2b7d17fad..b29e8415c 100644 --- a/components/core/src/clp/LibarchiveFileReader.hpp +++ b/components/core/src/clp/LibarchiveFileReader.hpp @@ -2,6 +2,7 @@ #define CLP_LIBARCHIVEFILEREADER_HPP #include +#include #include #include @@ -30,13 +31,7 @@ class LibarchiveFileReader : public ReaderInterface { }; // Constructors - LibarchiveFileReader() - : m_archive(nullptr), - m_archive_entry(nullptr), - m_data_block(nullptr), - m_reached_eof(false), - m_data_block_pos_in_file(0), - m_pos_in_file(0) {} + LibarchiveFileReader() = default; // Methods implementing the ReaderInterface /** @@ -89,43 +84,42 @@ class LibarchiveFileReader : public ReaderInterface { void close(); /** - * Tries to the load a data block from the file if none is loaded + * Tries to the load a nonempty data block from the file if none is loaded * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_load_data_block(); + [[nodiscard]] ErrorCode try_load_nonempty_data_block(); /** * Peeks the remaining buffered content without advancing the read head. * * NOTE: Any subsequent read or seek operations may invalidate the returned buffer. - * @param buf Returns a pointer to any buffered data - * @param buf_size Returns the number of bytes in the buffer + * @return The view of the buffered data */ - void peek_buffered_data(char const*& buf, size_t& buf_size) const; + [[nodiscard]] auto peek_buffered_data() const -> std::span; private: // Methods /** - * Reads next data block from the archive + * Reads next nonempty data block from the archive * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - ErrorCode read_next_data_block(); + ErrorCode read_next_nonempty_data_block(); // Variables - struct archive* m_archive; + struct archive* m_archive{nullptr}; - struct archive_entry* m_archive_entry; - la_int64_t m_data_block_pos_in_file; - void const* m_data_block; - size_t m_data_block_length; - la_int64_t m_pos_in_data_block; - bool m_reached_eof; + struct archive_entry* m_archive_entry{nullptr}; + la_int64_t m_data_block_pos_in_file{0}; + void const* m_data_block{nullptr}; + size_t m_data_block_length{0}; + la_int64_t m_pos_in_data_block{0}; + bool m_reached_eof{false}; - size_t m_pos_in_file; + size_t m_pos_in_file{0}; // Nulls for peek std::array m_nulls_for_peek{0}; diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 4100816f5..949a677be 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -343,7 +344,7 @@ bool FileCompressor::try_compressing_as_archive( m_libarchive_reader.open_file_reader(m_libarchive_file_reader); // Check that file is UTF-8 encoded - if (auto error_code = m_libarchive_file_reader.try_load_data_block(); + if (auto error_code = m_libarchive_file_reader.try_load_nonempty_data_block(); ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { SPDLOG_ERROR( @@ -355,12 +356,10 @@ bool FileCompressor::try_compressing_as_archive( succeeded = false; continue; } - char const* utf8_validation_buf{nullptr}; - size_t peek_size{0}; - m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size); + auto const utf8_validation_buf{m_libarchive_file_reader.peek_buffered_data()}; string file_path{m_libarchive_reader.get_path()}; - auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen); - if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) { + auto utf8_validation_buf_len = std::min(utf8_validation_buf.size(), cUtfMaxValidationLen); + if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf.data())) { auto boost_path_for_compression = parent_boost_path / file_path; if (use_heuristic) { parse_and_encode_with_heuristic( @@ -383,7 +382,10 @@ bool FileCompressor::try_compressing_as_archive( m_libarchive_file_reader ); } - } else if (has_ir_stream_magic_number({utf8_validation_buf, peek_size})) { + } else if (has_ir_stream_magic_number( + {utf8_validation_buf.data(), utf8_validation_buf.size()} + )) + { // Remove .clp suffix if found static constexpr char cIrStreamExtension[] = ".clp"; if (boost::iends_with(file_path, cIrStreamExtension)) { From bcf13bd6773596c7b038226e58a09e9c4c5f733b Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 20 Jun 2024 20:56:27 -0400 Subject: [PATCH 2/5] Minor refactoring --- components/core/src/clp/LibarchiveFileReader.cpp | 4 ++-- components/core/src/clp/LibarchiveFileReader.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/LibarchiveFileReader.cpp b/components/core/src/clp/LibarchiveFileReader.cpp index e71db237e..24ead0bc7 100644 --- a/components/core/src/clp/LibarchiveFileReader.cpp +++ b/components/core/src/clp/LibarchiveFileReader.cpp @@ -206,7 +206,7 @@ void LibarchiveFileReader::close() { m_pos_in_file = 0; } -ErrorCode LibarchiveFileReader::try_load_nonempty_data_block() { +auto LibarchiveFileReader::try_load_nonempty_data_block() -> ErrorCode { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -250,7 +250,7 @@ auto LibarchiveFileReader::peek_buffered_data() const -> std::span { return {buf, buf_size}; } -ErrorCode LibarchiveFileReader::read_next_nonempty_data_block() { +auto LibarchiveFileReader::read_next_nonempty_data_block() -> ErrorCode { m_data_block_length = 0; m_pos_in_data_block = 0; while (0 == m_data_block_length) { diff --git a/components/core/src/clp/LibarchiveFileReader.hpp b/components/core/src/clp/LibarchiveFileReader.hpp index b29e8415c..f357ecc1b 100644 --- a/components/core/src/clp/LibarchiveFileReader.hpp +++ b/components/core/src/clp/LibarchiveFileReader.hpp @@ -89,7 +89,7 @@ class LibarchiveFileReader : public ReaderInterface { * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_load_nonempty_data_block(); + [[nodiscard]] auto try_load_nonempty_data_block() -> ErrorCode; /** * Peeks the remaining buffered content without advancing the read head. @@ -107,7 +107,7 @@ class LibarchiveFileReader : public ReaderInterface { * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - ErrorCode read_next_nonempty_data_block(); + [[nodiscard]] auto read_next_nonempty_data_block() -> ErrorCode; // Variables struct archive* m_archive{nullptr}; From 7ad6884a5a0cf5542a85d243dbaf267a24c5a05b Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 05:55:41 -0400 Subject: [PATCH 3/5] Code review --- .../core/src/clp/LibarchiveFileReader.cpp | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/components/core/src/clp/LibarchiveFileReader.cpp b/components/core/src/clp/LibarchiveFileReader.cpp index 24ead0bc7..25e0eba7f 100644 --- a/components/core/src/clp/LibarchiveFileReader.cpp +++ b/components/core/src/clp/LibarchiveFileReader.cpp @@ -1,6 +1,7 @@ #include "LibarchiveFileReader.hpp" #include +#include #include "spdlog_with_specializations.hpp" @@ -221,8 +222,6 @@ auto LibarchiveFileReader::try_load_nonempty_data_block() -> ErrorCode { } auto LibarchiveFileReader::peek_buffered_data() const -> std::span { - char const* buf{nullptr}; - size_t buf_size{}; if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -230,24 +229,22 @@ auto LibarchiveFileReader::peek_buffered_data() const -> std::span { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } if (nullptr == m_data_block) { - buf_size = 0; - buf = nullptr; - } else if (m_pos_in_file < m_data_block_pos_in_file) { + return {static_cast(nullptr), 0}; + } + if (m_pos_in_file < m_data_block_pos_in_file) { // Position in the file is before the current data block, so we return nulls corresponding // to the sparse bytes before the data block // NOTE: We don't return ALL sparse bytes before the data block since that might require // allocating more bytes, violating the const-ness of this method. Since peek is a // best-effort method, this should be sufficient for most callers. - buf = m_nulls_for_peek.data(); - buf_size = std::min( - m_nulls_for_peek.size(), - static_cast(m_data_block_pos_in_file - m_pos_in_file) - ); - } else { - buf_size = m_data_block_length - m_pos_in_data_block; - buf = static_cast(m_data_block); + return {m_nulls_for_peek.data(), + std::min( + m_nulls_for_peek.size(), + static_cast(m_data_block_pos_in_file - m_pos_in_file) + )}; } - return {buf, buf_size}; + + return {static_cast(m_data_block), m_data_block_length - m_pos_in_data_block}; } auto LibarchiveFileReader::read_next_nonempty_data_block() -> ErrorCode { From be6eecd098afd235f558f7a1851fafa84cfefb8f Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 06:08:28 -0400 Subject: [PATCH 4/5] Fix failed build --- components/core/src/clp/LibarchiveFileReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/LibarchiveFileReader.cpp b/components/core/src/clp/LibarchiveFileReader.cpp index 25e0eba7f..404b2f722 100644 --- a/components/core/src/clp/LibarchiveFileReader.cpp +++ b/components/core/src/clp/LibarchiveFileReader.cpp @@ -229,7 +229,7 @@ auto LibarchiveFileReader::peek_buffered_data() const -> std::span { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } if (nullptr == m_data_block) { - return {static_cast(nullptr), 0}; + return {static_cast(m_data_block), 0}; } if (m_pos_in_file < m_data_block_pos_in_file) { // Position in the file is before the current data block, so we return nulls corresponding From 0899f455d299243ce8aa5e01a8a73b085df273fc Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 07:07:50 -0400 Subject: [PATCH 5/5] ... --- components/core/src/clp/LibarchiveFileReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/LibarchiveFileReader.cpp b/components/core/src/clp/LibarchiveFileReader.cpp index 404b2f722..ff2059634 100644 --- a/components/core/src/clp/LibarchiveFileReader.cpp +++ b/components/core/src/clp/LibarchiveFileReader.cpp @@ -229,7 +229,7 @@ auto LibarchiveFileReader::peek_buffered_data() const -> std::span { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } if (nullptr == m_data_block) { - return {static_cast(m_data_block), 0}; + return {}; } if (m_pos_in_file < m_data_block_pos_in_file) { // Position in the file is before the current data block, so we return nulls corresponding