From dbbfd2717ef46d6c05ee24dea37aed0df2c43891 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Apr 2024 13:56:05 +0200 Subject: [PATCH] Add dictionary encoded layout (#56) --- .gitignore | 3 + include/sparrow/array_data.hpp | 1 + include/sparrow/dictionary_encoded_layout.hpp | 344 ++++++++++++++++++ include/sparrow/mp_utils.hpp | 15 + .../sparrow/variable_size_binary_layout.hpp | 21 +- test/CMakeLists.txt | 1 + test/test_dictionary_encoded_layout.cpp | 203 +++++++++++ 7 files changed, 569 insertions(+), 19 deletions(-) create mode 100644 include/sparrow/dictionary_encoded_layout.hpp create mode 100644 test/test_dictionary_encoded_layout.cpp diff --git a/.gitignore b/.gitignore index 4e02d907..aeb2cfc0 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,6 @@ # CTest directory /Testing/ + +# Clangd cache +.cache diff --git a/include/sparrow/array_data.hpp b/include/sparrow/array_data.hpp index c4e3a88e..38e9a9ed 100644 --- a/include/sparrow/array_data.hpp +++ b/include/sparrow/array_data.hpp @@ -48,6 +48,7 @@ namespace sparrow // Other buffers std::vector buffers; std::vector child_data; + std::shared_ptr dictionary; }; /** diff --git a/include/sparrow/dictionary_encoded_layout.hpp b/include/sparrow/dictionary_encoded_layout.hpp new file mode 100644 index 00000000..c3ddc55d --- /dev/null +++ b/include/sparrow/dictionary_encoded_layout.hpp @@ -0,0 +1,344 @@ +// Copyright 2024 Man Group Operations Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "sparrow/array_data.hpp"
+#include "sparrow/iterator.hpp"
+#include "sparrow/fixed_size_layout.hpp"
+#include "sparrow/mp_utils.hpp"
+#include "sparrow/iterator.hpp"
+
+namespace sparrow
+{
+    /**
+     * @class dictionary_value_iterator
+     *
+     * @brief Iterator over the data values of a dictionary layout.
+     *
+     * The iterator walks the index values and, when dereferenced, looks the
+     * current index up in the dictionary (sub) layout.
+     *
+     * @tparam IL the layout type of the indexes.
+     * @tparam SL the layout type of the dictionary.
+     * @tparam is_const a boolean flag specifying whether this iterator is const.
+     */
+    template
+    class dictionary_value_iterator : public iterator_base<
+                                          dictionary_value_iterator,
+                                          typename SL::value_type,
+                                          std::random_access_iterator_tag,
+                                          typename SL::const_reference>
+    {
+    public:
+
+        using self_type = dictionary_value_iterator;
+        using base_type = iterator_base<
+            self_type,
+            typename SL::value_type,
+            std::random_access_iterator_tag,
+            typename SL::const_reference>;
+        using reference = typename base_type::reference;
+        using difference_type = typename base_type::difference_type;
+
+        using index_iterator = std::conditional_t;
+        using sub_layout = mpl::constify_t;
+        using sub_layout_reference = sub_layout&;
+
+        // `dictionary_value_iterator` needs to be default constructible
+        // to satisfy `dictionary_encoded_layout::const_value_range`'s
+        // constraints.
+        dictionary_value_iterator() noexcept = default;
+        dictionary_value_iterator(index_iterator index_it, sub_layout_reference sub_layout_reference);
+
+    private:
+        reference dereference() const;
+        void increment();
+        void decrement();
+        void advance(difference_type n);
+        difference_type distance_to(const self_type& rhs) const;
+        bool equal(const self_type& rhs) const;
+        bool less_than(const self_type& rhs) const;
+
+        index_iterator m_index_it;
+        // Use std::optional because of default constructor.
+        std::optional> m_sub_layout_reference;
+
+        friend class iterator_access;
+    };
+
+    /**
+     * @class dictionary_encoded_layout
+     *
+     * @brief Layout for arrays containing many duplicated values.
+     *
+     * Dictionary encoding is a data representation technique to represent values by
+     * integers referencing a dictionary usually consisting of unique values. It can
+     * be effective when you have data with many repeated values.
+     *
+     * Example:
+     *
+     * data VarBinary (dictionary-encoded)
+     *   index_type: Int32
+     *   values: [0, 1, 3, 1, 4, 2]
+     *
+     * dictionary
+     *   type: VarBinary
+     *   values: ['foo', 'bar', 'baz', 'foo', null]
+     *
+     * Traversing the values will give you the following:
+     *  'foo', 'bar', 'foo', 'bar', null, 'baz'
+     *
+     * @tparam IT the type of the index. Must be an integral type.
+     * @tparam SL the layout type of the dictionary.
+     * @tparam OT type of the offset values. Must be std::int64_t or std::int32_t.
+     */
+    template
+    class dictionary_encoded_layout
+    {
+    public:
+        using self_type = dictionary_encoded_layout;
+        using index_type = IT;
+        using inner_value_type = SL::inner_value_type;
+        using sub_layout = SL;
+        using inner_reference = reference_proxy;
+        using inner_const_reference = const_reference_proxy;
+        using bitmap_type = array_data::bitmap_type;
+        using bitmap_const_reference = bitmap_type::const_reference;
+        using value_type = SL::value_type;
+        using reference = reference_proxy;
+        using const_reference = const_reference_proxy;
+        using size_type = std::size_t;
+        using indexes_layout = fixed_size_layout;
+        using iterator_tag = std::random_access_iterator_tag;
+
+        /**
+         * These types have to be public to be accessible when
+         * instantiating const_value_iterator for checking the
+         * requirements of subrange.
+         */
+        using data_type = IT;
+
+        using offset_iterator = OT*;
+        using const_offset_iterator = const OT*;
+
+        using data_iterator = data_type*;
+        using const_data_iterator = const data_type*;
+
+        // TODO: implement the iterator once #35 is merged
+        // using iterator = layout_iterator;
+        using const_iterator = layout_iterator;
+
+        using bitmap_iterator = indexes_layout::bitmap_iterator;
+        using const_bitmap_iterator = indexes_layout::const_bitmap_iterator;
+        using const_bitmap_range = indexes_layout::const_bitmap_range;
+
+        using value_iterator = dictionary_value_iterator;
+        using const_value_iterator = dictionary_value_iterator;
+        using const_value_range = std::ranges::subrange;
+
+        explicit dictionary_encoded_layout(array_data&& data);
+        explicit dictionary_encoded_layout(const array_data& data); // TODO: To remove when #51 will be merged
+
+        size_type size() const;  // number of index elements
+        const_reference operator[](size_type i) const;  // dictionary entry designated by the i-th index
+
+        const_iterator cbegin() const;
+        const_iterator cend() const;
+
+        const_bitmap_range bitmap() const;  // bitmap of the indexes layout
+        const_value_range values() const;  // dictionary values in index order
+
+    private:
+        const indexes_layout& get_const_indexes_layout() const;
+
+        const_value_iterator value_cbegin() const;
+        const_value_iterator value_cend() const;
+
+        inner_const_reference value(size_type i) const;
+
+        const_offset_iterator offset(size_type i) const;
+        const_offset_iterator offset_end() const;
+        const_data_iterator data(size_type i) const;
+
+        std::unique_ptr m_indexes_layout;  // layout built over the index buffer
+        std::unique_ptr m_sub_layout;  // layout built over `array_data::dictionary`
+
+        static const const_reference& dummy_const_reference(){  // shared result of operator[] for an index without value
+            static const typename sub_layout::inner_value_type dummy_inner_value;
+            static const typename sub_layout::bitmap_type dummy_bitmap(1, false);  // single bit, set to false
+            static const const_reference instance(dummy_inner_value, dummy_bitmap[0]);
+            return instance;
+        }
+
+        friend class const_reference_proxy;
+        friend class dictionary_value_iterator;
+    };
+
+    /********************************************
+     * dictionary_value_iterator implementation *
+     ********************************************/
+
+    template
+    dictionary_value_iterator::dictionary_value_iterator(index_iterator index_it, sub_layout_reference sub_layout_reference)
+        : m_index_it(index_it)
+        , m_sub_layout_reference(sub_layout_reference)
+    {
+    }
+
+    template
+    auto dictionary_value_iterator::dereference() const -> reference
+    {
+        assert(m_sub_layout_reference.has_value());  // a default-constructed iterator must not be dereferenced
+        return (*m_sub_layout_reference).get()[*m_index_it];  // look the current index up in the dictionary
+    }
+
+    template
+    void dictionary_value_iterator::increment()
+    {
+        ++m_index_it;
+    }
+
+    template
+    void dictionary_value_iterator::decrement()
+    {
+        --m_index_it;
+    }
+
+    template
+    void dictionary_value_iterator::advance(difference_type n)
+    {
+        m_index_it += n;
+    }
+
+    template
+    auto dictionary_value_iterator::distance_to(const self_type& rhs) const -> difference_type
+    {
+        return m_index_it.distance_to(rhs.m_index_it);  // the distance is entirely determined by the index iterators
+    }
+
+    template
+    bool dictionary_value_iterator::equal(const self_type& rhs) const
+    {
+        return m_index_it == rhs.m_index_it;
+    }
+
+    template
+    bool dictionary_value_iterator::less_than(const self_type& rhs) const
+    {
+        return m_index_it < rhs.m_index_it;
+    }
+
+    /********************************************
+     * dictionary_encoded_layout implementation *
+     ********************************************/
+
+    template
+    dictionary_encoded_layout::dictionary_encoded_layout(const array_data& data)
+    {
+        assert(data.dictionary);  // a dictionary must be attached to the array data
+        m_sub_layout = std::make_unique(*data.dictionary);
+        m_indexes_layout = std::make_unique(data);
+    }
+
+    template
+    dictionary_encoded_layout::dictionary_encoded_layout(array_data&& data)
+    {
+        assert(data.dictionary);  // a dictionary must be attached to the array data
+        m_sub_layout = std::make_unique(std::move(*data.dictionary));  // read the dictionary before `data` itself is moved
+        m_indexes_layout = std::make_unique(std::move(data));
+    }
+
+    template
+    auto dictionary_encoded_layout::size() const -> size_type
+    {
+        return m_indexes_layout->size();
+    }
+
+    template
+    auto dictionary_encoded_layout::operator[](size_type i) const -> const_reference
+    {
+        assert(i < size());
+        const auto index = (*m_indexes_layout)[i];
+        if (index.has_value()) {
+            return (*m_sub_layout)[index.value()];
+        }
+        else {
+            return dummy_const_reference();  // no index stored for this element
+        }
+    }
+
+    template
+    auto dictionary_encoded_layout::bitmap() const -> const_bitmap_range
+    {
+        return get_const_indexes_layout().bitmap();
+    }
+
+    template
+    const typename dictionary_encoded_layout::indexes_layout& dictionary_encoded_layout::get_const_indexes_layout() const
+    {
+        return *m_indexes_layout;  // binds directly to the const reference return type, no cast needed
+    }
+
+    template
+    auto dictionary_encoded_layout::cbegin() const -> const_iterator
+    {
+        return const_iterator(value_cbegin(), get_const_indexes_layout().bitmap().begin());
+    }
+
+    template
+    auto dictionary_encoded_layout::cend() const -> const_iterator
+    {
+        return const_iterator(value_cend(), get_const_indexes_layout().bitmap().end());
+    }
+
+    template
+    auto dictionary_encoded_layout::value_cbegin() const -> const_value_iterator
+    {
+        return const_value_iterator(get_const_indexes_layout().values().begin(), *m_sub_layout);
+    }
+
+    template
+    auto dictionary_encoded_layout::value_cend() const -> const_value_iterator
+    {
+        return const_value_iterator(get_const_indexes_layout().values().end(), *m_sub_layout);
+    }
+
+    template
+    auto dictionary_encoded_layout::values() const -> const_value_range
+    {
+        return const_value_range(value_cbegin(), value_cend());
+    }
+
+    template
+    auto dictionary_encoded_layout::value(size_type i) const -> inner_const_reference
+    {
+        return inner_const_reference(data(*offset(i)), data(*offset(i + 1)));
+    }
+
+    template
+    auto dictionary_encoded_layout::offset(size_type i) const -> const_offset_iterator
+    {
+        return m_indexes_layout->offset(i);
+    }
+
+    template
+    auto dictionary_encoded_layout::offset_end() const -> const_offset_iterator
+    {
+        return m_indexes_layout->offset_end();
+    }
+
+    template
+    auto dictionary_encoded_layout::data(size_type i) const -> const_data_iterator
+    {
+        return m_sub_layout->data(i);
+    }
+} // namespace sparrow
diff --git 
a/include/sparrow/mp_utils.hpp b/include/sparrow/mp_utils.hpp
index 70260c34..33da3812 100644
--- a/include/sparrow/mp_utils.hpp
+++ b/include/sparrow/mp_utils.hpp
@@ -29,4 +29,19 @@ namespace sparrow
         template
         using constify_t = typename constify::type;
     }
+
+    namespace impl
+    {
+        template
+        struct get_inner_reference
+            : std::conditional
+        {
+        };
+
+        template
+        using get_inner_reference_t = typename get_inner_reference::type;
+    }  // namespace impl
+
+    template
+    concept layout_offset = std::same_as || std::same_as;
 }
diff --git a/include/sparrow/variable_size_binary_layout.hpp b/include/sparrow/variable_size_binary_layout.hpp
index 63d7e4e2..29b6b7a7 100644
--- a/include/sparrow/variable_size_binary_layout.hpp
+++ b/include/sparrow/variable_size_binary_layout.hpp
@@ -15,30 +15,13 @@
 #pragma once
 
 #include
+
+#include "sparrow/mp_utils.hpp"
 #include "sparrow/array_data.hpp"
 #include "sparrow/iterator.hpp"
 
 namespace sparrow
 {
-    namespace impl
-    {
-        template
-        struct get_inner_reference
-            : std::conditional<
-                is_const,
-                typename C::inner_const_reference,
-                typename C::inner_reference
-            >
-        {
-        };
-
-        template
-        using get_inner_reference_t = typename get_inner_reference::type;
-    }
-
-    template
-    concept layout_offset = std::same_as || std::same_as;
-
     /**
      * @class vs_binary_value_iterator
      *
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 82e9d051..c7b7caca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SPARROW_TESTS_SOURCES
     test_iterator.cpp
     test_layout.cpp
     test_variable_size_binary_layout.cpp
+    test_dictionary_encoded_layout.cpp
 )
 set(test_target "test_sparrow_lib")
 add_executable(${test_target} ${SPARROW_TESTS_SOURCES})
diff --git a/test/test_dictionary_encoded_layout.cpp b/test/test_dictionary_encoded_layout.cpp
new file mode 100644
index 00000000..f8126300
--- /dev/null
+++ b/test/test_dictionary_encoded_layout.cpp
@@ -0,0 +1,203 @@
+// Copyright 2024 Man Group Operations Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "doctest/doctest.h"
+
+#include
+#include
+#include
+#include
+#include
+#include // For doctest
+
+#include "sparrow/variable_size_binary_layout.hpp"
+#include "sparrow/dictionary_encoded_layout.hpp"
+
+using data_type_t = uint8_t;  // type of the index values stored in the fixture
+constexpr size_t element_count = 10;
+static const std::array indexes {1, 0,3, 0, 1, 2, 3, 2, 4, 2};  // positions into `words`, one per element
+
+namespace sparrow
+{
+    struct dictionary_encoded_fixture  // builds `m_data`: ten indexes plus an attached dictionary of words
+    {
+        dictionary_encoded_fixture()
+        {
+            m_data.type = data_descriptor(data_type::UINT8);
+            m_data.bitmap = dynamic_bitset(element_count, true);
+            m_data.bitmap.set(9, false);  // element 9 is the only index whose bitmap bit is cleared
+            constexpr size_t buffer_size = (element_count * sizeof(data_type_t)) / sizeof(uint8_t);
+            buffer b(buffer_size);
+            std::ranges::copy(indexes,b.data());
+            m_data.buffers.push_back(b);
+            m_data.length = element_count;
+            auto dictionary = make_dictionary();
+            m_data.dictionary = std::make_shared(std::move(dictionary));
+        }
+
+        static array_data make_dictionary()  // buffers[0] receives the offsets, buffers[1] the concatenated words
+        {
+            array_data dictionary;
+            dictionary.bitmap.resize(words.size());
+            dictionary.buffers.resize(2);
+            dictionary.buffers[0].resize(sizeof(std::int64_t) * (words.size() + 1));  // one more offset than there are words
+            dictionary.buffers[1].resize(std::accumulate(
+                words.cbegin(), words.cend(), size_t(0), [](std::size_t res, const auto& s) { return res + s.size(); }
+            ));
+            dictionary.buffers[0].data()[0] = 0u;
+            auto iter = dictionary.buffers[1].begin();
+            const auto offset = [&dictionary](){ return dictionary.buffers[0].data(); };
+
+            for (size_t i = 0; i < words.size(); ++i)
+            {
+                offset()[i+1] = offset()[i] + words[i].size();  // running total of the word lengths
+                std::ranges::copy(words[i], iter);
+                iter += words[i].size();
+                dictionary.bitmap.set(i, true);
+            }
+            dictionary.bitmap.set(4, false);  // the last word ("null") gets its bitmap bit cleared
+
+            dictionary.length = words.size();
+            dictionary.offset = 0;
+            return dictionary;
+        }
+
+        static constexpr std::array words =
+        {
+            "you",
+            "are",
+            "not",
+            "prepared",
+            "null"
+        };
+
+        array_data m_data;
+        using sub_layout_type = variable_size_binary_layout;
+        using layout_type = dictionary_encoded_layout;
+    };
+
+    TEST_SUITE("dictionary_encoded_layout")
+    {
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "constructors")
+        {
+            CHECK(m_data.buffers.size() == 1);
+            const layout_type l_copy(m_data);
+            CHECK(m_data.buffers.size() == 1);  // constructing from a const reference leaves the source untouched
+            const layout_type l_move(std::move(m_data));
+            CHECK(m_data.buffers.size() == 0);  // the moved-from array data is expected to have no buffers left
+        }
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "size")
+        {
+            const layout_type l(m_data);
+            CHECK_EQ(l.size(), element_count);
+        }
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "operator[]")
+        {
+            const layout_type l(m_data);
+            CHECK_EQ(l[0].value(), words[1]);  // expected word is words[indexes[i]]
+            CHECK_EQ(l[1].value(), words[0]);
+            CHECK_EQ(l[2].value(), words[3]);
+            CHECK_EQ(l[3].value(), words[0]);
+            CHECK_EQ(l[4].value(), words[1]);
+            CHECK_EQ(l[5].value(), words[2]);
+            CHECK_EQ(l[6].value(), words[3]);
+            CHECK_EQ(l[7].value(), words[2]);
+            CHECK_FALSE(l[8].has_value());  // indexes[8] == 4, the dictionary entry whose bitmap bit is cleared
+            CHECK_FALSE(l[9].has_value());  // bitmap bit 9 of the indexes is cleared
+        }
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "const_iterator")
+        {
+            const layout_type l(m_data);
+            auto iter = l.cbegin();
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[0]);
+            ++iter;
+            --iter;  // back on element 0
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[0]);
+            iter += 2;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[2]);
+            ++iter;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[3]);
+            ++iter;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[4]);
+            ++iter;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[5]);
+            ++iter;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[6]);
+            ++iter;
+            CHECK(iter->has_value());
+            CHECK_EQ(iter->value(), l[7]);
+            ++iter;
+            CHECK(iter->has_value());  // element 8: bitmap bit 8 of the indexes is still set
+            CHECK_EQ(iter->value(), l[8]);
+            ++iter;
+            CHECK_FALSE(iter->has_value());  // element 9: bitmap bit cleared in the fixture
+            ++iter;
+            CHECK_EQ(iter, l.cend());
+        }
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "const_value_iterator")
+        {
+            layout_type l(m_data);
+            const auto vrange = l.values();
+            auto iter = vrange.begin();
+            CHECK_EQ(iter->value(), l[0].value());
+            ++iter;
+            --iter;  // back on element 0
+            CHECK_EQ(iter->value(), l[0].value());
+            iter += 2;
+            CHECK_EQ(iter->value(), l[2].value());
+            ++iter;
+            CHECK_EQ(iter->value(), l[3].value());
+            ++iter;
+            CHECK_EQ(iter->value(), l[4].value());
+            ++iter;
+            CHECK_EQ(iter->value(), l[5].value());
+            ++iter;
+            CHECK_EQ(iter->value(), l[6].value());
+            ++iter;
+            CHECK_EQ(iter->value(), l[7].value());
+            ++iter;
+            CHECK_EQ(iter->has_value(), l[8].has_value());
+            ++iter;
+            CHECK_EQ(iter->value(), words[2]);  // element 9: indexes[9] == 2
+            ++iter;
+            CHECK_EQ(iter, vrange.end());
+        }
+
+        TEST_CASE_FIXTURE(dictionary_encoded_fixture, "const_bitmap_iterator")
+        {
+            const layout_type l(m_data);
+            const auto brange = l.bitmap();
+            auto iter = brange.begin();
+            CHECK(*iter);
+            ++iter;
+            CHECK(*iter);
+            iter += 8;  // now on element 9
+            CHECK(!*iter);
+            ++iter;
+            CHECK_EQ(iter, brange.end());
+        }
+    }
+}