Skip to content

Commit

Permalink
Add dictionary encoded layout (#56)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex-PLACET authored Apr 4, 2024
1 parent 9a19935 commit dbbfd27
Show file tree
Hide file tree
Showing 7 changed files with 569 additions and 19 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,6 @@

# CTest directory
/Testing/

# Clangd cache
.cache
1 change: 1 addition & 0 deletions include/sparrow/array_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ namespace sparrow
// Other buffers
std::vector<buffer_type> buffers;
std::vector<array_data> child_data;
std::shared_ptr<array_data> dictionary;
};

/**
Expand Down
344 changes: 344 additions & 0 deletions include/sparrow/dictionary_encoded_layout.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,344 @@
// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cassert>
#include <concepts>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <optional>
#include <ranges>
#include <type_traits>
#include <utility>

#include "sparrow/array_data.hpp"
#include "sparrow/fixed_size_layout.hpp"
#include "sparrow/iterator.hpp"
#include "sparrow/mp_utils.hpp"

namespace sparrow
{
/**
 * @class dictionary_value_iterator
 *
 * @brief Random access iterator yielding the decoded values of a dictionary
 * encoded layout.
 *
 * The iterator walks over the indexes; dereferencing it uses the current index
 * to look the value up in the dictionary layout. The reference type is
 * `SL::const_reference` whatever the value of `is_const`.
 *
 * @tparam IL the layout type holding the indexes.
 * @tparam SL the layout type holding the dictionary values.
 * @tparam is_const whether the iterator uses the const index iterator and
 *                  refers to a const dictionary layout.
 */
template <class IL, class SL, bool is_const>
class dictionary_value_iterator
    : public iterator_base<
          dictionary_value_iterator<IL, SL, is_const>,
          typename SL::value_type,
          std::random_access_iterator_tag,
          typename SL::const_reference>
{
public:

    using self_type = dictionary_value_iterator<IL, SL, is_const>;
    using base_type = iterator_base<
        self_type,
        typename SL::value_type,
        std::random_access_iterator_tag,
        typename SL::const_reference>;
    using difference_type = typename base_type::difference_type;
    using reference = typename base_type::reference;

    using sub_layout = mpl::constify_t<SL, is_const>;
    using sub_layout_reference = sub_layout&;
    using index_iterator = std::conditional_t<
        is_const,
        typename IL::const_value_iterator,
        typename IL::value_iterator>;

    // A default constructor is mandatory: without it this iterator would not
    // meet the constraints of `dictionary_encoded_layout::const_value_range`.
    dictionary_value_iterator() noexcept = default;
    dictionary_value_iterator(index_iterator index_it, sub_layout_reference sub_layout_ref);

private:

    // Iteration primitives; `iterator_access` is befriended below so that it
    // can reach them.
    bool equal(const self_type& rhs) const;
    bool less_than(const self_type& rhs) const;
    difference_type distance_to(const self_type& rhs) const;
    reference dereference() const;
    void increment();
    void decrement();
    void advance(difference_type n);

    // Current position in the sequence of indexes.
    index_iterator m_index_it;
    // Wrapped in an optional because std::reference_wrapper cannot be default
    // constructed; empty only for a default constructed iterator.
    std::optional<std::reference_wrapper<sub_layout>> m_sub_layout_reference;

    friend class iterator_access;
};

/*
* @class dictionary_encoded_layout
*
* @brief Layout for arrays containing many duplicated values.
*
* Dictionary encoding is a data representation technique to represent values by
* integers referencing a dictionary usually consisting of unique values. It can
* be effective when you have data with many repeated values.
*
* Example:
*
* data VarBinary (dictionary-encoded)
* index_type: Int32
* values: [0, 1, 3, 1, 4, 2]
*
* dictionary
* type: VarBinary
* values: ['foo', 'bar', 'baz', 'foo', null]
*
* Traversing the values will give you the following:
* 'foo', 'bar', 'foo', 'bar', null, 'baz'
*
* @tparam IT the type of the index. Must be an integral.
* @tparam SL the layout type of the dictionary.
* @tparam OT type of the offset values. Must be std::int64_t or std::int32_t.
*/
template <std::integral IT, class SL, layout_offset OT = std::int64_t>
class dictionary_encoded_layout
{
public:
using self_type = dictionary_encoded_layout<IT, SL, OT>;
using index_type = IT;
using inner_value_type = SL::inner_value_type;
using sub_layout = SL;
using inner_reference = reference_proxy<SL>;
using inner_const_reference = const_reference_proxy<SL>;
using bitmap_type = array_data::bitmap_type;
using bitmap_const_reference = bitmap_type::const_reference;
using value_type = SL::value_type;
using reference = reference_proxy<SL>;
using const_reference = const_reference_proxy<SL>;
using size_type = std::size_t;
using indexes_layout = fixed_size_layout<IT>;
using iterator_tag = std::random_access_iterator_tag;

/**
* These types have to be public to be accessible when
* instantiating const_value_iterator for checking the
* requirements of subrange.
*/
using data_type = IT;

using offset_iterator = OT*;
using const_offset_iterator = const OT*;

using data_iterator = data_type*;
using const_data_iterator = const data_type*;

// TODO: implement the iterator once #35 is merged
// using iterator = layout_iterator<self_type, false>;
using const_iterator = layout_iterator<self_type, true>;

using bitmap_iterator = indexes_layout::bitmap_iterator;
using const_bitmap_iterator = indexes_layout::const_bitmap_iterator;
using const_bitmap_range = indexes_layout::const_bitmap_range;

using value_iterator = dictionary_value_iterator<indexes_layout, sub_layout, false>;
using const_value_iterator = dictionary_value_iterator<indexes_layout, sub_layout, true>;
using const_value_range = std::ranges::subrange<const_value_iterator, const_value_iterator>;

explicit dictionary_encoded_layout(array_data&& data);
explicit dictionary_encoded_layout(const array_data& data); // TODO: To remove when #51 will be merged

size_type size() const;
const_reference operator[](size_type i) const;

const_iterator cbegin() const;
const_iterator cend() const;

const_bitmap_range bitmap() const;
const_value_range values() const;

private:
const indexes_layout& get_const_indexes_layout() const;

const_value_iterator value_cbegin() const;
const_value_iterator value_cend() const;

inner_const_reference value(size_type i) const;

const_offset_iterator offset(size_type i) const;
const_offset_iterator offset_end() const;
const_data_iterator data(size_type i) const;

std::unique_ptr<indexes_layout> m_indexes_layout;
std::unique_ptr<sub_layout> m_sub_layout;

static const const_reference& dummy_const_reference(){
static const typename sub_layout::inner_value_type dummy_inner_value;
static const typename sub_layout::bitmap_type dummy_bitmap(1, false);
static const const_reference instance(dummy_inner_value, dummy_bitmap[0]);
return instance;
}

friend class const_reference_proxy<self_type>;
friend class dictionary_value_iterator<indexes_layout, sub_layout, true>;
};

/*******************************************
* dictionary_value_iterator implementation *
*******************************************/

/**
 * Builds an iterator positioned at @p first_index whose values are looked up
 * in @p dictionary.
 */
template <class IL, class SL, bool is_const>
dictionary_value_iterator<IL, SL, is_const>::dictionary_value_iterator(
    index_iterator first_index,
    sub_layout_reference dictionary
)
    : m_index_it(first_index)
    , m_sub_layout_reference(dictionary)
{
}

template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::dereference() const -> reference
{
assert(m_sub_layout_reference.has_value());
return (*m_sub_layout_reference).get()[*m_index_it];
}

// Moves the iterator to the next index.
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::increment() -> void
{
    ++m_index_it;
}

// Moves the iterator to the previous index.
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::decrement() -> void
{
    --m_index_it;
}

// Moves the iterator by n indexes; n may be negative.
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::advance(difference_type n) -> void
{
    m_index_it += n;
}

/**
 * Returns the number of increments needed to go from this iterator to @p rhs.
 *
 * The distance is computed with the subtraction operator of the index
 * iterators, which every random access iterator (raw pointers included)
 * provides, and the result is actually returned: flowing off the end of a
 * non-void function is undefined behaviour.
 *
 * NOTE(review): assumes iterator_base expects the distance from *this to rhs
 * (i.e. rhs - *this) - confirm against sparrow/iterator.hpp.
 */
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::distance_to(const self_type& rhs) const -> difference_type
{
    return rhs.m_index_it - m_index_it;
}

// Two iterators are equal when they point to the same index; the dictionary
// they refer to does not take part in the comparison.
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::equal(const self_type& rhs) const -> bool
{
    const bool same_position = (m_index_it == rhs.m_index_it);
    return same_position;
}

// Orders iterators by the position of their index iterator only.
template <class IL, class SL, bool is_const>
auto dictionary_value_iterator<IL, SL, is_const>::less_than(const self_type& rhs) const -> bool
{
    const bool is_before = (m_index_it < rhs.m_index_it);
    return is_before;
}

/**********************************************
* dictionary_encoded_layout implementation *
**********************************************/

/**
 * Builds the layout from @p data without modifying it: the dictionary layout
 * is constructed from `*data.dictionary` and the index layout from @p data,
 * both passed as lvalues.
 *
 * @param data array data whose `dictionary` member must not be null.
 */
template <std::integral IT, class SL, layout_offset OT>
dictionary_encoded_layout<IT, SL, OT>::dictionary_encoded_layout(const array_data& data)
{
    assert(data.dictionary != nullptr);
    auto& dictionary_data = *data.dictionary;
    m_sub_layout = std::make_unique<sub_layout>(dictionary_data);
    m_indexes_layout = std::make_unique<indexes_layout>(data);
}

/**
 * Builds the layout by consuming @p data.
 *
 * @p data is moved into the index layout. The dictionary is held through a
 * `std::shared_ptr`, so other `array_data` objects may reference it as well:
 * it is moved from only when @p data is its sole owner, and copied otherwise
 * so that the other owners are not left with a moved-from dictionary.
 *
 * @param data array data whose `dictionary` member must not be null.
 */
template <std::integral T, class SL, layout_offset OT>
dictionary_encoded_layout<T, SL, OT>::dictionary_encoded_layout(array_data&& data)
{
    assert(data.dictionary);
    if (data.dictionary.use_count() == 1)
    {
        m_sub_layout = std::make_unique<SL>(std::move(*data.dictionary));
    }
    else
    {
        // Shared dictionary: copy it, stealing its content would corrupt the
        // other owners.
        m_sub_layout = std::make_unique<SL>(*data.dictionary);
    }
    // Must come last: `data` is in a moved-from state afterwards.
    m_indexes_layout = std::make_unique<indexes_layout>(std::move(data));
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::size() const -> size_type
{
return m_indexes_layout->size();
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::operator[](size_type i) const -> const_reference
{
assert(i < size());
const auto index = (*m_indexes_layout)[i];
if (index.has_value()) {
return (*m_sub_layout)[index.value()];
}
else {
return dummy_const_reference();
}
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::bitmap() const -> const_bitmap_range
{
return get_const_indexes_layout().bitmap();
}

// Gives const access to the index layout, so that callers pick its const
// overloads even though the owning pointer is to a non-const layout.
template <std::integral IT, class SL, layout_offset OT>
auto dictionary_encoded_layout<IT, SL, OT>::get_const_indexes_layout() const -> const indexes_layout&
{
    const indexes_layout& indexes = *m_indexes_layout;
    return indexes;
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::cbegin() const -> const_iterator
{
return const_iterator(value_cbegin(), get_const_indexes_layout().bitmap().begin());
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::cend() const -> const_iterator
{
return const_iterator(value_cend(), get_const_indexes_layout().bitmap().end());
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::value_cbegin() const -> const_value_iterator
{
return const_value_iterator(get_const_indexes_layout().values().begin(), *m_sub_layout);
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::value_cend() const -> const_value_iterator
{
return const_value_iterator(get_const_indexes_layout().values().end(), *m_sub_layout);
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::values() const -> const_value_range
{
return const_value_range(value_cbegin(), value_cend());
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::value(size_type i) const -> inner_const_reference
{
return inner_const_reference(data(*offset(i)), data(*offset(i + 1)));
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::offset(size_type i) const -> const_offset_iterator
{
return m_indexes_layout->offset(i);
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::offset_end() const -> const_offset_iterator
{
return m_indexes_layout->offset_end();
}

template <std::integral T, class SL, layout_offset OT>
auto dictionary_encoded_layout<T, SL, OT>::data(size_type i) const -> const_data_iterator
{
return m_sub_layout->data(i);
}
} // namespace sparrow
15 changes: 15 additions & 0 deletions include/sparrow/mp_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,19 @@ namespace sparrow
// Shorthand for `constify<T, is_const>::type`.
template <class T, bool is_const>
using constify_t = typename constify<T, is_const>::type;
}

namespace impl
{
    /**
     * Metafunction selecting the inner reference type of a container.
     *
     * The nested `type` is `C::inner_const_reference` when `is_const` is true
     * and `C::inner_reference` otherwise. Both nested types must exist in `C`
     * whatever the value of `is_const`.
     */
    template <class C, bool is_const>
    struct get_inner_reference
        : public std::conditional<
              is_const,
              typename C::inner_const_reference,
              typename C::inner_reference>
    {
    };

    /// Shorthand for `get_inner_reference<C, is_const>::type`.
    template <class C, bool is_const>
    using get_inner_reference_t = typename get_inner_reference<C, is_const>::type;
}  // namespace impl

/**
 * Concept satisfied only by `std::int32_t` and `std::int64_t`, the two types
 * accepted as offset type by the layouts.
 */
template <class T>
concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
}
Loading

0 comments on commit dbbfd27

Please sign in to comment.