Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Meta-programming tools and arrow types traits support. #49

Merged
merged 21 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@ set(SPARROW_HEADERS
${SPARROW_INCLUDE_DIR}/sparrow/buffer.hpp
${SPARROW_INCLUDE_DIR}/sparrow/fixed_size_layout.hpp
${SPARROW_INCLUDE_DIR}/sparrow/data_type.hpp
${SPARROW_INCLUDE_DIR}/sparrow/data_traits.hpp
${SPARROW_INCLUDE_DIR}/sparrow/dynamic_bitset.hpp
${SPARROW_INCLUDE_DIR}/sparrow/iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/mp_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/sparrow_version.hpp
${SPARROW_INCLUDE_DIR}/sparrow/variable_size_binary_layout.hpp

${SPARROW_INCLUDE_DIR}/sparrow/details/3rdparty/float16_t.hpp
)

add_library(sparrow INTERFACE)
Expand Down
141 changes: 141 additions & 0 deletions include/sparrow/data_traits.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "data_type.hpp"
#include "fixed_size_layout.hpp"
#include "variable_size_binary_layout.hpp"

namespace sparrow
{

/// Shared base for the `arrow_traits` specializations of native types:
/// exposes `T` itself as `value_type` and `fixed_size_layout<T>` as
/// `default_layout`.
template <typename T>
struct common_native_types_traits
{
    using default_layout = fixed_size_layout<T>;
    using value_type = T;
};

/// Traits for `std::nullopt_t`, identified by `data_type::NA`.
template <>
struct arrow_traits<std::nullopt_t>
{
    using value_type = std::nullopt_t;

    // TODO: replace this by a special layout that's always empty
    using default_layout = fixed_size_layout<value_type>;

    static constexpr data_type type_id{data_type::NA};
};

/// Traits for `bool`, identified by `data_type::BOOL`.
/// `value_type` and `default_layout` come from `common_native_types_traits<bool>`.
template <>
struct arrow_traits<bool> : public common_native_types_traits<bool>
{
    static constexpr data_type type_id{data_type::BOOL};
};

/// Traits for `std::uint8_t`, identified by `data_type::UINT8`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::uint8_t>`.
template <>
struct arrow_traits<std::uint8_t> : public common_native_types_traits<std::uint8_t>
{
    static constexpr data_type type_id{data_type::UINT8};
};

/// Traits for `std::int8_t`, identified by `data_type::INT8`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::int8_t>`.
template <>
struct arrow_traits<std::int8_t> : public common_native_types_traits<std::int8_t>
{
    static constexpr data_type type_id{data_type::INT8};
};

/// Traits for `std::uint16_t`, identified by `data_type::UINT16`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::uint16_t>`.
template <>
struct arrow_traits<std::uint16_t> : public common_native_types_traits<std::uint16_t>
{
    static constexpr data_type type_id{data_type::UINT16};
};

/// Traits for `std::int16_t`, identified by `data_type::INT16`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::int16_t>`.
template <>
struct arrow_traits<std::int16_t> : public common_native_types_traits<std::int16_t>
{
    static constexpr data_type type_id{data_type::INT16};
};

/// Traits for `std::uint32_t`, identified by `data_type::UINT32`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::uint32_t>`.
template <>
struct arrow_traits<std::uint32_t> : public common_native_types_traits<std::uint32_t>
{
    static constexpr data_type type_id{data_type::UINT32};
};

/// Traits for `std::int32_t`, identified by `data_type::INT32`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::int32_t>`.
template <>
struct arrow_traits<std::int32_t> : public common_native_types_traits<std::int32_t>
{
    static constexpr data_type type_id{data_type::INT32};
};

/// Traits for `std::uint64_t`, identified by `data_type::UINT64`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::uint64_t>`.
template <>
struct arrow_traits<std::uint64_t> : public common_native_types_traits<std::uint64_t>
{
    static constexpr data_type type_id{data_type::UINT64};
};

/// Traits for `std::int64_t`, identified by `data_type::INT64`.
/// `value_type` and `default_layout` come from `common_native_types_traits<std::int64_t>`.
template <>
struct arrow_traits<std::int64_t> : public common_native_types_traits<std::int64_t>
{
    static constexpr data_type type_id{data_type::INT64};
};

/// Traits for `float16_t`, identified by `data_type::HALF_FLOAT`.
/// `value_type` and `default_layout` come from `common_native_types_traits<float16_t>`.
template <>
struct arrow_traits<float16_t> : public common_native_types_traits<float16_t>
{
    static constexpr data_type type_id{data_type::HALF_FLOAT};
};

/// Traits for `float32_t`, identified by `data_type::FLOAT`.
/// `value_type` and `default_layout` come from `common_native_types_traits<float32_t>`.
template <>
struct arrow_traits<float32_t> : public common_native_types_traits<float32_t>
{
    static constexpr data_type type_id{data_type::FLOAT};
};

/// Traits for `float64_t`, identified by `data_type::DOUBLE`.
/// `value_type` and `default_layout` come from `common_native_types_traits<float64_t>`.
template <>
struct arrow_traits<float64_t> : public common_native_types_traits<float64_t>
{
    static constexpr data_type type_id{data_type::DOUBLE};
};

/// Traits for `std::string`, identified by `data_type::STRING`.
template <>
struct arrow_traits<std::string>
{
    static constexpr data_type type_id = data_type::STRING;
    using value_type = std::string;

    // FIXME: this is incorrect, change when we have the right types
    using default_layout = variable_size_binary_layout<value_type, std::string_view, std::string_view>;
};

namespace predicate
{

/// Function object taking an `mpl::typelist<T>` and yielding whether `T`
/// satisfies the `sparrow::is_arrow_base_type` concept.
struct
{
    // `const`-qualified: the object declared below is `constexpr`, hence
    // implicitly const, and could otherwise only be invoked through a
    // non-const copy.
    template <class T>
    consteval bool operator()(mpl::typelist<T>) const
    {
        return sparrow::is_arrow_base_type<T>;
    }
} constexpr is_arrow_base_type;

/// Function object taking an `mpl::typelist<T>` and yielding whether
/// `arrow_traits<T>` satisfies the `sparrow::is_arrow_traits` concept.
struct
{
    // `const`-qualified: the object declared below is `constexpr`, hence
    // implicitly const, and could otherwise only be invoked through a
    // non-const copy.
    template <class T>
    consteval bool operator()(mpl::typelist<T>) const
    {
        return sparrow::is_arrow_traits< arrow_traits<T> >;
    }
} constexpr has_arrow_traits;
}

}
164 changes: 163 additions & 1 deletion include/sparrow/data_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,46 @@

#pragma once

#include <cstdint>
#include <climits>
#include <string>
#include <vector>
#include <optional>

#include "sparrow/mp_utils.hpp"

// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see https://en.cppreference.com/w/cpp/types/floating-point
#if __cplusplus <= 202002L
# include "details/3rdparty/float16_t.hpp"
#else
# include <stdfloat>
#endif


namespace sparrow
{
// Fixed-width floating-point aliases used throughout the library.
// Up to and including C++20 they map to the bundled `numeric::float16_t`
// and to the built-in `float` / `double`; with a newer standard they map
// to the `<stdfloat>` types instead.
// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see https://en.cppreference.com/w/cpp/types/floating-point
#if __cplusplus <= 202002L
using float16_t = numeric::float16_t;
using float32_t = float;
using float64_t = double;
#else
using float16_t = std::float16_t;
using float32_t = std::float32_t;
using float64_t = std::float64_t;
#endif

// Compile-time checks that the target platform and toolchain provide these
// types as this library expects.
// Each floating-point alias must have exactly the size (in bytes) its name implies...
static_assert(sizeof(float16_t) == 2);
static_assert(sizeof(float32_t) == 4);
static_assert(sizeof(float64_t) == 8);
// ...and must be reported as a floating-point type by `std::is_floating_point`.
static_assert(std::is_floating_point_v<float16_t>);
static_assert(std::is_floating_point_v<float32_t>);
static_assert(std::is_floating_point_v<float64_t>);
// A byte must be exactly 8 bits wide.
static_assert(CHAR_BIT == 8);


/// Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
// TODO: does not support all types specified by the Arrow specification
// yet
enum class data_type
Expand All @@ -38,9 +76,114 @@ namespace sparrow
// Variable-length bytes (no guarantee of UTF8-ness)
BINARY,
// Fixed-size binary. Each value occupies the same number of bytes
FIXED_SIZE_BINARY
FIXED_SIZE_BINARY,
};

/// C++ value-representation types matching Arrow types.
// NOTE: the order of this list must stay in sync with the order of `data_type`.
using all_base_types_t = mpl::typelist
<
std::nullopt_t // REVIEW: not sure about if we need to have this one? for representing NA? is this the right type?
Klaim marked this conversation as resolved.
Show resolved Hide resolved
, bool
, std::uint8_t
, std::int8_t
, std::uint16_t
, std::int16_t
, std::uint32_t
, std::int32_t
, std::uint64_t
, std::int64_t
, float16_t
, float32_t
, float64_t
, std::string
//, std::vector<std::byte> // REVIEW should this be uint8_t? char? buffer<uint8_t>?
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's true that we have used uint8_t over std::byte for the buffer.

Is uint8_t use for interoperability with C, or are there other reason not to use std::byte?

Also, if we use uint8_t we must:

static_assert(CHAR_BIT == 8);

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main "classic" issue is that because std::byte is a strong type, not implicitly convertible to other similar types, it's not automatically compatible with functions working with char or char-like types, in particular when interfacing with C. Lots of explicit casts ensue. Not a big problem if you like correctness, but it does obfuscate the code. It's the main criticism I'm aware of.

In this library, I'm not sure what would be best, but the C API kind of forces at least some of the types on us?

Also, if we use uint8_t we must:

Hmm, actually, shouldn't we always check that? I'll add it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think, @JohanMabille?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The C Api uses void* for the buffers, so we'll have to cast anyway. I would actually be in favor of replacing the uint8_t with byte in the array_data buffers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I went with an alias byte_t so that we can change that easily, and added vector<byte_t>; I suppose we'll see how it goes from here.

// TODO: add missing fundamental types here
>;

/// Type list of every C++ representation type supported by default, in order matching `data_type` related values.
// `inline` (rather than `static`) so that every translation unit including
// this header refers to one single object instead of a per-TU copy.
inline constexpr all_base_types_t all_base_types;

/// Satisfied when `mpl::contains<T>(all_base_types)` holds, i.e. for the
/// C++ representation types which are supported by default.
template <typename T>
concept is_arrow_base_type = mpl::contains<T>(all_base_types);

/// Provides compile-time information about Arrow data types.
/// Custom types can be made compatible by specializing this traits type.
///
/// A specialization must provide the following if the type is an arrow type:
/// - type_id : the runtime identifier value for that type, see `data_type`
/// - value_type : the value representation type to use in C++ (usually T)
/// - default_layout: the layout to use for that type FIXME: be more precise than that
/// - MORE TO COME SOON, SEE TODOs BELOW
///
/// @tparam T C++ value-representation type that the specialization describes the traits of.
///
/// @note: See ./data_traits.hpp for the specializations for default base types.
/// @see `is_arrow_traits`, `has_arrow_type_traits`
template <class T>
struct arrow_traits;

/// Matches valid and complete `arrow_traits` specializations for type T.
/// Every type that needs to be compatible with this library's interface must
/// provide a specialization of `arrow_traits`
/// @see `arrow_traits`, `has_arrow_type_traits`
template<class T>
concept is_arrow_traits = mpl::is_type_instance_of_v< T, arrow_traits >
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Semantically, I think we should rename arrow_traits or is_arrow_traits given the definition of this concept.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't get what you meant, but just in case, here is the reason I named them this way:

  • arrow_traits is similar in purpose to std::iterator_traits so I followed that naming pattern;
  • I can't have both a traits type and a concept with the same name, and since a concept is a kind of compile-time predicate, using is_... makes sense if I don't have the choice of just naming it arrow_traits.

I'm open to suggestions for alternatives; you seem to think is_arrow_traits doesn't match what it does, or at least the documentation?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think my remark was not clear.

is_arrow_traits checks for more than a template type parameter being an arrow_traits: it is currently defined on several conditions, including the one here (on line 130), but also other ones below (which my previous comment neither shows nor mentions). In this regard, I think the naming is inconsistent.

What do you think?

Copy link
Collaborator Author

@Klaim Klaim Apr 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These conditions (mostly interface checks) are all requirements for a valid (as in usable by the rest of the library) arrow_traits specialization, so the name seems fine to me, but maybe I'm missing your point? Are you suggesting something like is_valid_arrow_traits, to be clearer?

and requires
{
/// Must provide a compile-time value of type `data_type`.
/// This is used to identify which arrow data type is represented in `value_type`
requires std::same_as< std::remove_cvref_t<decltype(T::type_id)>, ::sparrow::data_type >;

/// The C++ representation of the arrow value. For `arrow_traits<X>`, this is usually `X`.
typename T::value_type;

/// The arrow (binary) layout to use by default for representing a set of data for that type.
typename T::default_layout;

// TODO: add more interface requirements on the traits here
// TODO: add conversion operations between bytes and the value type
}
;


/// Satisfied by types `T` for which `arrow_traits<T>` names a type that
/// models `is_arrow_traits`.
/// @see `is_arrow_traits`, `arrow_traits`
template <typename T>
concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
                                && is_arrow_traits< ::sparrow::arrow_traits<T> >;

/// Satisfied by any type that models `is_arrow_base_type` or
/// `has_arrow_type_traits`, i.e. a base C++ type supported by default or a
/// type providing an `arrow_traits` specialization.
template <typename T>
concept any_arrow_type = is_arrow_base_type<T> || has_arrow_type_traits<T>;

/// @returns The Arrow type id declared for the C++ representation type `T`,
///          i.e. `arrow_traits<T>::type_id`.
/// @see `arrow_traits`
template <has_arrow_type_traits T>
constexpr data_type arrow_type_id()
{
    return arrow_traits<T>::type_id;
}

/// @returns The Arrow type id declared for the type of the given object,
///          i.e. `arrow_traits<T>::type_id`. Only the argument's type is
///          used; its value is ignored.
/// @see `arrow_traits`
template <has_arrow_type_traits T>
constexpr data_type arrow_type_id(const T&)
{
    return arrow_traits<T>::type_id;
}


/// Alias for `arrow_traits<T>::default_layout`: the binary layout type used
/// by default for the C++ representation `T` of an arrow value.
template <has_arrow_type_traits T>
using default_layout_t = typename arrow_traits<T>::default_layout;


// For now, a tiny wrapper around data_type
// More data and functions to come
class data_descriptor
Expand All @@ -63,5 +206,24 @@ namespace sparrow

data_type m_id;
};


namespace impl
{
    /// Metafunction whose nested `type` is `C::inner_const_reference` when
    /// `is_const` is true, and `C::inner_reference` otherwise.
    template <typename C, bool is_const>
    struct get_inner_reference
        : public std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
    {
    };

    /// Shorthand for `get_inner_reference<C, is_const>::type`.
    template <typename C, bool is_const>
    using get_inner_reference_t = typename get_inner_reference<C, is_const>::type;
}  // namespace impl

/// Satisfied only when `T` is exactly `std::int32_t` or `std::int64_t`.
template <typename T>
concept layout_offset = std::same_as<T, std::int32_t> or std::same_as<T, std::int64_t>;



}

7 changes: 7 additions & 0 deletions include/sparrow/details/3rdparty/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# 3rd Party sources

This directory contains 3rd-party source code that we decided not to handle through automatic dependency management for various reasons described below.
Please prefer automatic dependency management if you can!

Reasoning for:
- `float16_t.hpp`: we need a `float16_t` type, but we currently cannot rely on C++23's standard definition because some toolchains cannot yet use compiler versions that support C++23. Hence we decided to use this header (modified with a note about where it was taken from), but only when building with a C++ standard older than C++23.
Loading
Loading