Skip to content

Commit

Permalink
betterment
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipDeegan committed Mar 23, 2024
1 parent 8b44c8c commit b84a312
Show file tree
Hide file tree
Showing 16 changed files with 227 additions and 125 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: test
run: |
Expand Down
20 changes: 8 additions & 12 deletions inc/mkn/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _MKN_GPU_HPP_
#define _MKN_GPU_HPP_

#if defined(MKN_GPU_ROCM)
#include "mkn/gpu/rocm.hpp"
#elif defined(MKN_GPU_CUDA)
#include "mkn/gpu/cuda.hpp"
#elif defined(MKN_GPU_CPU)
#include "mkn/gpu/cpu.hpp"
#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
#endif
#include "mkn/gpu/defines.hpp"

namespace mkn::gpu {

// Flattened global thread index, dispatched to whichever backend is active.
// Exactly one branch survives preprocessing; the values MKN_GPU_ROCM /
// MKN_GPU_CUDA / MKN_GPU_CPU are always defined (0 or 1) by defines.hpp.
__device__ uint32_t idx() {
#if MKN_GPU_ROCM
  return mkn::gpu::hip::idx();

#elif MKN_GPU_CUDA
  return mkn::gpu::cuda::idx();

#elif MKN_GPU_CPU
  return mkn::gpu::cpu::idx();

#else
#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"

#endif
}

Expand Down
8 changes: 5 additions & 3 deletions inc/mkn/gpu/alloc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,18 @@ class ManagedAllocator {

// Copies `size` elements between host and device, inferring the direction
// from the pointer attributes of `dst` and `src`.
// Preconditions: both pointers non-null (asserted); exactly one side must be
// a device pointer — device→device and host→host are not supported here.
template <typename T, typename Size>
void copy(T* const dst, T const* const src, Size size) {
  assert(dst and src);

  Pointer src_p{src};
  Pointer dst_p{dst};

  bool to_send = dst_p.is_device_ptr() && src_p.is_host_ptr();
  bool to_take = dst_p.is_host_ptr() && src_p.is_device_ptr();

  if (to_send)
    send(dst, src, size);
  else if (to_take)
    take(src, dst, size);  // take(from_device, to_host, n)
  else
    throw std::runtime_error("Unsupported operation (PR welcome)");
}
Expand Down
23 changes: 18 additions & 5 deletions inc/mkn/gpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,27 @@ struct Stream {
std::size_t stream = 0;
};

// CPU stand-in mirroring the GPU backends' StreamEvent interface.
// All host work is synchronous, so every operation is a no-op and
// finished() is always true.
struct StreamEvent {
  // Keep a copy of the stream so the member is meaningful, matching the
  // GPU backends where the event is tied to a specific stream.
  StreamEvent(Stream& stream_) : stream{stream_} {}
  ~StreamEvent() {}

  auto& operator()() { return event; };
  void record() {}                        // nothing to record on host
  bool finished() const { return true; }  // host work completes inline
  void reset() {}

  Stream stream;
  std::size_t event = 0;
};

// CPU stand-in for the CUDA/ROCm pointer-attribute query.
// Host memory is reported as host-, device- and managed-accessible all at
// once so generic code (e.g. alloc.hpp's copy()) takes the same branches it
// would on a real GPU backend.
template <typename T>
struct Pointer {
  Pointer(T* _t) : t{_t} {}

  bool is_unregistered_ptr() const { return t == nullptr; }
  bool is_host_ptr() const { return true; }
  bool is_device_ptr() const { return true; }
  bool is_managed_ptr() const { return true; }

  T* t;
};
Expand Down Expand Up @@ -129,7 +142,7 @@ void alloc_managed(T*& p, Size size) {
MKN_GPU_ASSERT(p = reinterpret_cast<T*>(std::malloc(size * sizeof(T))));
}

// Frees host memory previously allocated by this backend's alloc functions.
// `inline` so the header can be included by multiple translation units
// without ODR violations.
void inline destroy(void* p) {
  KLOG(TRC);
  std::free(p);
}
Expand Down Expand Up @@ -177,7 +190,7 @@ void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) {
take(p, span.data(), span.size(), start);
}

// No device to synchronize with on the CPU backend — intentional no-op.
void inline sync() {}

#include "mkn/gpu/alloc.hpp"
#include "mkn/gpu/device.hpp"
Expand All @@ -186,7 +199,7 @@ namespace detail {
static thread_local std::size_t idx = 0;
}

template <typename F, typename... Args>
template <bool _sync = true, typename F, typename... Args>
void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
KLOG(TRC) << N;
Expand Down
70 changes: 46 additions & 24 deletions inc/mkn/gpu/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <vector>

#include <cuda_runtime.h>

#include "mkn/kul/log.hpp"
#include "mkn/kul/span.hpp"
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"

#include <cuda_runtime.h>
#include "mkn/gpu/def.hpp"

// #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess))
Expand All @@ -54,6 +53,22 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort =
}
}

namespace mkn::gpu::cuda {

// Flattened global thread index for an arbitrary 1D/2D/3D launch:
// index = x + y*width + z*width*height, where width/height span the whole grid.
// NOTE(review): the products are computed in SIZE (default std::uint32_t), so
// very large grids could overflow — confirm launch sizes stay within range.
template <typename SIZE = std::uint32_t /*max 4294967296*/>
__device__ SIZE idx() {
  SIZE width = gridDim.x * blockDim.x;
  SIZE height = gridDim.y * blockDim.y;
  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
  return x + (y * width) + (z * width * height);
}

} // namespace mkn::gpu::cuda

//

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
#define MKN_GPU_NS mkn::gpu::cuda
#else
Expand All @@ -74,6 +89,24 @@ struct Stream {
cudaStream_t stream;
};

// Owns a cudaEvent_t tied to a specific stream; reset() recycles the event
// so one StreamEvent can track successive pieces of stream work.
struct StreamEvent {
  StreamEvent(Stream& stream_) : stream{stream_} { reset(); }
  // Destroy the last event created by reset() so it does not leak.
  // No MKN_GPU_ASSERT here: a destructor must not throw, and at program
  // teardown the CUDA context may already be gone.
  ~StreamEvent() {
    if (event) cudaEventDestroy(event);
  }

  auto& operator()() { return event; };
  void record() { MKN_GPU_ASSERT(result = cudaEventRecord(event, stream())); }
  bool finished() const { return cudaEventQuery(event) == cudaSuccess; }
  void reset() {
    if (event) MKN_GPU_ASSERT(result = cudaEventDestroy(event));
    MKN_GPU_ASSERT(result = cudaEventCreate(&event));
  }

  Stream& stream;
  cudaError_t result = cudaSuccess;  // last runtime-call status
  cudaEvent_t event = nullptr;
};

template <typename T>
struct Pointer {
Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); }
Expand Down Expand Up @@ -112,7 +145,7 @@ void alloc_managed(T*& p, Size size) {
MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T)));
}

// Frees device memory previously allocated via this backend's alloc functions.
// `inline` so the header can be included by multiple translation units
// without ODR violations.
void inline destroy(void* p) {
  KLOG(TRC);
  MKN_GPU_ASSERT(cudaFree(p));
}
Expand Down Expand Up @@ -147,23 +180,6 @@ void take(T const* p, T* t, Size size = 1, Size start = 0) {
MKN_GPU_ASSERT(cudaMemcpy(t, p + start, size * sizeof(T), cudaMemcpyDeviceToHost));
}

template <typename T, typename Size>
void copy(T* dst, T const* src, Size size = 1, Size start = 0) {
KLOG(TRC);
Pointer p{dst};
if (p.is_host_ptr())
take(src, dst, size, start);
else
send(dst, src, size, start);
}

template <typename Type, typename Alloc0, typename Alloc1>
void copy(std::vector<Type, Alloc0>& dst, std::vector<Type, Alloc1> const& src) {
KLOG(TRC);
assert(dst.size() >= src.size());
copy(dst.data(), src.data(), dst.size());
}

template <typename T, typename Size>
void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) {
KLOG(TRC);
Expand All @@ -185,19 +201,25 @@ void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
stream()));
}

// Block the host until ALL queued device work completes.
void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
// Block the host until work queued on `stream` completes — prefer this over
// the device-wide sync when other streams should keep running.
void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); }

#include "mkn/gpu/alloc.hpp"
#include "mkn/gpu/device.hpp"

// Launches kernel `f` with grid `g`, block `b`, dynamic shared memory `ds`
// bytes, on stream `s`. Device-memory arguments are unwrapped to raw pointers
// via devmem_replace before the launch.
// _sync (default true) keeps the old blocking behavior; pass false to leave
// the launch asynchronous and synchronize later (e.g. via StreamEvent).
template <bool _sync = true, typename F, typename... Args>
void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
  KLOG(TRC) << N;
  std::apply(
      [&](auto&&... params) { f<<<g, b, ds, s>>>(params...); },
      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
  if constexpr (_sync) {
    if (s)  // non-null handle: sync only this stream, not the whole device
      sync(s);
    else
      sync();
  }
}

//
Expand Down Expand Up @@ -254,7 +276,7 @@ void fill(Container& c, T val) {
}

//
void prinfo(size_t dev = 0) {
void inline prinfo(size_t dev = 0) {
cudaDeviceProp devProp;
[[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
Expand Down
24 changes: 0 additions & 24 deletions inc/mkn/gpu/cuda/def.hpp

This file was deleted.

10 changes: 0 additions & 10 deletions inc/mkn/gpu/def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,6 @@

#include <type_traits>

#if defined(MKN_GPU_ROCM)
#include "mkn/gpu/rocm/def.hpp"
#elif defined(MKN_GPU_CUDA)
#include "mkn/gpu/cuda/def.hpp"
#elif defined(MKN_GPU_CPU)

#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
#endif

namespace mkn::gpu {

#if defined(MKN_GPU_CPU)
Expand Down
52 changes: 52 additions & 0 deletions inc/mkn/gpu/defines.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@


#ifndef _MKN_GPU_DEFINES_HPP_
#define _MKN_GPU_DEFINES_HPP_

#include <type_traits>

// Backend detection: after this header, MKN_GPU_ROCM / MKN_GPU_CUDA /
// MKN_GPU_FN_PER_NS are always defined with a 0/1 value, so they can be
// used in plain `#if MKN_GPU_X` checks.

#if !defined(MKN_GPU_FN_PER_NS)
#define MKN_GPU_FN_PER_NS 0
#endif

#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
#define MKN_GPU_ROCM 1
#endif
#if !defined(MKN_GPU_ROCM)
#define MKN_GPU_ROCM 0
#endif

#if !defined(MKN_GPU_CUDA) and __has_include(<cuda_runtime.h>)
#define MKN_GPU_CUDA 1
#endif
#if !defined(MKN_GPU_CUDA)
#define MKN_GPU_CUDA 0
#endif

// Both backends present: force per-namespace functions so they don't clash.
// #undef first — redefining a macro to a different value is ill-formed.
#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
#undef MKN_GPU_FN_PER_NS
#define MKN_GPU_FN_PER_NS 1
#endif

#if MKN_GPU_ROCM == 1
#include "mkn/gpu/rocm.hpp"
#endif

#if MKN_GPU_CUDA == 1
#include "mkn/gpu/cuda.hpp"
#endif

#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
#include "mkn/gpu/cpu.hpp"
#endif

namespace mkn::gpu {

// Compile-time record of which backends this translation unit was built with.
struct CompileFlags {
  bool constexpr static withCUDA = MKN_GPU_CUDA;
  bool constexpr static withROCM = MKN_GPU_ROCM;
  bool constexpr static perNamespace = MKN_GPU_FN_PER_NS;
};

} /* namespace mkn::gpu */

#endif /*_MKN_GPU_DEFINES_HPP_*/
3 changes: 3 additions & 0 deletions inc/mkn/gpu/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ struct DeviceMem {

auto& size() const { return s; }

auto* data() { return p; }
auto* data() const { return p; }

std::size_t s = 0;
T* p = nullptr;
bool owned = false;
Expand Down
16 changes: 12 additions & 4 deletions inc/mkn/gpu/launchers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _MKN_GPU_LAUNCHERS_HPP_
#define _MKN_GPU_LAUNCHERS_HPP_

template <bool _sync = true>
struct GDLauncher : public GLauncher {
GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {}

// Launch on the launcher's own stream member `s` (inherited from GLauncher).
template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
  _launch(s, std::forward<F>(f),
          as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
          count, args...);
}

// Launch on a caller-supplied stream instead of the launcher's own.
template <typename F, typename... Args>
auto stream(Stream& s, F&& f, Args&&... args) {
  _launch(s.stream, std::forward<F>(f),
          as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
          count, args...);
}
Expand All @@ -48,9 +56,9 @@ struct GDLauncher : public GLauncher {
return T{nullptr};
}

template <typename F, typename... PArgs, typename... Args>
void _launch(F&& f, std::tuple<PArgs&...>*, Args&&... args) {
MKN_GPU_NS::launch(&global_gd_kernel<F, PArgs...>, g, b, ds, s, f, args...);
template <typename S, typename F, typename... PArgs, typename... Args>
void _launch(S& _s, F&& f, std::tuple<PArgs&...>*, Args&&... args) {
MKN_GPU_NS::launch<_sync>(&global_gd_kernel<F, PArgs...>, g, b, ds, _s, f, args...);
}
};

Expand Down
Loading

0 comments on commit b84a312

Please sign in to comment.