Skip to content

Commit

Permalink
host group mutex etc (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipDeegan authored Dec 8, 2024
1 parent 0a5fdf2 commit 9924da7
Show file tree
Hide file tree
Showing 11 changed files with 455 additions and 274 deletions.
12 changes: 7 additions & 5 deletions inc/mkn/gpu/alloc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ class NoConstructAllocator : public MknGPUAllocator<T, alignment> {
};

template <typename U, typename... Args>
void construct(U* /*ptr*/, Args&&... /*args*/) {} // nothing
void construct(U* ptr, Args&&... args) {
::new ((void*)ptr) U(std::forward<Args>(args)...);
}
template <typename U>
void construct(U* /*ptr*/) noexcept(std::is_nothrow_default_constructible<U>::value) {}
};
Expand All @@ -104,8 +106,8 @@ std::vector<T, MknGPUAllocator<T, align>>& as_super(std::vector<T, ManagedAlloca
return *reinterpret_cast<std::vector<T, MknGPUAllocator<T, align>>*>(&v);
}

template <typename T, typename Size>
void copy(T* dst, T* src, Size size) {
template <typename T0, typename T1, typename Size>
void copy(T0* dst, T1* src, Size const size) {
assert(dst and src);

Pointer src_p{src};
Expand Down Expand Up @@ -135,7 +137,7 @@ auto& reserve(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t con
v.reserve(s);
return v;
}
std::vector<T, NoConstructAllocator<T, align>> cpy{NoConstructAllocator<T, align>{}};
std::vector<T, NoConstructAllocator<T, align>> cpy(NoConstructAllocator<T, align>{});
cpy.reserve(s);
cpy.resize(v.size());
if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
Expand All @@ -150,7 +152,7 @@ auto& resize(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t cons
v.resize(s);
return v;
}
std::vector<T, NoConstructAllocator<T, align>> cpy{NoConstructAllocator<T, align>{}};
std::vector<T, NoConstructAllocator<T, align>> cpy(NoConstructAllocator<T, align>{});
cpy.resize(s);
if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
v = std::move(cpy);
Expand Down
35 changes: 15 additions & 20 deletions inc/mkn/gpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,15 @@ struct dim3 {
std::size_t x = 1, y = 1, z = 1;
};

void setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/

void setDevice(std::size_t const& /*dev*/) {} /*noop*/

auto supportsCooperativeLaunch(int const /*dev*/ = 0) {
int supportsCoopLaunch = 0;
return supportsCoopLaunch;
}
void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/

struct Stream {
Stream() {}
~Stream() {}

auto& operator()() { return stream; };

void sync() {}

std::size_t stream = 0;
Expand All @@ -107,18 +101,16 @@ struct StreamEvent {
StreamEvent(Stream&) {}
~StreamEvent() {}

auto& operator()() { return event; };
auto& record() {
++stage;
auto& operator()(std::function<void()> fn = {}) {
fn();
return *this;
}
auto& wait() { return *this; }
bool finished() const { return stage == 2; }
void reset() { stage = 0; }

bool finished() const { return fin; }

Stream stream;
std::size_t event = 0;
std::uint16_t stage = 0;
bool fin = 1;
std::function<void()> _fn;
};

template <typename T>
Expand Down Expand Up @@ -220,7 +212,7 @@ namespace detail {
static thread_local std::size_t idx = 0;
}

template <bool _sync = true, bool _coop = false, typename F, typename... Args>
template <bool _sync = true, typename F, typename... Args>
void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
KLOG(TRC) << N;
Expand Down Expand Up @@ -264,7 +256,7 @@ struct GLauncher : public Launcher {
std::size_t count;
};

void prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }

} // namespace MKN_GPU_NS

Expand All @@ -284,9 +276,12 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) {
if (auto i = mkn::gpu::cpu::idx(); i < s) f(args...);
}

#include "launchers.hpp"
template <typename F, typename... Args>
static void global_d_kernel(F& f, Args... args) {
f(args...);
}

void grid_sync() {}
#include "launchers.hpp"

} /* namespace MKN_GPU_NS */

Expand Down
100 changes: 36 additions & 64 deletions inc/mkn/gpu/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"

#include "mkn/gpu/def.hpp"
#include "mkn/gpu/cli.hpp"

#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include "mkn/gpu/def.hpp"

//

#define MKN_GPU_ASSERT(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
#define MKN_GPU_ASSERT(ans) \
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
Expand Down Expand Up @@ -84,17 +86,11 @@ __device__ SIZE block_idx_x() {

namespace MKN_GPU_NS {

void setLimitMallocHeapSize(std::size_t const& bytes) {
void inline setLimitMallocHeapSize(std::size_t const& bytes) {
MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
}

void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }

auto supportsCooperativeLaunch(int const dev = 0) {
int supportsCoopLaunch = 0;
MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
return supportsCoopLaunch;
}
void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }

struct Stream {
Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
Expand All @@ -108,56 +104,39 @@ struct Stream {
cudaStream_t stream;
};

struct StreamEvent {
StreamEvent(Stream& stream_) : stream{stream_} { reset(); }
~StreamEvent() { clear(); }

StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} {
that.start = nullptr;
that.stop = nullptr;
}
//

struct StreamEvent {
//
StreamEvent(Stream& stream_) : stream{stream_} {}
StreamEvent(StreamEvent&& that) = default;
StreamEvent(StreamEvent const&) = delete;
StreamEvent& operator=(StreamEvent const&) = delete;

auto& operator()() { return stop; };
auto& record() {
if (stage == 0) {
MKN_GPU_ASSERT(result = cudaEventRecord(start, stream()));
++stage;
} else {
MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream()));
++stage;
}
return *this;
}
auto& wait() {
if (stage == 0) {
MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), start));
} else {
MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), stop));
}
auto& operator()(std::function<void()> fn = {}) {
fin = 0;
_fn = fn;
MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
return *this;
}

void clear() {
if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start));
if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop));
}
bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; }
void reset() {
clear();
MKN_GPU_ASSERT(result = cudaEventCreate(&start));
MKN_GPU_ASSERT(result = cudaEventCreate(&stop));
stage = 0;
static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) {
auto& self = *reinterpret_cast<StreamEvent*>(ptr);
self._fn();
self._fn = [] {};
self.fin = 1;
}

bool finished() const { return fin; }

Stream& stream;
cudaError_t result;
cudaEvent_t start = nullptr, stop = nullptr;
std::uint16_t stage = 0;
std::function<void()> _fn;
bool fin = 0;
};

//

template <typename T>
struct Pointer {
Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); }
Expand Down Expand Up @@ -264,19 +243,13 @@ void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(str
#include "mkn/gpu/alloc.hpp"
#include "mkn/gpu/device.hpp"

template <bool _sync = true, bool _coop = false, typename F, typename... Args>
template <bool _sync = true, typename F, typename... Args>
void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
KLOG(TRC) << N;
std::apply(
[&](auto&&... params) {
if constexpr (_coop) {
auto address_of = [](auto& a) { return (void*)&a; };
void* kernelArgs[] = {(address_of(params), ...)};
cudaLaunchCooperativeKernel((void*)f, g, b, kernelArgs, ds);
} else {
f<<<g, b, ds, s>>>(params...);
}
f<<<g, b, ds, s>>>(params...);
MKN_GPU_ASSERT(cudaGetLastError());
},
devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
Expand Down Expand Up @@ -331,6 +304,11 @@ __global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
}

template <typename F, typename... Args>
__global__ static void global_d_kernel(F f, Args... args) {
f(args...);
}

#include "launchers.hpp"

template <typename T, typename V>
Expand Down Expand Up @@ -373,14 +351,8 @@ void print_gpu_mem_used() {
total_t, total_m, used_m);
}

__device__ void grid_sync() {
namespace cg = cooperative_groups;
cg::grid_group grid = cg::this_grid();
assert(grid.is_valid());
grid.sync();
}

} // namespace MKN_GPU_NS

#undef MKN_GPU_ASSERT

#endif /* _MKN_GPU_CUDA_HPP_ */
4 changes: 2 additions & 2 deletions inc/mkn/gpu/def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ static constexpr bool is_floating_point_v =
#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ */

#ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 10
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 1
#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ */

#ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 100
#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 25
#endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ */

} /* namespace mkn::gpu */
Expand Down
17 changes: 11 additions & 6 deletions inc/mkn/gpu/defines.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
#ifndef _MKN_GPU_DEFINES_HPP_
#define _MKN_GPU_DEFINES_HPP_

#if !defined(MKN_GPU_FN_PER_NS)
#define MKN_GPU_FN_PER_NS 0
#endif

#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
#define MKN_GPU_ROCM 1
#endif
Expand All @@ -19,18 +15,27 @@
#define MKN_GPU_CUDA 0
#endif

#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && !defined(MKN_GPU_FN_PER_NS)
#define MKN_GPU_FN_PER_NS 1
#endif

#if !defined(MKN_GPU_FN_PER_NS)
#define MKN_GPU_FN_PER_NS 0
#endif

#if MKN_GPU_ROCM == 1
#include "mkn/gpu/rocm.hpp"
#endif

#if MKN_GPU_CUDA
#if MKN_GPU_CUDA == 1
#include "mkn/gpu/cuda.hpp"
#endif

#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU)
#pragma message("mkn.gpu error: No accelerator found, defaulting to CPU IMP")
#define MKN_GPU_CPU 1
#endif

#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
#include "mkn/gpu/cpu.hpp"
#endif
Expand Down
Loading

0 comments on commit 9924da7

Please sign in to comment.