host group mutex etc (#23)

mkn · Dec 8, 2024 · 9924da7 · 9924da7
1 parent 0a5fdf2
commit 9924da7
Show file tree

Hide file tree

Showing 11 changed files with 455 additions and 274 deletions.
diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/alloc.hpp
@@ -79,7 +79,9 @@ class NoConstructAllocator : public MknGPUAllocator<T, alignment> {
   };
 
   template <typename U, typename... Args>
-  void construct(U* /*ptr*/, Args&&... /*args*/) {}  // nothing
+  void construct(U* ptr, Args&&... args) {
+    ::new ((void*)ptr) U(std::forward<Args>(args)...);
+  }
   template <typename U>
   void construct(U* /*ptr*/) noexcept(std::is_nothrow_default_constructible<U>::value) {}
 };
@@ -104,8 +106,8 @@ std::vector<T, MknGPUAllocator<T, align>>& as_super(std::vector<T, ManagedAlloca
   return *reinterpret_cast<std::vector<T, MknGPUAllocator<T, align>>*>(&v);
 }
 
-template <typename T, typename Size>
-void copy(T* dst, T* src, Size size) {
+template <typename T0, typename T1, typename Size>
+void copy(T0* dst, T1* src, Size const size) {
   assert(dst and src);
 
   Pointer src_p{src};
@@ -135,7 +137,7 @@ auto& reserve(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t con
     v.reserve(s);
     return v;
   }
-  std::vector<T, NoConstructAllocator<T, align>> cpy{NoConstructAllocator<T, align>{}};
+  std::vector<T, NoConstructAllocator<T, align>> cpy(NoConstructAllocator<T, align>{});
   cpy.reserve(s);
   cpy.resize(v.size());
   if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
@@ -150,7 +152,7 @@ auto& resize(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t cons
     v.resize(s);
     return v;
   }
-  std::vector<T, NoConstructAllocator<T, align>> cpy{NoConstructAllocator<T, align>{}};
+  std::vector<T, NoConstructAllocator<T, align>> cpy(NoConstructAllocator<T, align>{});
   cpy.resize(s);
   if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
   v = std::move(cpy);

diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp
@@ -83,21 +83,15 @@ struct dim3 {
   std::size_t x = 1, y = 1, z = 1;
 };
 
-void setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
+void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
 
-void setDevice(std::size_t const& /*dev*/) {} /*noop*/
-
-auto supportsCooperativeLaunch(int const /*dev*/ = 0) {
-  int supportsCoopLaunch = 0;
-  return supportsCoopLaunch;
-}
+void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/
 
 struct Stream {
   Stream() {}
   ~Stream() {}
 
   auto& operator()() { return stream; };
-
   void sync() {}
 
   std::size_t stream = 0;
@@ -107,18 +101,16 @@ struct StreamEvent {
   StreamEvent(Stream&) {}
   ~StreamEvent() {}
 
-  auto& operator()() { return event; };
-  auto& record() {
-    ++stage;
+  auto& operator()(std::function<void()> fn = {}) {
+    fn();
     return *this;
   }
-  auto& wait() { return *this; }
-  bool finished() const { return stage == 2; }
-  void reset() { stage = 0; }
+
+  bool finished() const { return fin; }
 
   Stream stream;
-  std::size_t event = 0;
-  std::uint16_t stage = 0;
+  bool fin = 1;
+  std::function<void()> _fn;
 };
 
 template <typename T>
@@ -220,7 +212,7 @@ namespace detail {
 static thread_local std::size_t idx = 0;
 }
 
-template <bool _sync = true, bool _coop = false, typename F, typename... Args>
+template <bool _sync = true, typename F, typename... Args>
 void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
@@ -264,7 +256,7 @@ struct GLauncher : public Launcher {
   std::size_t count;
 };
 
-void prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
+void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
 
 }  // namespace MKN_GPU_NS
 
@@ -284,9 +276,12 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) {
   if (auto i = mkn::gpu::cpu::idx(); i < s) f(args...);
 }
 
-#include "launchers.hpp"
+template <typename F, typename... Args>
+static void global_d_kernel(F& f, Args... args) {
+  f(args...);
+}
 
-void grid_sync() {}
+#include "launchers.hpp"
 
 } /* namespace MKN_GPU_NS */
 

diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp
@@ -39,15 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
+#include "mkn/gpu/def.hpp"
 #include "mkn/gpu/cli.hpp"
+
 #include <cuda_runtime.h>
-#include <cooperative_groups.h>
-#include "mkn/gpu/def.hpp"
 
 //
 
-#define MKN_GPU_ASSERT(ans) \
-  { gpuAssert((ans), __FILE__, __LINE__); }
+#define MKN_GPU_ASSERT(ans)               \
+  {                                       \
+    gpuAssert((ans), __FILE__, __LINE__); \
+  }
 inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
   if (code != cudaSuccess) {
     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
@@ -84,17 +86,11 @@ __device__ SIZE block_idx_x() {
 
 namespace MKN_GPU_NS {
 
-void setLimitMallocHeapSize(std::size_t const& bytes) {
+void inline setLimitMallocHeapSize(std::size_t const& bytes) {
   MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
 }
 
-void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
-
-auto supportsCooperativeLaunch(int const dev = 0) {
-  int supportsCoopLaunch = 0;
-  MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
-  return supportsCoopLaunch;
-}
+void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
 
 struct Stream {
   Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
@@ -108,56 +104,39 @@ struct Stream {
   cudaStream_t stream;
 };
 
-struct StreamEvent {
-  StreamEvent(Stream& stream_) : stream{stream_} { reset(); }
-  ~StreamEvent() { clear(); }
-
-  StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} {
-    that.start = nullptr;
-    that.stop = nullptr;
-  }
+//
 
+struct StreamEvent {
+  //
+  StreamEvent(Stream& stream_) : stream{stream_} {}
+  StreamEvent(StreamEvent&& that) = default;
   StreamEvent(StreamEvent const&) = delete;
   StreamEvent& operator=(StreamEvent const&) = delete;
 
-  auto& operator()() { return stop; };
-  auto& record() {
-    if (stage == 0) {
-      MKN_GPU_ASSERT(result = cudaEventRecord(start, stream()));
-      ++stage;
-    } else {
-      MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream()));
-      ++stage;
-    }
-    return *this;
-  }
-  auto& wait() {
-    if (stage == 0) {
-      MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), start));
-    } else {
-      MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), stop));
-    }
+  auto& operator()(std::function<void()> fn = {}) {
+    fin = 0;
+    _fn = fn;
+    MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
     return *this;
   }
 
-  void clear() {
-    if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start));
-    if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop));
-  }
-  bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; }
-  void reset() {
-    clear();
-    MKN_GPU_ASSERT(result = cudaEventCreate(&start));
-    MKN_GPU_ASSERT(result = cudaEventCreate(&stop));
-    stage = 0;
+  static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) {
+    auto& self = *reinterpret_cast<StreamEvent*>(ptr);
+    self._fn();
+    self._fn = [] {};
+    self.fin = 1;
   }
 
+  bool finished() const { return fin; }
+
   Stream& stream;
   cudaError_t result;
-  cudaEvent_t start = nullptr, stop = nullptr;
-  std::uint16_t stage = 0;
+  std::function<void()> _fn;
+  bool fin = 0;
 };
 
+//
+
 template <typename T>
 struct Pointer {
   Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); }
@@ -264,19 +243,13 @@ void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(str
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <bool _sync = true, bool _coop = false, typename F, typename... Args>
+template <bool _sync = true, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
       [&](auto&&... params) {
-        if constexpr (_coop) {
-          auto address_of = [](auto& a) { return (void*)&a; };
-          void* kernelArgs[] = {(address_of(params), ...)};
-          cudaLaunchCooperativeKernel((void*)f, g, b, kernelArgs, ds);
-        } else {
-          f<<<g, b, ds, s>>>(params...);
-        }
+        f<<<g, b, ds, s>>>(params...);
         MKN_GPU_ASSERT(cudaGetLastError());
       },
       devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
@@ -331,6 +304,11 @@ __global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
   if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
 }
 
+template <typename F, typename... Args>
+__global__ static void global_d_kernel(F f, Args... args) {
+  f(args...);
+}
+
 #include "launchers.hpp"
 
 template <typename T, typename V>
@@ -373,14 +351,8 @@ void print_gpu_mem_used() {
          total_t, total_m, used_m);
 }
 
-__device__ void grid_sync() {
-  namespace cg = cooperative_groups;
-  cg::grid_group grid = cg::this_grid();
-  assert(grid.is_valid());
-  grid.sync();
-}
-
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
+
 #endif /* _MKN_GPU_CUDA_HPP_ */
diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp
@@ -23,11 +23,11 @@ static constexpr bool is_floating_point_v =
 #endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_    */
 
 #ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_
-#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 10
+#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_ 1
 #endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_    */
 
 #ifndef _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_
-#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 100
+#define _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_ 25
 #endif /*_MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_    */
 
 } /* namespace mkn::gpu */

diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp
@@ -1,10 +1,6 @@
 #ifndef _MKN_GPU_DEFINES_HPP_
 #define _MKN_GPU_DEFINES_HPP_
 
-#if !defined(MKN_GPU_FN_PER_NS)
-#define MKN_GPU_FN_PER_NS 0
-#endif
-
 #if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
 #define MKN_GPU_ROCM 1
 #endif
@@ -19,18 +15,27 @@
 #define MKN_GPU_CUDA 0
 #endif
 
-#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
+#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && !defined(MKN_GPU_FN_PER_NS)
 #define MKN_GPU_FN_PER_NS 1
 #endif
 
+#if !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 0
+#endif
+
 #if MKN_GPU_ROCM == 1
 #include "mkn/gpu/rocm.hpp"
 #endif
 
-#if MKN_GPU_CUDA
+#if MKN_GPU_CUDA == 1
 #include "mkn/gpu/cuda.hpp"
 #endif
 
+#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU)
+#pragma message("mkn.gpu error: No accelerator found, defaulting to CPU IMP")
+#define MKN_GPU_CPU 1
+#endif
+
 #if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
 #include "mkn/gpu/cpu.hpp"
 #endif