cleanup

mkn · Nov 12, 2024 · d88605d · d88605d
1 parent 47e58b6
commit d88605d
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 121 deletions.
diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp
@@ -83,14 +83,9 @@ struct dim3 {
   std::size_t x = 1, y = 1, z = 1;
 };
 
-void setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
+void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
 
-void setDevice(std::size_t const& /*dev*/) {} /*noop*/
-
-auto supportsCooperativeLaunch(int const /*dev*/ = 0) {
-  int supportsCoopLaunch = 0;
-  return supportsCoopLaunch;
-}
+void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/
 
 struct Stream {
   Stream() {}
@@ -217,7 +212,7 @@ namespace detail {
 static thread_local std::size_t idx = 0;
 }
 
-template <bool _sync = true, bool _coop = false, typename F, typename... Args>
+template <bool _sync = true, typename F, typename... Args>
 void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
@@ -261,7 +256,7 @@ struct GLauncher : public Launcher {
   std::size_t count;
 };
 
-void prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
+void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
 
 }  // namespace MKN_GPU_NS
 
@@ -281,9 +276,12 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) {
   if (auto i = mkn::gpu::cpu::idx(); i < s) f(args...);
 }
 
-#include "launchers.hpp"
+template <typename F, typename... Args>
+static void global_d_kernel(F& f, std::size_t s, Args... args) {
+  f(args...);
+}
 
-void grid_sync() {}
+#include "launchers.hpp"
 
 } /* namespace MKN_GPU_NS */
 

diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp
@@ -39,10 +39,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
+#include "mkn/gpu/def.hpp"
 #include "mkn/gpu/cli.hpp"
+
 #include <cuda_runtime.h>
-#include <cooperative_groups.h>
-#include "mkn/gpu/def.hpp"
 
 //
 
@@ -86,17 +86,11 @@ __device__ SIZE block_idx_x() {
 
 namespace MKN_GPU_NS {
 
-void setLimitMallocHeapSize(std::size_t const& bytes) {
+void inline setLimitMallocHeapSize(std::size_t const& bytes) {
   MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
 }
 
-void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
-
-auto supportsCooperativeLaunch(int const dev = 0) {
-  int supportsCoopLaunch = 0;
-  MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
-  return supportsCoopLaunch;
-}
+void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
 
 struct Stream {
   Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
@@ -249,19 +243,13 @@ void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(str
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <bool _sync = true, bool _coop = false, typename F, typename... Args>
+template <bool _sync = true, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
       [&](auto&&... params) {
-        if constexpr (_coop) {
-          auto address_of = [](auto& a) { return (void*)&a; };
-          void* kernelArgs[] = {(address_of(params), ...)};
-          cudaLaunchCooperativeKernel((void*)f, g, b, kernelArgs, ds);
-        } else {
-          f<<<g, b, ds, s>>>(params...);
-        }
+        f<<<g, b, ds, s>>>(params...);
         MKN_GPU_ASSERT(cudaGetLastError());
       },
       devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
@@ -316,6 +304,11 @@ __global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
   if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
 }
 
+template <typename F, typename... Args>
+__global__ static void global_d_kernel(F f, std::size_t s, Args... args) {
+  f(args...);
+}
+
 #include "launchers.hpp"
 
 template <typename T, typename V>
@@ -358,14 +351,8 @@ void print_gpu_mem_used() {
          total_t, total_m, used_m);
 }
 
-__device__ void grid_sync() {
-  namespace cg = cooperative_groups;
-  cg::grid_group grid = cg::this_grid();
-  assert(grid.is_valid());
-  grid.sync();
-}
-
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
+
 #endif /* _MKN_GPU_CUDA_HPP_ */
diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp
@@ -1,10 +1,6 @@
 #ifndef _MKN_GPU_DEFINES_HPP_
 #define _MKN_GPU_DEFINES_HPP_
 
-#if !defined(MKN_GPU_FN_PER_NS)
-#define MKN_GPU_FN_PER_NS 0
-#endif
-
 #if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
 #define MKN_GPU_ROCM 1
 #endif
@@ -19,18 +15,27 @@
 #define MKN_GPU_CUDA 0
 #endif
 
-#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
+#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && !defined(MKN_GPU_FN_PER_NS)
 #define MKN_GPU_FN_PER_NS 1
 #endif
 
+#if !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 0
+#endif
+
 #if MKN_GPU_ROCM == 1
 #include "mkn/gpu/rocm.hpp"
 #endif
 
-#if MKN_GPU_CUDA
+#if MKN_GPU_CUDA == 1
 #include "mkn/gpu/cuda.hpp"
 #endif
 
+#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU)
+#pragma message("mkn.gpu error: No accelerator found, defaulting to CPU IMP")
+#define MKN_GPU_CPU 1
+#endif
+
 #if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
 #include "mkn/gpu/cpu.hpp"
 #endif

diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_LAUNCHERS_HPP_
 #define _MKN_GPU_LAUNCHERS_HPP_
 
-template <bool _sync = true, bool _coop = false>
+template <bool _sync = true>
 struct GDLauncher : public GLauncher {
   GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {}
 
@@ -58,27 +58,26 @@ struct GDLauncher : public GLauncher {
 
   template <typename S, typename F, typename... PArgs, typename... Args>
   void _launch(S& _s, F& f, std::tuple<PArgs&...>*, Args&&... args) {
-    MKN_GPU_NS::launch<_sync, _coop>(&global_gd_kernel<F, PArgs...>, g, b, ds, _s, f, args...);
+    MKN_GPU_NS::launch<_sync>(&global_gd_kernel<F, PArgs...>, g, b, ds, _s, f, args...);
   }
 };
 
-
-template <bool _sync = false, bool _coop = false>
+template <bool _sync = false>
 struct DLauncher : public Launcher {
-  DLauncher(size_t dev = 0) : Launcher{{}, {}} {}
+  DLauncher(size_t /*dev*/ = 0) : Launcher{{}, {}} {}
 
   template <typename F, typename... Args>
   auto operator()(F&& f, Args&&... args) {
     _launch(s, f,
             as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
-             args...);
+            args...);
   }
 
   template <typename F, typename... Args>
   auto stream(Stream& s, F&& f, Args&&... args) {
     _launch(s.stream, f,
             as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
-             args...);
+            args...);
   }
 
  protected:
@@ -90,9 +89,10 @@ struct DLauncher : public Launcher {
 
   template <typename S, typename F, typename... PArgs, typename... Args>
   void _launch(S& _s, F& f, std::tuple<PArgs&...>*, Args&&... args) {
-    MKN_GPU_NS::launch<_sync, _coop>(&global_d_kernel<F, PArgs...>, g, b, ds, _s, f, args...);
+    MKN_GPU_NS::launch<_sync>(&global_d_kernel<F, PArgs...>, g, b, ds, _s, f, args...);
   }
-};
 
+  //
+};
 
 #endif /* _MKN_GPU_LAUNCHERS_HPP_ */
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
@@ -37,13 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
-#include "mkn/gpu/cli.hpp"
-#include "hip/hip_runtime.h"
 #include "mkn/gpu/def.hpp"
+#include "mkn/gpu/cli.hpp"
 
-#include "hip/hip_cooperative_groups.h"
-
-// #define MKN_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
+#include "hip/hip_runtime.h"
 
 #define MKN_GPU_ASSERT(ans)               \
   {                                       \
@@ -81,18 +78,11 @@ __device__ SIZE idx() {
 
 namespace MKN_GPU_NS {
 
-void setLimitMallocHeapSize(std::size_t const& bytes) {
+void inline setLimitMallocHeapSize(std::size_t const& bytes) {
   MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
 }
 
-void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); }
-
-auto supportsCooperativeLaunch(int const dev = 0) {
-  int supportsCoopLaunch = 0;
-  MKN_GPU_ASSERT(
-      hipDeviceGetAttribute(&supportsCoopLaunch, hipDeviceAttributeCooperativeLaunch, dev));
-  return supportsCoopLaunch;
-}
+void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); }
 
 struct Stream {
   Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
@@ -259,19 +249,13 @@ void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(strea
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <bool _sync = true, bool _coop = false, typename F, typename... Args>
+template <bool _sync = true, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
       [&](auto&&... params) {
-        if constexpr (_coop) {
-          auto address_of = [](auto& a) { return (void*)&a; };
-          void* kernelArgs[] = {(address_of(params), ...)};
-          MKN_GPU_ASSERT(hipLaunchCooperativeKernel<F>(f, g, b, kernelArgs, ds, s));
-        } else {
-          hipLaunchKernelGGL(f, g, b, ds, s, params...);
-        }
+        hipLaunchKernelGGL(f, g, b, ds, s, params...);
         MKN_GPU_ASSERT(hipGetLastError());
       },
       devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
@@ -367,14 +351,8 @@ void print_gpu_mem_used() {
          total_t, total_m, used_m);
 }
 
-__device__ void grid_sync() {
-  namespace cg = cooperative_groups;
-  cg::grid_group grid = cg::this_grid();
-  assert(grid.is_valid());
-  grid.sync();
-}
-
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
+
 #endif /* _MKN_GPU_ROCM_HPP_ */
diff --git a/test/any/coop.cpp b/test/any/coop.cpp