updates (#17)
* updates
PhilipDeegan authored Nov 12, 2023
1 parent eb6b1a2 commit 8b44c8c
Showing 13 changed files with 274 additions and 86 deletions.
16 changes: 16 additions & 0 deletions inc/mkn/gpu/alloc.hpp
@@ -70,4 +70,20 @@ class ManagedAllocator {
}
};

template <typename T, typename Size>
void copy(T* const dst, T const* const src, Size size) {
auto dst_p = Pointer{dst};
auto src_p = Pointer{src};

bool to_send = dst_p.is_device_ptr() && src_p.is_host_ptr();
bool to_take = dst_p.is_host_ptr() && src_p.is_device_ptr();

if (to_send)
send(dst, src, size);
else if (to_take)
take(dst, src, size);
else
throw std::runtime_error("Unsupported operation (PR welcome)");
}

#endif /* _MKN_GPU_ALLOC_HPP_ */
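A minimal usage sketch of the new copy() helper: it classifies both pointers through the backend's Pointer type and dispatches to send() for host-to-device or take() for device-to-host. The namespace, the buffer names, and the assumption that both pointers are classified as expected are illustrative, not part of this diff:

#include <cstddef>
#include <stdexcept>
#include "mkn/gpu/alloc.hpp"  // header changed above; a configured backend (cuda/rocm/cpu) is assumed

// "device" and "host" are assumed to be recognised as device- and host-resident
// by the active backend's Pointer class (e.g. a device allocation and pinned/registered host memory).
void round_trip(double* device, double* host, std::size_t n) {
  mkn::gpu::copy(device, host, n);  // device dst + host src  -> send()
  mkn::gpu::copy(host, device, n);  // host dst + device src  -> take()
  // Any other residency combination throws std::runtime_error("Unsupported operation (PR welcome)").
}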
10 changes: 5 additions & 5 deletions inc/mkn/gpu/asio.hpp
@@ -134,7 +134,7 @@ struct BatchMaker {
};

template <typename F, typename... Args>
__global__ static void global_kernel(F&& f, std::uint32_t max, std::uint32_t batch_index,
__global__ static void global_kernel(F f, std::uint32_t max, std::uint32_t batch_index,
Args... args) {
if (auto bi = mkn::gpu::idx(); bi < max) f(bi + (max * batch_index), args...);
}
@@ -146,7 +146,7 @@ class Launcher {
Launcher(std::size_t tpx, std::size_t n_batches_ = 1) : Launcher{dim3(), dim3(tpx), n_batches_} {}

template <typename F, typename Batch_t>
void launch(F&& f, Batch_t& batch) {
void launch(F& f, Batch_t& batch) {
auto const& streamSize = batch.streamSize;

g.x = streamSize / b.x;
@@ -185,13 +185,13 @@ class Launcher {
}

template <typename F, typename... PArgs, typename... Args>
void _launch(F&& f, std::tuple<PArgs&...>*, Stream& stream, std::uint32_t max,
void _launch(F& f, std::tuple<PArgs&...>*, Stream& stream, std::uint32_t max,
std::uint32_t offset, Args&&... args) {
MKN_GPU_NS::launch(global_kernel<F&&, PArgs...>, g, b, ds, stream(), f, max, offset, args...);
MKN_GPU_NS::launch(global_kernel<F, PArgs...>, g, b, ds, stream(), f, max, offset, args...);
}

template <typename F, typename... Args>
void launch(Stream& stream, F&& f, std::uint32_t max, std::uint32_t offset, Args&&... args) {
void launch(Stream& stream, F& f, std::uint32_t max, std::uint32_t offset, Args&&... args) {
_launch(f,
as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
stream, max, offset, args...);
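For reference, global_kernel above invokes the user functor as f(bi + (max * batch_index), args...), so the functor's first parameter is the batch-global element index. A hedged sketch of that functor shape; the struct, its members, and the data layout are illustrative, and __device__ comes from the GPU toolchain (or the empty shim in cpu.hpp):

#include <cstdint>

// Functor shape expected by global_kernel: first the batch-global index,
// then the arguments forwarded through Launcher::launch.
struct Scale {
  float factor;
  __device__ void operator()(std::uint32_t i, float* data) const {
    data[i] *= factor;  // i already includes the max * batch_index offset
  }
};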
36 changes: 25 additions & 11 deletions inc/mkn/gpu/cpu.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "mkn/gpu/def.hpp"

#include <cassert>
#include <cstring>

#define MKN_GPU_ASSERT(x) (KASSERT((x)))
@@ -64,14 +65,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define __host__
#define __global__

namespace mkn::gpu {
#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
namespace cpu {
#define MKN_GPU_NS mkn::gpu::cpu
#else
#define MKN_GPU_NS mkn::gpu
#endif // MKN_GPU_FN_PER_NS

namespace MKN_GPU_NS {

struct dim3 {
dim3() {}
dim3(std::size_t x) : x{x} {}
@@ -186,12 +187,15 @@ static thread_local std::size_t idx = 0;
}

template <typename F, typename... Args>
void launch(F&& f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
KLOG(TRC) << N;
std::apply(
[&](auto&&... params) {
for (std::size_t i = 0; i < N; ++i) f(params...);
for (std::size_t i = 0; i < N; ++i) {
f(params...);
detail::idx++;
}
},
devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));

@@ -217,28 +221,38 @@ struct Launcher {
};

struct GLauncher : public Launcher {
GLauncher(std::size_t s, [[maybe_unused]] size_t dev = 0) : Launcher{dim3{}, dim3{}} {
GLauncher(std::size_t s, [[maybe_unused]] size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
b.x = 1024;
g.x = s / b.x;
if ((s % b.x) > 0) ++g.x;
}

std::size_t count;
};

void prinfo([[maybe_unused]] std::size_t dev = 0) { KOUT(NON) << "Psuedo GPU in use"; }
void prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
} /* namespace cuda */
#endif // MKN_GPU_FN_PER_NS
} /* namespace mkn::gpu */
} // namespace MKN_GPU_NS

namespace mkn::gpu::cpu {

template <typename SIZE = std::uint32_t /*max 4294967296*/>
SIZE idx() {
return MKN_GPU_NS::detail::idx++;
return MKN_GPU_NS::detail::idx;
}

} // namespace mkn::gpu::cpu

namespace MKN_GPU_NS {

template <typename F, typename... Args>
static void global_gd_kernel(F& f, std::size_t s, Args... args) {
if (auto i = mkn::gpu::cpu::idx(); i < s) f(args...);
}

#include "launchers.hpp"

} /* namespace MKN_GPU_NS */

#undef MKN_GPU_ASSERT
#endif /* _MKN_PSUEDO_GPU_HPP_ */
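With this change the CPU backend bumps detail::idx once per simulated thread inside launch(), and mkn::gpu::cpu::idx() only reads it, matching the CUDA/HIP behaviour of reporting the current thread index. A rough sketch of a launch on this pseudo-GPU path, assuming the default namespaces (MKN_GPU_FN_PER_NS unset) and that a raw pointer passes through replace()/devmem_replace() unchanged:

#include <cstdint>
#include <vector>
#include "mkn/gpu/cpu.hpp"  // pseudo-GPU backend edited above; normally selected with -DMKN_GPU_CPU

int main() {
  std::vector<std::uint32_t> out(64, 0);

  // GDLauncher (pulled in through launchers.hpp) sizes the grid from the element count;
  // global_gd_kernel guards idx() < count before calling the functor.
  mkn::gpu::GDLauncher{out.size()}(
      [](std::uint32_t* o) { o[mkn::gpu::cpu::idx()] = 1u; }, out.data());

  // Each iteration of the launch loop increments detail::idx after the call, so the
  // functor sees its own simulated thread index rather than a post-incremented counter.
}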
21 changes: 13 additions & 8 deletions inc/mkn/gpu/cuda.hpp
@@ -54,14 +54,14 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort =
}
}

namespace mkn::gpu {
#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
namespace cuda {
#define MKN_GPU_NS mkn::gpu::cuda
#else
#define MKN_GPU_NS mkn::gpu
#endif // MKN_GPU_FN_PER_NS

namespace MKN_GPU_NS {

struct Stream {
Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); }
@@ -210,7 +210,7 @@ struct Launcher {

template <typename F, typename... Args>
void operator()(F&& f, Args&&... args) {
launch(f, g, b, ds, s, args...);
launch(std::forward<F>(f), g, b, ds, s, args...);
}

size_t ds = 0 /*dynamicShared*/;
@@ -219,17 +219,25 @@
};

struct GLauncher : public Launcher {
GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}} {
GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
[[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);

b.x = devProp.maxThreadsPerBlock;
g.x = s / b.x;
if ((s % b.x) > 0) ++g.x;
}

std::size_t count = 0;
cudaDeviceProp devProp;
};

template <typename F, typename... Args>
__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
}

#include "launchers.hpp"

template <typename T, typename V>
__global__ void _vector_fill(T* a, V t, std::size_t s) {
if (auto i = mkn::gpu::cuda::idx(); i < s) a[i] = t;
@@ -259,10 +267,7 @@ void prinfo(size_t dev = 0) {
KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;
}

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
} /* namespace cuda */
#endif // MKN_GPU_FN_PER_NS
} /* namespace mkn::gpu */
} // namespace MKN_GPU_NS

#undef MKN_GPU_ASSERT
#endif /* _MKN_GPU_CUDA_HPP_ */
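The new global_gd_kernel plus the shared launchers.hpp give the CUDA backend the grid-dimensioned GDLauncher, with the recorded count guarding the ragged last block. A hedged usage sketch; it assumes ManagedAllocator (alloc.hpp) lives in mkn::gpu and allocates CUDA managed (unified) memory, that a raw pointer passes through replace() unchanged, that sync() mirrors the hipDeviceSynchronize wrapper visible in rocm.hpp below, and that nvcc is run with --extended-lambda:

#include <vector>
#include "mkn/gpu/cuda.hpp"

template <typename T>
using ManagedVector = std::vector<T, mkn::gpu::ManagedAllocator<T>>;  // assumed unified memory

int main() {
  ManagedVector<float> v(1 << 20, 0.f);

  // global_gd_kernel checks mkn::gpu::cuda::idx() < count before invoking the lambda,
  // so the last partially filled block is handled automatically.
  mkn::gpu::GDLauncher{v.size()}(
      [=] __device__(float* a) { a[mkn::gpu::cuda::idx()] = 3.f; }, v.data());

  mkn::gpu::sync();  // device-wide synchronisation (assumed analogue of the HIP wrapper)
}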
57 changes: 57 additions & 0 deletions inc/mkn/gpu/launchers.hpp
@@ -0,0 +1,57 @@
/**
Copyright (c) 2020, Philip Deegan.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Philip Deegan nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _MKN_GPU_LAUNCHERS_HPP_
#define _MKN_GPU_LAUNCHERS_HPP_

struct GDLauncher : public GLauncher {
GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
_launch(std::forward<F>(f),
as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
count, args...);
}

protected:
template <std::size_t... I, typename... Args>
auto as_values(std::tuple<Args&...>&& tup, std::index_sequence<I...>) {
using T = std::tuple<decltype(MKN_GPU_NS::replace(std::get<I>(tup)))&...>*;
return T{nullptr};
}

template <typename F, typename... PArgs, typename... Args>
void _launch(F&& f, std::tuple<PArgs&...>*, Args&&... args) {
MKN_GPU_NS::launch(&global_gd_kernel<F, PArgs...>, g, b, ds, s, f, args...);
}
};

#endif /* _MKN_GPU_LAUNCHERS_HPP_ */
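A note on the as_values()/_launch() pair above: no tuple is ever constructed. as_values returns a null pointer whose type alone records the post-replace() parameter types, and _launch pattern-matches that type to instantiate the kernel. Below is a stand-alone illustration of the same type-only dispatch idiom, with replace() stubbed out since the real one belongs to the library:

#include <cstddef>
#include <cstdio>
#include <tuple>
#include <utility>

// Stand-in for MKN_GPU_NS::replace (the real one maps containers to device pointers, etc.).
template <typename T>
T& replace(T& t) { return t; }

template <typename... PArgs, typename... Args>
void dispatch(std::tuple<PArgs&...>*, Args&&...) {
  // PArgs is recovered purely from the pointer's type; no tuple object ever existed.
  std::printf("kernel would be instantiated for %zu parameter(s)\n", sizeof...(PArgs));
}

template <std::size_t... I, typename... Args>
auto as_values(std::tuple<Args&...>&& tup, std::index_sequence<I...>) {
  using T = std::tuple<decltype(replace(std::get<I>(tup)))&...>*;
  return T{nullptr};  // type-only payload, mirroring GDLauncher::as_values
}

template <typename... Args>
void launch_like(Args&&... args) {
  dispatch(as_values(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()),
           std::forward<Args>(args)...);
}

int main() {
  int x = 1;
  double y = 2.0;
  launch_like(x, y);  // prints: kernel would be instantiated for 2 parameter(s)
}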
50 changes: 34 additions & 16 deletions inc/mkn/gpu/rocm.hpp
@@ -52,14 +52,14 @@ inline void gpuAssert(hipError_t code, const char* file, int line, bool abort =
}
}

namespace mkn::gpu {
#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
namespace hip {
#define MKN_GPU_NS mkn::gpu::hip
#else
#define MKN_GPU_NS mkn::gpu
#endif // MKN_GPU_FN_PER_NS

namespace MKN_GPU_NS {

struct Stream {
Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); }
@@ -76,12 +76,10 @@ template <typename T>
struct Pointer {
Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); }

bool is_unregistered_ptr() const { return attributes.type == 0; }
bool is_host_ptr() const {
return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr);
}
bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; }
bool is_managed_ptr() const { return attributes.type == 3; }
// bool is_unregistered_ptr() const { return attributes.type == 0; }
bool is_host_ptr() const { return attributes.hostPointer != nullptr; }
bool is_device_ptr() const { return attributes.devicePointer != nullptr; }
bool is_managed_ptr() const { return attributes.isManaged; }

T* t;
hipPointerAttribute_t attributes;
@@ -173,7 +171,7 @@ void sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
#include "mkn/gpu/device.hpp"

template <typename F, typename... Args>
void launch(F f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
KLOG(TRC) << N;
std::apply(
@@ -191,8 +189,8 @@ struct Launcher {
: Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}

template <typename F, typename... Args>
void operator()(F f, Args&&... args) {
launch(f, g, b, ds, s, args...);
void operator()(F&& f, Args&&... args) {
launch(std::forward<F>(f), g, b, ds, s, args...);
}

size_t ds = 0 /*dynamicShared*/;
@@ -201,17 +199,40 @@
};

struct GLauncher : public Launcher {
GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}} {
GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
[[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);

b.x = devProp.maxThreadsPerBlock;
g.x = s / b.x;
if ((s % b.x) > 0) ++g.x;
}

std::size_t count = 0;
hipDeviceProp_t devProp;
};

template <typename F, typename... Args>
__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
if (auto i = mkn::gpu::hip::idx(); i < s) f(args...);
}

#include "launchers.hpp"

template <typename T, typename V>
__global__ void _vector_fill(T* a, V t, std::size_t s) {
if (auto i = mkn::gpu::hip::idx(); i < s) a[i] = t;
}

template <typename Container, typename T>
void fill(Container& c, size_t size, T val) {
GLauncher{c.size()}(_vector_fill<typename Container::value_type, T>, c.data(), val, size);
}

template <typename Container, typename T>
void fill(Container& c, T val) {
GLauncher{c.size()}(_vector_fill<typename Container::value_type, T>, c.data(), val, c.size());
}

// https://rocm-developer-tools.github.io/HIP/group__Device.html
void prinfo(size_t dev = 0) {
hipDeviceProp_t devProp;
Expand All @@ -226,10 +247,7 @@ void prinfo(size_t dev = 0) {
KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;
}

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
} /* namespace hip */
#endif // MKN_GPU_FN_PER_NS
} /* namespace mkn::gpu */
} // namespace MKN_GPU_NS

#undef MKN_GPU_ASSERT
#endif /* _MKN_GPU_ROCM_HPP_ */
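Alongside the attribute-based Pointer rework, the ROCm backend gains the same fill() helpers as CUDA, launching _vector_fill over the container through GLauncher. A short sketch, assuming ManagedAllocator (alloc.hpp) is visible as mkn::gpu::ManagedAllocator so the container's data pointer is valid in device code, and the default namespaces (MKN_GPU_FN_PER_NS unset):

#include <vector>
#include "mkn/gpu/rocm.hpp"

template <typename T>
using ManagedVector = std::vector<T, mkn::gpu::ManagedAllocator<T>>;  // assumed unified memory

int main() {
  ManagedVector<float> v(4096, 0.f);

  mkn::gpu::fill(v, 1.f);        // fill the whole container
  mkn::gpu::fill(v, 2048, 7.f);  // fill only the first 2048 elements

  mkn::gpu::sync();              // hipDeviceSynchronize wrapper shown above
}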
1 change: 1 addition & 0 deletions mkn.yaml
@@ -25,6 +25,7 @@ profile:
parent: headers
arg: -DMKN_GPU_CPU
test: test/any/(\w).cpp
test/cpu/(\w).cpp

- name: format
mod: |