mkn · PhilipDeegan · Aug 3, 2024 · Jul 12, 2024 · Jul 14, 2024 · Jul 17, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: test
       run: |
         curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix
         chmod +x mkn
-        KLOG=3 ./mkn clean build run -dtKOp cpu -a "-std=c++17" -g 0 test -W 9
+        KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++17" test -W 9
diff --git a/.sublime-project b/.sublime-project
@@ -9,7 +9,7 @@
 	{
 		"ClangFormat":
 		{
-			"binary": "clang-format-15",
+			"binary": "clang-format",
 			"format_on_save": true,
 			"style": "file"
 		},

diff --git a/LICENSE.md b/LICENSE.md
@@ -1,4 +1,4 @@
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu.hpp b/inc/mkn/gpu.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,27 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_HPP_
 #define _MKN_GPU_HPP_
 
-#if defined(MKN_GPU_ROCM)
-#include "mkn/gpu/rocm.hpp"
-#elif defined(MKN_GPU_CUDA)
-#include "mkn/gpu/cuda.hpp"
-#elif defined(MKN_GPU_CPU)
-#include "mkn/gpu/cpu.hpp"
-#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
-#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
-#endif
+#include "mkn/gpu/defines.hpp"
 
 namespace mkn::gpu {
 
+// Backend-neutral wrapper: forwards to the idx() of whichever backend is
+// enabled (hip when MKN_GPU_ROCM, cuda when MKN_GPU_CUDA, cpu when
+// MKN_GPU_CPU). Compilation stops with #error when none of the three
+// macros evaluates to true.
+// NOTE(review): the macros are now tested by value instead of with defined();
+// presumably mkn/gpu/defines.hpp gives each of them a 0/1 value - confirm.
 __device__ uint32_t idx() {
-#if defined(MKN_GPU_ROCM)
+#if MKN_GPU_ROCM
   return mkn::gpu::hip::idx();
-#elif defined(MKN_GPU_CUDA)
+
+#elif MKN_GPU_CUDA
   return mkn::gpu::cuda::idx();
-#elif defined(MKN_GPU_CPU)
+
+#elif MKN_GPU_CPU
   return mkn::gpu::cpu::idx();
+
 #else
 #error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
+
 #endif
 }
 

diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/alloc.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,9 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_ALLOC_HPP_
 #define _MKN_GPU_ALLOC_HPP_
 
-template <typename T, std::int32_t alignment = 32>
-class ManagedAllocator {
-  using This = ManagedAllocator<T, alignment>;
+template <typename T, std::int32_t alignment>
+class MknGPUAllocator {
+  using This = MknGPUAllocator<T, alignment>;
 
  public:
   using pointer = T*;
@@ -44,7 +44,7 @@ class ManagedAllocator {
 
   template <typename U>
   struct rebind {
-    using other = ManagedAllocator<U, alignment>;
+    using other = MknGPUAllocator<U, alignment>;
   };
 
   T* allocate(std::size_t const n) const {
@@ -70,20 +70,91 @@ class ManagedAllocator {
   }
 };
 
+// Allocator whose construct() overloads are intentionally empty, so a
+// container using it never runs element constructors through the allocator.
+// All other members come from the public base MknGPUAllocator<T, alignment>.
+// NOTE(review): presumably intended for element types whose storage is filled
+// afterwards by a host/device copy - confirm.
+template <typename T, std::int32_t alignment = 32>
+class NoConstructAllocator : public MknGPUAllocator<T, alignment> {
+ public:
+  // rebind maps to NoConstructAllocator<U>, hiding the rebind declared by the base
+  template <typename U>
+  struct rebind {
+    using other = NoConstructAllocator<U, alignment>;
+  };
+
+  // construction with arguments: deliberately does nothing
+  template <typename U, typename... Args>
+  void construct(U* /*ptr*/, Args&&... /*args*/) {}  // nothing
+  // default construction: deliberately does nothing
+  template <typename U>
+  void construct(U* /*ptr*/) noexcept(std::is_nothrow_default_constructible<U>::value) {}
+};
+
+// Views a NoConstructAllocator-backed vector as the equivalent vector over the
+// base MknGPUAllocator. No data is copied; the very same object is returned
+// under a different type.
+// NOTE(review): relies on both std::vector specialisations sharing one layout,
+// which the language does not guarantee - confirm this is acceptable.
+template <typename T, std::int32_t align>
+std::vector<T, MknGPUAllocator<T, align>>& as_super(
+    std::vector<T, NoConstructAllocator<T, align>>& v) {
+  using Super = std::vector<T, MknGPUAllocator<T, align>>;
+  return reinterpret_cast<Super&>(v);
+}
+
+// Allocator that adds nothing to MknGPUAllocator<T, alignment> except its own
+// rebind, and supplies the default alignment of 32 that the base template no
+// longer declares.
+template <typename T, std::int32_t alignment = 32>
+class ManagedAllocator : public MknGPUAllocator<T, alignment> {
+ public:
+  // rebind maps to ManagedAllocator<U>, hiding the rebind declared by the base
+  template <typename U>
+  struct rebind {
+    using other = ManagedAllocator<U, alignment>;
+  };
+};
+
+// Views a ManagedAllocator-backed vector as the equivalent vector over the
+// base MknGPUAllocator. No data is copied; the very same object is returned
+// under a different type.
+// NOTE(review): relies on both std::vector specialisations sharing one layout,
+// which the language does not guarantee - confirm this is acceptable.
+template <typename T, std::int32_t align>
+std::vector<T, MknGPUAllocator<T, align>>& as_super(std::vector<T, ManagedAllocator<T, align>>& v) {
+  using Super = std::vector<T, MknGPUAllocator<T, align>>;
+  return reinterpret_cast<Super&>(v);
+}
+
+// Copies `size` elements from src to dst, picking the transfer routine from
+// where each pointer lives (as reported by Pointer::is_host_ptr / is_device_ptr):
+//   host   -> host   : std::copy
+//   device -> device : copy_on_device
+//   host   -> device : send
+//   device -> host   : take
+// Any other combination throws std::runtime_error. Both pointers must be
+// non-null; this is checked by assert only.
+// NOTE(review): take() is called source-first, unlike send(); presumably its
+// signature is (device_src, host_dst, size) - confirm.
+// NOTE(review): src is no longer const-qualified, so passing a `T const*`
+// source together with a `T*` destination no longer deduces a single T -
+// confirm this is intended.
 template <typename T, typename Size>
-void copy(T* const dst, T const* const src, Size size) {
-  auto dst_p = Pointer{dst};
-  auto src_p = Pointer{src};
+void copy(T* dst, T* src, Size size) {
+  assert(dst and src);
 
-  bool to_send = dst_p.is_device_ptr() && src_p.is_host_ptr();
-  bool to_take = dst_p.is_host_ptr() && src_p.is_device_ptr();
+  Pointer src_p{src};
+  Pointer dst_p{dst};
 
-  if (to_send)
+  // the checks are lambdas, so only those reached in the chain below are evaluated
+  auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); };
+  auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); };
+  auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); };
+  auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); };
+
+  if (on_host())
+    std::copy(src, src + size, dst);
+  else if (on_device())
+    copy_on_device(dst, src, size);
+  else if (to_send())
     send(dst, src, size);
-  else if (to_take)
-    take(dst, src, size);
+  else if (to_take())
+    take(src, dst, size);
   else
     throw std::runtime_error("Unsupported operation (PR welcome)");
 }
 
+// Ensures `v` has capacity for at least `s` elements and returns `v`.
+// When the current capacity is too small, a fresh vector is built with the
+// requested capacity and the old size; the old elements are transferred with
+// copy() unless mem_copy is false or `v` is empty, and the new vector is then
+// moved into `v`.
+// NOTE(review): presumably needed because the allocator's empty construct()
+// means an ordinary reallocation would not carry the elements over - confirm.
+template <typename T, std::int32_t align>
+auto& reserve(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t const& s,
+              bool mem_copy = true) {
+  using Vec = std::vector<T, NoConstructAllocator<T, align>>;
+
+  if (s > v.capacity()) {
+    Vec grown{NoConstructAllocator<T, align>{}};
+    grown.reserve(s);
+    grown.resize(v.size());
+    if (mem_copy && !v.empty()) copy(grown.data(), v.data(), v.size());
+    v = std::move(grown);
+  } else {
+    v.reserve(s);
+  }
+  return v;
+}
+
+// Resizes `v` to `s` elements and returns `v`.
+// When `s` fits in the current capacity this is a plain v.resize(s).
+// Otherwise a fresh vector of size `s` is built, the old elements are
+// transferred with copy() unless mem_copy is false or `v` is empty, and the
+// new vector is moved into `v`.
+// NOTE(review): presumably needed because the allocator's empty construct()
+// means an ordinary reallocation would not carry the elements over - confirm.
+template <typename T, std::int32_t align>
+auto& resize(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t const& s,
+             bool mem_copy = true) {
+  using Vec = std::vector<T, NoConstructAllocator<T, align>>;
+
+  if (s > v.capacity()) {
+    Vec grown{NoConstructAllocator<T, align>{}};
+    grown.resize(s);
+    if (mem_copy && !v.empty()) copy(grown.data(), v.data(), v.size());
+    v = std::move(grown);
+  } else {
+    v.resize(s);
+  }
+  return v;
+}
+
 #endif /* _MKN_GPU_ALLOC_HPP_ */
diff --git a/inc/mkn/gpu/asio.hpp b/inc/mkn/gpu/asio.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp
@@ -0,0 +1,67 @@
+/**
+Copyright (c) 2024, Philip Deegan.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Philip Deegan nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+// IWYU pragma: private, include "mkn/gpu.hpp"
+#ifndef _MKN_GPU_CLI_HPP_
+#define _MKN_GPU_CLI_HPP_
+
+#include <cstdint>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
+#include "mkn/kul/env.hpp"
+
+namespace mkn::gpu {
+
+/**
+ * Run-time settings read from the environment, falling back to values
+ * reported by the device.
+ *
+ * Device is any type exposing a `maxThreadsPerBlock` member. The instance is
+ * held by reference, so it must outlive this object.
+ */
+template <typename Device>
+struct Cli {
+  /**
+   * Threads per block: the value of MKN_GPU_BX_THREADS parsed as a 32-bit
+   * integer when that variable exists, otherwise dev.maxThreadsPerBlock.
+   */
+  std::int32_t bx_threads() const {
+    char const* ENV = "MKN_GPU_BX_THREADS";
+    if (mkn::kul::env::EXISTS(ENV)) {
+      return as<std::int32_t>(mkn::kul::env::GET(ENV));
+    }
+    // explicit cast: both return paths yield the same type whatever integer
+    // type the device uses for the field
+    return static_cast<std::int32_t>(dev.maxThreadsPerBlock);
+  }
+
+  /**
+   * Converts `from` to T with stream extraction (operator>>).
+   * Returns a value-initialised T when nothing can be extracted.
+   */
+  template <typename T>
+  static T as(std::string const& from) {
+    // value-initialised: on an empty string extraction stops before writing
+    // anything, which previously left the result indeterminate
+    T t{};
+    std::stringstream ss(from);
+    ss >> t;
+    return t;
+  }
+
+  Device const& dev;
+};
+
+} /* namespace mkn::gpu */
+
+#endif /*_MKN_GPU_CLI_HPP_*/
diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/assert.hpp"
 #include "mkn/kul/threads.hpp"
 
+#include "mkn/gpu/cli.hpp"
 #include "mkn/gpu/def.hpp"
 
 #include <cassert>
@@ -82,6 +83,15 @@ struct dim3 {
   std::size_t x = 1, y = 1, z = 1;
 };
 
+// CPU backend counterpart of the heap-size limit call: the byte count is
+// ignored and nothing happens.
+// Marked inline like destroy() and sync() in this header, so that including
+// it from several translation units does not produce duplicate definitions.
+void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {
+  // noop
+}
+
+// CPU backend: always returns 0 (cooperative launch not supported); the
+// device index is ignored.
+// Marked inline like destroy() and sync() in this header, so that including
+// it from several translation units does not produce duplicate definitions.
+inline auto supportsCooperativeLaunch(int const /*dev*/ = 0) {
+  int supportsCoopLaunch = 0;
+  return supportsCoopLaunch;
+}
+
 struct Stream {
   Stream() {}
   ~Stream() {}
@@ -93,6 +103,19 @@ struct Stream {
   std::size_t stream = 0;
 };
 
+// CPU-backend placeholder for a stream event: record() and reset() do nothing
+// and finished() always reports true.
+struct StreamEvent {
+  StreamEvent(Stream&) {}  // the argument is ignored; `stream` below is default-constructed
+  ~StreamEvent() {}
+
+  auto& operator()() { return event; };  // mutable access to the stored event value
+  void record() { ; }                    // no-op
+  bool finished() const { return true; }  // always true
+  void reset() {}                        // no-op
+
+  Stream stream;
+  std::size_t event = 0;
+};
+
 template <typename T>
 struct Pointer {
   Pointer(T* _t) : t{_t} {}
@@ -129,7 +152,7 @@ void alloc_managed(T*& p, Size size) {
   MKN_GPU_ASSERT(p = reinterpret_cast<T*>(std::malloc(size * sizeof(T))));
 }
 
+// Emits a TRC log line, then releases `p` with std::free.
+// NOTE(review): inline was presumably added because this definition lives in
+// a header included from more than one translation unit - confirm.
-void destroy(void* p) {
+void inline destroy(void* p) {
   KLOG(TRC);
   std::free(p);
 }
@@ -146,6 +169,12 @@ void destroy_host(T*& p) {
   std::free(p);
 }
 
+// CPU backend version of a device-to-device copy: after a TRC log line, a
+// std::memcpy of size * sizeof(T) bytes from src to dst, wrapped in
+// MKN_GPU_ASSERT.
+// NOTE(review): Size can only be deduced from the third argument, so the
+// `= 1` default is unusable unless Size is named explicitly - confirm whether
+// a default template argument is wanted.
+template <typename T, typename Size>
+void copy_on_device(T* dst, T const* src, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T)));
+}
+
 template <typename Size>
 void send(void* p, void* t, Size size = 1) {
   KLOG(TRC);
@@ -177,7 +206,7 @@ void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) {
   take(p, span.data(), span.size(), start);
 }
 
+// No-op in the CPU backend: the body is empty.
-void sync() {}
+void inline sync() {}
 
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
@@ -186,7 +215,7 @@ namespace detail {
 static thread_local std::size_t idx = 0;
 }
 
-template <typename F, typename... Args>
+template <bool _sync = true, bool _coop = false, typename F, typename... Args>
 void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
@@ -252,6 +281,8 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) {
 
 #include "launchers.hpp"
 
+// No-op in the CPU backend: the body is empty.
+// Marked inline like destroy() and sync() in this header, so that including
+// it from several translation units does not produce duplicate definitions.
+void inline grid_sync() {}
+
 } /* namespace MKN_GPU_NS */
 
 #undef MKN_GPU_ASSERT