From f3d35665cb255e6fea60f6fa7b89def871d0133e Mon Sep 17 00:00:00 2001
From: dekken
Date: Fri, 12 Jul 2024 22:28:32 +0200
Subject: [PATCH 1/7] betterment

---
 .github/workflows/build.yml  |   4 +-
 .sublime-project             |   2 +-
 LICENSE.md                   |   2 +-
 inc/mkn/gpu.hpp              |  22 ++---
 inc/mkn/gpu/alloc.hpp        |  97 ++++++++++++++++++---
 inc/mkn/gpu/asio.hpp         |   2 +-
 inc/mkn/gpu/cli.hpp          |  67 +++++++++++++++
 inc/mkn/gpu/cpu.hpp          |  39 ++++++++-
 inc/mkn/gpu/cuda.hpp         | 127 ++++++++++++++++++++-------
 inc/mkn/gpu/cuda/def.hpp     |  24 ------
 inc/mkn/gpu/def.hpp          |  12 ---
 inc/mkn/gpu/defines.hpp      |  53 ++++++++++++
 inc/mkn/gpu/device.hpp       |   5 +-
 inc/mkn/gpu/launchers.hpp    |  18 ++--
 inc/mkn/gpu/multi_launch.hpp | 160 +++++++++++++++++++++++++++++++++++
 inc/mkn/gpu/rocm.hpp         | 122 ++++++++++++++++++++++----
 inc/mkn/gpu/rocm/def.hpp     |  24 ------
 inc/mkn/gpu/tuple.hpp        |   2 +-
 mkn.yaml                     |   6 +-
 res/mkn/clang_cuda.yaml      |   5 ++
 res/mkn/hipcc.yaml           |   6 +-
 test/any/add.cpp             |  10 +++
 test/any/async_streaming.cpp |  45 ++++++++++
 test/any/construct.cpp       |  59 +++++++++++++
 test/any/coop.cpp            |  41 +++++++++
 test/any/managed.cpp         |   4 +
 26 files changed, 804 insertions(+), 154 deletions(-)
 create mode 100644 inc/mkn/gpu/cli.hpp
 delete mode 100644 inc/mkn/gpu/cuda/def.hpp
 create mode 100644 inc/mkn/gpu/defines.hpp
 create mode 100644 inc/mkn/gpu/multi_launch.hpp
 delete mode 100644 inc/mkn/gpu/rocm/def.hpp
 create mode 100644 test/any/async_streaming.cpp
 create mode 100644 test/any/construct.cpp
 create mode 100644 test/any/coop.cpp

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c5912d3..810b748 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: test
       run: |
         curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix
         chmod +x mkn
-        KLOG=3 ./mkn clean build run -dtKOp cpu -a "-std=c++17" -g 0 test -W 9
+        KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++17" test -W 9

diff --git a/.sublime-project b/.sublime-project
index 2fda01d..45091a2 100644
--- a/.sublime-project
+++ b/.sublime-project
@@ -9,7 +9,7 @@
     {
       "ClangFormat": {
-        "binary": "clang-format-15",
+        "binary": "clang-format",
         "format_on_save": true,
         "style": "file"
       },

diff --git a/LICENSE.md b/LICENSE.md
index baaa780..97bfc8f 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,4 +1,4 @@
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu.hpp b/inc/mkn/gpu.hpp
index 148c7f6..1f6c813 100644
--- a/inc/mkn/gpu.hpp
+++ b/inc/mkn/gpu.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,27 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_HPP_
 #define _MKN_GPU_HPP_
 
-#if defined(MKN_GPU_ROCM)
-#include "mkn/gpu/rocm.hpp"
-#elif defined(MKN_GPU_CUDA)
-#include "mkn/gpu/cuda.hpp"
-#elif defined(MKN_GPU_CPU)
-#include "mkn/gpu/cpu.hpp"
-#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
-#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
-#endif
+#include "mkn/gpu/defines.hpp"
 
 namespace mkn::gpu {
 
 __device__ uint32_t idx() {
-#if defined(MKN_GPU_ROCM)
+#if MKN_GPU_ROCM
   return mkn::gpu::hip::idx();
-#elif defined(MKN_GPU_CUDA)
+
+#elif MKN_GPU_CUDA
   return mkn::gpu::cuda::idx();
-#elif defined(MKN_GPU_CPU)
+
+#elif MKN_GPU_CPU
   return mkn::gpu::cpu::idx();
+
 #else
 #error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
+
 #endif
 }

diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/alloc.hpp
index b442240..da2441c 100644
--- a/inc/mkn/gpu/alloc.hpp
+++ b/inc/mkn/gpu/alloc.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,9 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_ALLOC_HPP_
 #define _MKN_GPU_ALLOC_HPP_
 
-template <typename T>
-class ManagedAllocator {
-  using This = ManagedAllocator<T>;
+template <typename T>
+class MknGPUAllocator {
+  using This = MknGPUAllocator<T>;
 
  public:
   using pointer = T*;
@@ -44,7 +44,7 @@ class ManagedAllocator {
   template <typename U>
   struct rebind {
-    using other = ManagedAllocator<U>;
+    using other = MknGPUAllocator<U>;
   };
 
   T* allocate(std::size_t const n) const {
@@ -70,20 +70,91 @@ class ManagedAllocator {
   }
 };
 
+template <typename T>
+class NoConstructAllocator : public MknGPUAllocator<T> {
+ public:
+  template <typename U>
+  struct rebind {
+    using other = NoConstructAllocator<U>;
+  };
+
+  template <typename U, typename... Args>
+  void construct(U* /*ptr*/, Args&&... /*args*/) {}  // nothing
+  template <typename U>
+  void construct(U* /*ptr*/) noexcept(std::is_nothrow_default_constructible<U>::value) {}
+};
+
+template <typename T>
+std::vector<T, MknGPUAllocator<T>>& as_super(
+    std::vector<T, NoConstructAllocator<T>>& v) {
+  return *reinterpret_cast<std::vector<T, MknGPUAllocator<T>>*>(&v);
+}
+
+template <typename T>
+class ManagedAllocator : public MknGPUAllocator<T> {
+ public:
+  template <typename U>
+  struct rebind {
+    using other = ManagedAllocator<U>;
+  };
+};
+
+template <typename T>
+std::vector<T, MknGPUAllocator<T>>& as_super(std::vector<T, ManagedAllocator<T>>& v) {
+  return *reinterpret_cast<std::vector<T, MknGPUAllocator<T>>*>(&v);
+}
+
 template <typename T, typename Size>
-void copy(T* const dst, T const* const src, Size size) {
-  auto dst_p = Pointer{dst};
-  auto src_p = Pointer{src};
+void copy(T* dst, T* src, Size size) {
+  assert(dst and src);
 
-  bool to_send = dst_p.is_device_ptr() && src_p.is_host_ptr();
-  bool to_take = dst_p.is_host_ptr() && src_p.is_device_ptr();
+  Pointer src_p{src};
+  Pointer dst_p{dst};
 
-  if (to_send)
+  auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); };
+  auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); };
+  auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); };
+  auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); };
+
+  if (on_host())
+    std::copy(src, src + size, dst);
+  else if (on_device())
+    copy_on_device(dst, src, size);
+  else if (to_send())
     send(dst, src, size);
-  else if (to_take)
-    take(dst, src, size);
+  else if (to_take())
+    take(src, dst, size);
   else
     throw std::runtime_error("Unsupported operation (PR welcome)");
 }
 
+template <typename T>
+auto& reserve(std::vector<T, NoConstructAllocator<T>>& v, std::size_t const& s,
+              bool mem_copy = true) {
+  if (s <= v.capacity()) {
+    v.reserve(s);
+    return v;
+  }
+  std::vector<T, NoConstructAllocator<T>> cpy{NoConstructAllocator<T>{}};
+  cpy.reserve(s);
+  cpy.resize(v.size());
+  if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
+  v = std::move(cpy);
+  return v;
+}
+
+template <typename T>
+auto& resize(std::vector<T, NoConstructAllocator<T>>& v, std::size_t const& s,
+             bool mem_copy = true) {
+  if (s <= v.capacity()) {
+    v.resize(s);
+    return v;
+  }
+  std::vector<T, NoConstructAllocator<T>> cpy{NoConstructAllocator<T>{}};
+  cpy.resize(s);
+  if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
+  v = std::move(cpy);
+  return v;
+}
+
 #endif /* _MKN_GPU_ALLOC_HPP_ */
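(For orientation: the point of NoConstructAllocator plus the free reserve()/resize() helpers above is to grow GPU-visible vectors without default-constructing every element on the host, while as_super() reinterprets the vector through the base allocator, which does construct. A minimal usage sketch — the element type and sizes are illustrative, and the template spellings follow the reconstruction above rather than a verified upstream source:

    std::vector<double, NoConstructAllocator<double>> v;
    resize(v, 1 << 20);             // grows storage; elements are left unconstructed
    reserve(v, 1 << 21);            // reallocates, then copy()s the old contents across
    as_super(v).emplace_back(0.0);  // base-allocator view: this element IS constructed

test/any/construct.cpp later in this patch asserts exactly this constructed/unconstructed split.)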
diff --git a/inc/mkn/gpu/asio.hpp b/inc/mkn/gpu/asio.hpp
index e9be973..c2795da 100644
--- a/inc/mkn/gpu/asio.hpp
+++ b/inc/mkn/gpu/asio.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp
new file mode 100644
index 0000000..bae675c
--- /dev/null
+++ b/inc/mkn/gpu/cli.hpp
@@ -0,0 +1,67 @@
+/**
+Copyright (c) 2024, Philip Deegan.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Philip Deegan nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +// IWYU pragma: private, include "mkn/gpu.hpp" +#ifndef _MKN_GPU_CLI_HPP_ +#define _MKN_GPU_CLI_HPP_ + +#include +#include + +#include "mkn/kul/env.hpp" + +namespace mkn::gpu { + +template +struct Cli { + // + + auto bx_threads() const { + char const* ENV = "MKN_GPU_BX_THREADS"; + if (mkn::kul::env::EXISTS(ENV)) { + return as(mkn::kul::env::GET(ENV)); + } + return dev.maxThreadsPerBlock; + } + + template + auto static as(std::string const& from) { + T t; + std::stringstream ss(from); + ss >> t; + return t; + } + + Device const& dev; +}; + +} /* namespace mkn::gpu */ + +#endif /*_MKN_GPU_CLI_HPP_*/ diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 8b34013..36de6f8 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "mkn/kul/assert.hpp" #include "mkn/kul/threads.hpp" +#include "mkn/gpu/cli.hpp" #include "mkn/gpu/def.hpp" #include @@ -82,6 +83,15 @@ struct dim3 { std::size_t x = 1, y = 1, z = 1; }; +void setLimitMallocHeapSize(std::size_t const& /*bytes*/) { + // noop +} + +auto supportsCooperativeLaunch(int const /*dev*/ = 0) { + int supportsCoopLaunch = 0; + return supportsCoopLaunch; +} + struct Stream { Stream() {} ~Stream() {} @@ -93,6 +103,19 @@ struct Stream { std::size_t stream = 0; }; +struct StreamEvent { + StreamEvent(Stream&) {} + ~StreamEvent() {} + + auto& operator()() { return event; }; + void record() { ; } + bool finished() const { return true; } + void reset() {} + + Stream stream; + std::size_t event = 0; +}; + template struct Pointer { Pointer(T* _t) : t{_t} {} @@ -129,7 +152,7 @@ void alloc_managed(T*& p, Size size) { MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); } -void destroy(void* p) { +void inline destroy(void* p) { KLOG(TRC); std::free(p); } @@ -146,6 +169,12 @@ void destroy_host(T*& p) { std::free(p); } +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); +} + template void send(void* p, void* t, Size size = 1) { KLOG(TRC); @@ -177,7 +206,7 @@ void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) { take(p, span.data(), span.size(), start); } -void sync() {} +void inline sync() {} #include "mkn/gpu/alloc.hpp" #include "mkn/gpu/device.hpp" @@ -186,7 +215,7 @@ namespace detail { static thread_local std::size_t idx = 0; } -template +template void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... 
args) { std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); KLOG(TRC) << N; @@ -252,6 +281,8 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) { #include "launchers.hpp" +void grid_sync() {} + } /* namespace MKN_GPU_NS */ #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index edd7794..dd96528 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - #include "mkn/kul/log.hpp" #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" #include "mkn/kul/assert.hpp" +#include "mkn/gpu/cli.hpp" +#include +#include #include "mkn/gpu/def.hpp" // #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess)) @@ -50,10 +51,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); + if (abort) std::abort(); } } +namespace mkn::gpu::cuda { + +template +__device__ SIZE idx() { + SIZE width = gridDim.x * blockDim.x; + SIZE height = gridDim.y * blockDim.y; + SIZE x = blockDim.x * blockIdx.x + threadIdx.x; + SIZE y = blockDim.y * blockIdx.y + threadIdx.y; + SIZE z = blockDim.z * blockIdx.z + threadIdx.z; + return x + (y * width) + (z * width * height); +} + +template +__device__ SIZE block_idx_x() { + return blockIdx.x; +} + +} // namespace mkn::gpu::cuda + +// + #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::cuda #else @@ -62,6 +84,16 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = namespace MKN_GPU_NS { +void setLimitMallocHeapSize(std::size_t const& bytes) { + MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); +} + +auto supportsCooperativeLaunch(int const dev = 0) { + int supportsCoopLaunch = 0; + MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev)); + return supportsCoopLaunch; +} + struct Stream { Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); } ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); } @@ -74,6 +106,23 @@ struct Stream { cudaStream_t stream; }; +struct StreamEvent { + StreamEvent(Stream& stream_) : stream{stream_} { reset(); } + ~StreamEvent() { /*MKN_GPU_ASSERT(result = cudaEventDestroy(event));*/ } + + auto& operator()() { return event; }; + void record() { MKN_GPU_ASSERT(result = cudaEventRecord(event, stream())); } + bool finished() const { return cudaEventQuery(event) == cudaSuccess; } + void reset() { + if (event) MKN_GPU_ASSERT(result = cudaEventDestroy(event)); + MKN_GPU_ASSERT(result = cudaEventCreate(&event)); + } + + Stream& stream; + cudaError_t result; + cudaEvent_t event = nullptr; +}; + template struct Pointer { Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } @@ -112,7 +161,7 @@ void alloc_managed(T*& p, Size size) { MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T))); } -void destroy(void* p) { +void inline destroy(void* p) { KLOG(TRC); MKN_GPU_ASSERT(cudaFree(p)); } @@ -129,6 +178,12 @@ void destroy_host(T*& ptr) { MKN_GPU_ASSERT(cudaFreeHost(ptr)); } +template +void copy_on_device(T* dst, T const* src, 
Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice));
+}
+
 template <typename Size>
 void send(void* p, void* t, Size size = 1) {
   KLOG(TRC);
@@ -147,23 +202,6 @@ void take(T const* p, T* t, Size size = 1, Size start = 0) {
   MKN_GPU_ASSERT(cudaMemcpy(t, p + start, size * sizeof(T), cudaMemcpyDeviceToHost));
 }
 
-template <typename T, typename Size>
-void copy(T* dst, T const* src, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  Pointer p{dst};
-  if (p.is_host_ptr())
-    take(src, dst, size, start);
-  else
-    send(dst, src, size, start);
-}
-
-template <typename T>
-void copy(std::vector<T>& dst, std::vector<T> const& src) {
-  KLOG(TRC);
-  assert(dst.size() >= src.size());
-  copy(dst.data(), src.data(), dst.size());
-}
-
 template <typename T, typename Size>
 void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) {
   KLOG(TRC);
@@ -185,19 +223,34 @@ void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
   stream()));
 }
 
-void sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
+void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
+void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); }
 
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <typename F, typename... Args>
+template <bool _sync = true, bool _coop = false, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
-      [&](auto&&... params) { f<<<g, b, ds, s>>>(params...); },
+      [&](auto&&... params) {
+        if constexpr (_coop) {
+          auto address_of = [](auto& a) { return (void*)&a; };
+          void* kernelArgs[] = {address_of(params)...};
+          cudaLaunchCooperativeKernel((void*)f, g, b, kernelArgs, ds, s);
+        } else {
+          f<<<g, b, ds, s>>>(params...);
+        }
+        MKN_GPU_ASSERT(cudaGetLastError());
+      },
       devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  sync();
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
 }
 
 //
@@ -219,16 +272,23 @@ struct Launcher {
 };
 
 struct GLauncher : public Launcher {
-  GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
-    [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
+  GLauncher(std::size_t const& s, std::size_t const& _dev = 0)
+      : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} {
+    MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev));
 
-    b.x = devProp.maxThreadsPerBlock;
+    resize(s);
+  }
+
+  void resize(std::size_t const& s, std::size_t const& bx = 0) {
+    b.x = bx > 0 ? bx : cli.bx_threads();
     g.x = s / b.x;
     if ((s % b.x) > 0) ++g.x;
   }
 
+  std::size_t dev = 0;
   std::size_t count = 0;
   cudaDeviceProp devProp;
+  mkn::gpu::Cli<cudaDeviceProp> cli{devProp};
 };
 
 template <typename Container, typename T>
 void fill(Container& c, T val) {
 }
 
 //
-void prinfo(size_t dev = 0) {
+void inline prinfo(size_t dev = 0) {
   cudaDeviceProp devProp;
   [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
   KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
   KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;
 }
 
+__device__ void grid_sync() {
+  namespace cg = cooperative_groups;
+  cg::grid_group grid = cg::this_grid();
+  assert(grid.is_valid());
+  grid.sync();
+}
+
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
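(The _coop branch above is what makes grid_sync() legal: a kernel may only call the cooperative-groups grid barrier when launched through cudaLaunchCooperativeKernel, and only on devices where supportsCooperativeLaunch() reports support. A sketch of the intended call shape, mirroring test/any/coop.cpp from later in this patch — the <true> template argument spelling is an assumption from the reconstructed GDLauncher declaration:

    ManagedVector<S> mem{NUM};
    auto* view = mem.data();
    if (mkn::gpu::supportsCooperativeLaunch()) {
      mkn::gpu::GDLauncher<true> launcher{NUM};  // assumed: _coop = true
      launcher([=] __device__() {
        auto i = mkn::gpu::idx();
        mkn::gpu::grid_sync();  // whole-grid barrier; cooperative launch only
        view[i].f0 = view[i].d0 + 1;
      });
    }

Note the test itself leaves the invocation commented out in main(), pending hardware that supports cooperative launch.)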
diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp
deleted file mode 100644
index 37abb46..0000000
--- a/inc/mkn/gpu/cuda/def.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-
-// IWYU pragma: private, include "mkn/gpu/def.hpp"
-
-#ifndef _MKN_GPU_CUDA_DEF_HPP_
-#define _MKN_GPU_CUDA_DEF_HPP_
-
-#include <cuda_runtime.h>
-
-namespace mkn::gpu::cuda {
-
-template <typename SIZE = uint32_t>
-__device__ SIZE idx() {
-  SIZE width = gridDim.x * blockDim.x;
-  SIZE height = gridDim.y * blockDim.y;
-
-  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
-  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
-  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
-  return x + (y * width) + (z * width * height);
-}
-
-}  // namespace mkn::gpu::cuda
-
-#endif /*_MKN_GPU_CUDA_DEF_HPP_*/

diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp
index da3a26c..d25689a 100644
--- a/inc/mkn/gpu/def.hpp
+++ b/inc/mkn/gpu/def.hpp
@@ -1,20 +1,8 @@
-
-
 #ifndef _MKN_GPU_DEF_HPP_
 #define _MKN_GPU_DEF_HPP_
 
 #include
 
-#if defined(MKN_GPU_ROCM)
-#include "mkn/gpu/rocm/def.hpp"
-#elif defined(MKN_GPU_CUDA)
-#include "mkn/gpu/cuda/def.hpp"
-#elif defined(MKN_GPU_CPU)
-
-#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
-#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
-#endif
-
 namespace mkn::gpu {
 
 #if defined(MKN_GPU_CPU)

diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp
new file mode 100644
index 0000000..6f98830
--- /dev/null
+++ b/inc/mkn/gpu/defines.hpp
@@ -0,0 +1,53 @@
+
+
+#ifndef _MKN_GPU_DEFINES_HPP_
+#define _MKN_GPU_DEFINES_HPP_
+
+#include
+
+#if !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 0
+#endif
+
+#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
+#define MKN_GPU_ROCM 1
+#endif
+#if !defined(MKN_GPU_ROCM)
+#define MKN_GPU_ROCM 0
+#endif
+
+#if !defined(MKN_GPU_CUDA) and __has_include(<cuda_runtime.h>)
+#define MKN_GPU_CUDA 1
+#endif
+#if !defined(MKN_GPU_CUDA)
+#define MKN_GPU_CUDA 0
+#endif
+
+#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
+#define MKN_GPU_FN_PER_NS 1
+#endif
+
+#if MKN_GPU_ROCM == 1
+#include "mkn/gpu/rocm.hpp"
+#endif
+
+#if MKN_GPU_CUDA
+#include "mkn/gpu/cuda.hpp"
+#endif
+
+#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
+#include "mkn/gpu/cpu.hpp"
+#endif
+
+namespace mkn::gpu {
+
+struct CompileFlags {
+  bool constexpr static withCUDA = MKN_GPU_CUDA;
+  bool constexpr static withROCM = MKN_GPU_ROCM;
+  bool constexpr static withCPU = !MKN_GPU_ROCM and !MKN_GPU_CUDA;
+  bool constexpr static perNamespace = MKN_GPU_FN_PER_NS;
+};
+
+} /* namespace mkn::gpu */
+
+#endif /*_MKN_GPU_DEFINES_HPP_*/

diff --git a/inc/mkn/gpu/device.hpp b/inc/mkn/gpu/device.hpp
index 7580180..04cc9e8 100644
--- a/inc/mkn/gpu/device.hpp
+++ b/inc/mkn/gpu/device.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
Redistribution and use in source and binary forms, with or without @@ -100,6 +100,9 @@ struct DeviceMem { auto& size() const { return s; } + auto* data() { return p; } + auto* data() const { return p; } + std::size_t s = 0; T* p = nullptr; bool owned = false; diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index adbd213..67bd510 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,12 +31,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_LAUNCHERS_HPP_ #define _MKN_GPU_LAUNCHERS_HPP_ +template struct GDLauncher : public GLauncher { GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {} template auto operator()(F&& f, Args&&... args) { - _launch(std::forward(f), + _launch(s, std::forward(f), + as_values(std::forward_as_tuple(args...), std::make_index_sequence()), + count, args...); + } + + template + auto stream(Stream& s, F&& f, Args&&... args) { + _launch(s.stream, std::forward(f), as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } @@ -48,9 +56,9 @@ struct GDLauncher : public GLauncher { return T{nullptr}; } - template - void _launch(F&& f, std::tuple*, Args&&... args) { - MKN_GPU_NS::launch(&global_gd_kernel, g, b, ds, s, f, args...); + template + void _launch(S& _s, F&& f, std::tuple*, Args&&... args) { + MKN_GPU_NS::launch<_sync, _coop>(&global_gd_kernel, g, b, ds, _s, f, args...); } }; diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp new file mode 100644 index 0000000..f50fac4 --- /dev/null +++ b/inc/mkn/gpu/multi_launch.hpp @@ -0,0 +1,160 @@ +/** +Copyright (c) 2024, Philip Deegan. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Philip Deegan nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#ifndef _MKN_GPU_MULTI_LAUNCH_HPP_
+#define _MKN_GPU_MULTI_LAUNCH_HPP_
+
+#include
+#include
+#include
+#include
+#include
+
+#include "mkn/gpu.hpp"
+
+namespace mkn::gpu {
+
+enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT };
+
+template <typename Strat>
+struct StreamFunction {
+  StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {}
+  virtual ~StreamFunction() {}
+  virtual void run(std::uint32_t const) {};
+
+  Strat& strat;
+  StreamFunctionMode mode;
+};
+
+template <typename Strat, typename Fn>
+struct StreamDeviceFunction : StreamFunction<Strat> {
+  using Super = StreamFunction<Strat>;
+  using Super::strat;
+
+  StreamDeviceFunction(Strat& strat, Fn&& fn_)
+      : Super{strat, StreamFunctionMode::DEVICE_WAIT}, fn{fn_} {}
+  void run(std::uint32_t const i) override {
+    mkn::gpu::GDLauncher{strat.datas[i].size()}.stream(
+        strat.streams[i], [=, fn = fn] __device__() mutable { fn(i); });
+  }
+
+  Fn fn;
+};
+
+template <typename Strat, typename Fn>
+struct StreamHostFunction : StreamFunction<Strat> {
+  using Super = StreamFunction<Strat>;
+  StreamHostFunction(Strat& strat, Fn&& fn_)
+      : Super{strat, StreamFunctionMode::HOST_WAIT}, fn{fn_} {}
+  void run(std::uint32_t const i) override { fn(i); }
+  Fn fn;
+};
+
+template <typename Datas>
+struct StreamLauncher {
+  using This = StreamLauncher<Datas>;
+  using T = typename Datas::value_type::value_type;
+
+  StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) {
+    for (auto& s : streams) events.emplace_back(s);
+  }
+
+  ~StreamLauncher() { sync(); }
+
+  void sync() noexcept {
+    for (auto& s : streams) s.sync();
+  }
+
+  template <typename Fn>
+  auto& dev(Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamDeviceFunction<This, Fn>>(self, std::forward<Fn>(fn)));
+    return self;
+  }
+  template <typename Fn>
+  auto& host(Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamHostFunction<This, Fn>>(self, std::forward<Fn>(fn)));
+    return self;
+  }
+
+  void operator()() {
+    using namespace std::chrono_literals;
+
+    if (fns.size() == 0) return;
+
+    for (std::size_t i = 0; i < datas.size(); ++i) self(i);
+
+    do {
+      for (std::size_t i = 0; i < datas.size(); ++i) {
+        if (is_finished(i)) continue;
+        if (is_fn_finished(i)) {
+          data_step[i] += 1;
+          if (not is_finished(i)) self(i);
+        }
+      }
+      std::this_thread::sleep_for(1ms);  // make sleep time configurable
+    } while (!is_finished());
+  }
+
+  void operator()(std::uint32_t const i) {
+    auto const& step = data_step[i];
+    fns[step]->run(i);
+    if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record();
+  }
+
+  bool is_finished() const {
+    std::uint32_t finished = 0;
+    for (std::size_t i = 0; i < datas.size(); ++i)
+      if (is_finished(i)) ++finished;
+    return finished == datas.size();
+  }
+
+  bool is_finished(std::uint32_t idx) const { return data_step[idx] == fns.size(); }
+
+  bool is_fn_finished(std::uint32_t i) {
+    auto const b = [&]() {
+      auto const& step = data_step[i];
+      if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true;
+      return events[i].finished();
+    }();
+    if (b) events[i].reset();
+    return b;
+  }
+
+  Datas& datas;
+  std::vector<std::shared_ptr<StreamFunction<This>>> fns;
+  std::vector<Stream> streams;
+  std::vector<StreamEvent> events;
+  std::vector<std::size_t> data_step;
+  This& self = *this;
+};
+
+}  // namespace mkn::gpu
+
+#endif /* _MKN_GPU_MULTI_LAUNCH_HPP_ */
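(StreamLauncher is a small per-dataset pipeline: each element of datas gets its own Stream, the chained .dev()/.host() steps advance independently per stream, and StreamEvent::finished() is polled to know when a device step has completed. The call shape, as exercised by test/any/async_streaming.cpp added below — types abridged, views being per-dataset raw pointers captured by value:

    std::vector<ManagedVector<double>> vecs(C, ManagedVector<double>(NUM, 0));
    StreamLauncher{vecs}
        .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; })
        .host([&](auto i) { /* CPU-side step for dataset i */ })
        .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })();
    // the trailing () runs the pipeline and returns once every dataset has
    // passed through every step; ~StreamLauncher then syncs all streams

)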
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index a09b5a6..6ca519c 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -32,15 +32,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_ROCM_HPP_
 #define _MKN_GPU_ROCM_HPP_
 
-#include "hip/hip_runtime.h"
-
 #include "mkn/kul/log.hpp"
 #include "mkn/kul/span.hpp"
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
+#include "mkn/gpu/cli.hpp"
+#include "hip/hip_runtime.h"
 #include "mkn/gpu/def.hpp"
 
+#include "hip/hip_cooperative_groups.h"
+
 //
 
 #define MKN_GPU_ASSERT(ans) \
   { gpuAssert((ans), __FILE__, __LINE__); }
 
 inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
   if (code != hipSuccess) {
     fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
-    if (abort) exit(code);
+    if (abort) std::abort();
   }
 }
 
+namespace mkn::gpu::hip {
+
+template <typename SIZE = uint32_t>
+__device__ SIZE idx() {
+  SIZE width = hipGridDim_x * hipBlockDim_x;
+  SIZE height = hipGridDim_y * hipBlockDim_y;
+
+  SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+  SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;
+  return x + (y * width) + (z * width * height);  // max 4294967296
+}
+
+}  // namespace mkn::gpu::hip
+
+//
+
 #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
 #define MKN_GPU_NS mkn::gpu::hip
 #else
@@ -60,6 +79,17 @@ inline void gpuAssert(hipError_t code, const char* file, int line, bool abort =
 namespace MKN_GPU_NS {
 
+void setLimitMallocHeapSize(std::size_t const& bytes) {
+  MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
+}
+
+auto supportsCooperativeLaunch(int const dev = 0) {
+  int supportsCoopLaunch = 0;
+  MKN_GPU_ASSERT(
+      hipDeviceGetAttribute(&supportsCoopLaunch, hipDeviceAttributeCooperativeLaunch, dev));
+  return supportsCoopLaunch;
+}
+
 struct Stream {
   Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
   ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); }
@@ -72,17 +102,48 @@ struct Stream {
   hipStream_t stream;
 };
 
+struct StreamEvent {
+  StreamEvent(Stream& stream_) : stream{stream_} { reset(); }
+  ~StreamEvent() { /*MKN_GPU_ASSERT(result = hipEventDestroy(event));*/ }
+
+  auto& operator()() { return event; };
+  void record() { MKN_GPU_ASSERT(result = hipEventRecord(event, stream())); }
+  bool finished() const { return hipEventQuery(event) == hipSuccess; }
+  void reset() {
+    if (event) MKN_GPU_ASSERT(result = hipEventDestroy(event));
+    MKN_GPU_ASSERT(result = hipEventCreate(&event));
+  }
+
+  Stream& stream;
+  hipError_t result;
+  hipEvent_t event = nullptr;
+};
+
+// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3
 template <typename T>
 struct Pointer {
-  Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); }
+  Pointer(T* _t) : t{_t} {
+    assert(t);
+    MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t));
+    type = attributes.type;
+  }
 
-  // bool is_unregistered_ptr() const { return attributes.type == 0; }
-  bool is_host_ptr() const { return attributes.hostPointer != nullptr; }
-  bool is_device_ptr() const { return attributes.devicePointer != nullptr; }
-  bool is_managed_ptr() const { return attributes.isManaged; }
+  bool is_unregistered_ptr() const {
+    return attributes.type == hipMemoryType::hipMemoryTypeUnregistered;
+  }
+  bool is_host_ptr() const {
+    return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost;
+  }
+  bool is_device_ptr() const {
+    return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged;
+  }
+  bool is_managed_ptr() const {
+    return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified;
+  }
 
   T* t;
   hipPointerAttribute_t attributes;
+  hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered;
 };
 
 template <typename T>
@@ -109,7 +170,7 @@ void alloc_managed(T*& p, Size size) {
   MKN_GPU_ASSERT(hipMallocManaged((void**)&p, size * sizeof(T)));
 }
 
-void destroy(void* p) {
+void inline destroy(void* p) {
   KLOG(TRC);
   MKN_GPU_ASSERT(hipFree(p));
 }
@@ -126,6 +187,12 @@ void destroy_host(T* ptr) {
   MKN_GPU_ASSERT(hipHostFree(ptr));
 }
 
+template <typename T, typename Size>
+void copy_on_device(T* dst, T const* src, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice));
+}
+
 template <typename Size>
 void send(void* p, void* t, Size size = 1) {
   KLOG(TRC);
@@ -165,19 +232,34 @@ void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
   stream()));
 }
 
-void sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
+void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
+void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); }
 
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <typename F, typename... Args>
+template <bool _sync = true, bool _coop = false, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
-      [&](auto&&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); },
+      [&](auto&&... params) {
+        if constexpr (_coop) {
+          auto address_of = [](auto& a) { return (void*)&a; };
+          void* kernelArgs[] = {address_of(params)...};
+          hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s);
+        } else {
+          hipLaunchKernelGGL(f, g, b, ds, s, params...);
+        }
+        MKN_GPU_ASSERT(hipGetLastError());
+      },
      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  sync();
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
 }
 
 // https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions
@@ -202,13 +284,14 @@ struct GLauncher : public Launcher {
   GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
     [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
 
-    b.x = devProp.maxThreadsPerBlock;
+    b.x = cli.bx_threads();
     g.x = s / b.x;
     if ((s % b.x) > 0) ++g.x;
   }
 
   std::size_t count = 0;
   hipDeviceProp_t devProp;
+  mkn::gpu::Cli<hipDeviceProp_t> cli{devProp};
 };
 
 template <typename Container, typename T>
 void fill(Container& c, T val) {
 }
 
 // https://rocm-developer-tools.github.io/HIP/group__Device.html
-void prinfo(size_t dev = 0) {
+void inline prinfo(size_t dev = 0) {
   hipDeviceProp_t devProp;
   [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
   KOUT(NON) << " System version " << devProp.major << "."
<< devProp.minor; @@ -247,6 +330,13 @@ void prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +__device__ void grid_sync() { + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + assert(grid.is_valid()); + grid.sync(); +} + } // namespace MKN_GPU_NS #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp deleted file mode 100644 index d556972..0000000 --- a/inc/mkn/gpu/rocm/def.hpp +++ /dev/null @@ -1,24 +0,0 @@ - -// IWYU pragma: private, include "mkn/gpu/def.hpp" - -#ifndef _MKN_GPU_ROCM_DEF_HPP_ -#define _MKN_GPU_ROCM_DEF_HPP_ - -#include "hip/hip_runtime.h" - -namespace mkn::gpu::hip { - -template -__device__ SIZE idx() { - SIZE width = hipGridDim_x * hipBlockDim_x; - SIZE height = hipGridDim_y * hipBlockDim_y; - - SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; - return x + (y * width) + (z * width * height); // max 4294967296 -} - -} // namespace mkn::gpu::hip - -#endif /*_MKN_GPU_ROCM_DEF_HPP_*/ diff --git a/inc/mkn/gpu/tuple.hpp b/inc/mkn/gpu/tuple.hpp index 704e77e..383a2b9 100644 --- a/inc/mkn/gpu/tuple.hpp +++ b/inc/mkn/gpu/tuple.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2017, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/mkn.yaml b/mkn.yaml index 2bf616c..2d7b461 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -1,4 +1,4 @@ -#! clean build test run -tOp rocm -x res/mkn/hipcc +#! clean build test run -Op rocm -x res/mkn/hipcc -W name: mkn.gpu parent: headers @@ -10,13 +10,13 @@ profile: - name: rocm parent: headers - arg: -DMKN_GPU_ROCM + # arg: -DMKN_GPU_ROCM=1 test: test/any/(\w).cpp test/hip/(\w).cpp - name: cuda parent: headers - arg: -DMKN_GPU_CUDA + # arg: -DMKN_GPU_CUDA test: test/any/(\w).cpp test/cuda/(\w).cpp diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 8c6d86d..638b4c6 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -5,6 +5,11 @@ ## The following compile flags will likely be required with their specific values for your current hardware # --cuda-gpu-arch="sm_61" # sm_61 = nvidia 1080 ( # see: http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards +# +# Common issues: +# GPUassert: unknown error +# reboot or try `sudo modprobe nvidia_uvm` +# property: cxx_flags: --std=c++17 -fPIC -fsized-deallocation -Wno-unknown-cuda-version diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index d0dc3f7..3cb9a0a 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -1,8 +1,8 @@ ## Recommended settings commented out. 
-# local: -# repo: /mkn/r -# mod-repo: /mkn/m +local: + repo: /mkn/r + mod-repo: /mkn/m # remote: # repo: git@github.com:mkn/ diff --git a/test/any/add.cpp b/test/any/add.cpp index 860d7f0..9a21b08 100644 --- a/test/any/add.cpp +++ b/test/any/add.cpp @@ -30,10 +30,20 @@ __global__ void vectoradd1(T* a, T* b) { template uint32_t test_add1() { std::vector b(NUM); + + assert(mkn::gpu::Pointer{b.data()}.is_host_ptr()); + for (uint32_t i = 0; i < NUM; i++) b[i] = i; mkn::gpu::DeviceMem devA(NUM), devB(b); + + if constexpr (!mkn::gpu::CompileFlags::withCPU) { + assert(mkn::gpu::Pointer{devA.p}.is_device_ptr()); + } + mkn::gpu::Launcher{WIDTH, HEIGHT, TPB_X, TPB_Y}(vectoradd1, devA, devB); auto a = devA(); + + // assert(mkn::gpu::Pointer{a.data()}.is_device_ptr()); for (uint32_t i = 0; i < NUM; i++) if (a[i] != b[i] + 1) return 1; return 0; diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp new file mode 100644 index 0000000..b24ca81 --- /dev/null +++ b/test/any/async_streaming.cpp @@ -0,0 +1,45 @@ + +#include +#include +#include +#include +#include + +#include "mkn/gpu/multi_launch.hpp" + +std::uint32_t static constexpr NUM = 128 * 1024 * 1024; // ~ 1GB of doubles +std::size_t constexpr static C = 5; // ~ 5GB of doubles + +template +using ManagedVector = std::vector>; + +struct A { + std::uint32_t i0; +}; + +std::uint32_t test() { + using namespace mkn::gpu; + using T = double; + std::vector> vecs(C, ManagedVector(NUM, 0)); + for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); + + ManagedVector datas(C); + for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data(); + auto views = datas.data(); + + StreamLauncher{vecs} + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) + .host([&](auto i) mutable { vecs[i][0] += 1; }) + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // + (); + + for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + + return test(); +} diff --git a/test/any/construct.cpp b/test/any/construct.cpp new file mode 100644 index 0000000..188b729 --- /dev/null +++ b/test/any/construct.cpp @@ -0,0 +1,59 @@ + +#include "mkn/gpu.hpp" + +static constexpr uint32_t NUM = 5; + +template +using ManagedVector = std::vector>; + +template +using ManagedMemory = std::vector>; + +std::size_t alloced = 0; +struct S { + S() { ++alloced; } + + std::uint16_t s = 1; +}; + +std::uint32_t test_does_construct_on_resize() { + KLOG(INF); + alloced = 0; + ManagedVector mem{NUM}; + mem.resize(NUM + NUM); + return alloced != NUM + NUM; +} + +std::uint32_t test_does_not_construct_on_resize() { + KLOG(INF); + alloced = 0; + ManagedMemory mem{NUM}; + assert(mem.size() == 5 && "wrong size"); + resize(mem, NUM + NUM); + assert(mem.size() == 10 && "wrong size"); + + auto cap = mem.capacity(); + + KLOG(INF) << mem.capacity(); + as_super(mem).emplace_back(); // does construct + KLOG(INF) << mem.capacity(); + assert(mem.capacity() != cap && "capacity bad!"); + assert(mem[10].s == 1 && "bad construct!"); + + cap = mem.capacity(); + + resize(mem, NUM * 5); + assert(mem.size() == 25 && "wrong size"); + assert(mem.capacity() != cap && "capacity bad!"); + assert(mem[10].s == 1 && "bad copy!"); + + KLOG(INF) << mem[10].s; + KLOG(INF) << mem[11].s; + return alloced != 1 or mem[10].s != 1; +} + +int main() { + KOUT(NON) << __FILE__; + + return test_does_construct_on_resize() + test_does_not_construct_on_resize(); +} diff --git a/test/any/coop.cpp 
b/test/any/coop.cpp new file mode 100644 index 0000000..9e0d473 --- /dev/null +++ b/test/any/coop.cpp @@ -0,0 +1,41 @@ + +#include "mkn/gpu.hpp" + +static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; +static constexpr uint32_t NUM = WIDTH * HEIGHT; + +template +using ManagedVector = std::vector>; + +struct S { + float f0 = 1; + double d0 = 1; +}; + +std::uint32_t test_lambda_copy_capture_views() { + mkn::gpu::GDLauncher launcher{NUM}; + + ManagedVector mem{NUM}; + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; + + auto* view = mem.data(); + + launcher([=] __device__() { + auto i = mkn::gpu::idx(); + mkn::gpu::grid_sync(); + view[i].f0 = view[i].d0 + 1; + }); + + for (std::uint32_t i = 0; i < NUM; ++i) + if (view[i].f0 != view[i].d0 + 1) return 1; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + + // if (mkn::gpu::supportsCooperativeLaunch()) return test_lambda_copy_capture_views(); + // KOUT(NON) << "Cooperative Launch not supported"; + return 0; +} diff --git a/test/any/managed.cpp b/test/any/managed.cpp index 158e960..c563885 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -21,6 +21,10 @@ __global__ void kernel(S* structs) { template std::uint32_t _test(L&& launcher) { ManagedVector mem{NUM}; + if constexpr (!mkn::gpu::CompileFlags::withCPU) { + assert(mkn::gpu::Pointer{mem.data()}.is_managed_ptr()); + } + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; launcher(kernel, mem); From c0fb26128b07b13716660e97be15af0c9977e2ac Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Sun, 14 Jul 2024 13:27:10 +0200 Subject: [PATCH 2/7] allow ref lambdas --- inc/mkn/gpu/launchers.hpp | 4 ++-- test/any/managed.cpp | 28 +++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 67bd510..8cbe770 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -37,7 +37,7 @@ struct GDLauncher : public GLauncher { template auto operator()(F&& f, Args&&... args) { - _launch(s, std::forward(f), + _launch(s, f, as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } @@ -57,7 +57,7 @@ struct GDLauncher : public GLauncher { } template - void _launch(S& _s, F&& f, std::tuple*, Args&&... args) { + void _launch(S& _s, F& f, std::tuple*, Args&&... 
args) { MKN_GPU_NS::launch<_sync, _coop>(&global_gd_kernel, g, b, ds, _s, f, args...); } }; diff --git a/test/any/managed.cpp b/test/any/managed.cpp index c563885..0c9634b 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -62,9 +62,31 @@ std::uint32_t test_lambda_copy_capture_views() { return _test_lambda_copy_capture_views(mkn::gpu::GDLauncher{NUM}); } +std::uint32_t test_lambda_ref_copy_capture_views() { + mkn::gpu::GDLauncher launcher{NUM}; + + ManagedVector mem{NUM}; + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; + + auto* view = mem.data(); + + auto fn = [=] __device__() { + auto i = mkn::gpu::idx(); + view[i].f0 = view[i].d0 + 1; + }; + + launcher(fn); + + for (std::uint32_t i = 0; i < NUM; ++i) + if (view[i].f0 != view[i].d0 + 1) return 1; + + return 0; +} + int main() { KOUT(NON) << __FILE__; - return test() + // - test_guess() + // - test_lambda_copy_capture_views(); + return test() + // + test_guess() + // + test_lambda_copy_capture_views() + // + test_lambda_ref_copy_capture_views(); } From 923b6e5770068dd0d6a37defc40293dd235d9276 Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Wed, 17 Jul 2024 17:59:15 +0200 Subject: [PATCH 3/7] up --- inc/mkn/gpu/launchers.hpp | 2 +- inc/mkn/gpu/multi_launch.hpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 8cbe770..c17cb56 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -44,7 +44,7 @@ struct GDLauncher : public GLauncher { template auto stream(Stream& s, F&& f, Args&&... args) { - _launch(s.stream, std::forward(f), + _launch(s.stream, f, as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index f50fac4..b0491d4 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -47,7 +47,7 @@ template struct StreamFunction { StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {} virtual ~StreamFunction() {} - virtual void run(std::uint32_t const) {}; + virtual void run(std::uint32_t const) {} Strat& strat; StreamFunctionMode mode; @@ -80,7 +80,6 @@ struct StreamHostFunction : StreamFunction { template struct StreamLauncher { using This = StreamLauncher; - using T = typename Datas::value_type::value_type; StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) { for (auto& s : streams) events.emplace_back(s); From 583105120692d586d8f2e2501d4b3d98cefcb7e3 Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Thu, 25 Jul 2024 16:58:58 +0200 Subject: [PATCH 4/7] threaded stream launcher --- inc/mkn/gpu/multi_launch.hpp | 166 ++++++++++++++++++++++++++++++++--- test/any/async_streaming.cpp | 60 ++++++++++++- 2 files changed, 212 insertions(+), 14 deletions(-) diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index b0491d4..66626b8 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -34,7 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include +#include #include #include "mkn/gpu.hpp" @@ -42,12 +45,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace mkn::gpu { enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT }; +enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; template struct StreamFunction { StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {} virtual ~StreamFunction() {} - virtual void run(std::uint32_t const) {} + virtual void run(std::uint32_t const) = 0; Strat& strat; StreamFunctionMode mode; @@ -61,6 +65,8 @@ struct StreamDeviceFunction : StreamFunction { StreamDeviceFunction(Strat& strat, Fn&& fn_) : Super{strat, StreamFunctionMode::DEVICE_WAIT}, fn{fn_} {} void run(std::uint32_t const i) override { + strat.events[i].record(); + mkn::gpu::GDLauncher{strat.datas[i].size()}.stream( strat.streams[i], [=, fn = fn] __device__() mutable { fn(i); }); } @@ -77,9 +83,10 @@ struct StreamHostFunction : StreamFunction { Fn fn; }; -template +template struct StreamLauncher { - using This = StreamLauncher; + using This = StreamLauncher; + using Self = std::conditional_t, This, Self_>; StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) { for (auto& s : streams) events.emplace_back(s); @@ -92,13 +99,13 @@ struct StreamLauncher { } template - auto& dev(Fn&& fn) { - fns.emplace_back(std::make_shared>(self, std::forward(fn))); + Self& dev(Fn&& fn) { + fns.emplace_back(std::make_shared>(self, std::forward(fn))); return self; } template - auto& host(Fn&& fn) { - fns.emplace_back(std::make_shared>(self, std::forward(fn))); + Self& host(Fn&& fn) { + fns.emplace_back(std::make_shared>(self, std::forward(fn))); return self; } @@ -123,6 +130,7 @@ struct StreamLauncher { void operator()(std::uint32_t const i) { auto const& step = data_step[i]; + assert(step < fns.size()); fns[step]->run(i); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -142,16 +150,154 @@ struct StreamLauncher { if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true; return events[i].finished(); }(); - if (b) events[i].reset(); + if (b) { + events[i].reset(); + } return b; } Datas& datas; - std::vector>> fns; + std::vector>> fns; std::vector streams; std::vector events; std::vector data_step; - This& self = *this; + Self& self = *reinterpret_cast(this); +}; + +enum class SFS : std::uint16_t { FIRST = 0, BUSY, WAIT, FIN }; +enum class SFP : std::uint16_t { WORK = 0, NEXT, SKIP }; + +template +struct AsyncStreamHostFunction : StreamFunction { + using Super = StreamFunction; + using Super::strat; + AsyncStreamHostFunction(Strat& strat, Fn&& fn_) + : Super{strat, StreamFunctionMode::HOST_WAIT}, fn{fn_} {} + void run(std::uint32_t const i) override { + fn(i); + strat.status[i] = SFS::WAIT; + } + Fn fn; +}; + +template +struct ThreadedStreamLauncher : public StreamLauncher> { + using This = ThreadedStreamLauncher; + using Super = StreamLauncher; + using Super::datas; + using Super::events; + using Super::fns; + + constexpr static std::size_t wait_ms = 1; + constexpr static std::size_t wait_max_ms = 100; + + ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 1) + : Super{datas}, n_threads{_n_threads} { + thread_status.resize(n_threads, SFP::NEXT); + status.resize(datas.size(), SFS::FIRST); + } + + ~ThreadedStreamLauncher() { join(); } + + template + This& host(Fn&& fn) { + fns.emplace_back( + std::make_shared>(*this, std::forward(fn))); + return *this; + } + + void operator()() { join(); } + Super& super() { return *this; } + void super(std::size_t const& idx) { return super()(idx); } + + bool 
is_fn_finished(std::uint32_t i) { + auto const b = [&]() { + if (fns[step[i]]->mode == StreamFunctionMode::HOST_WAIT) return status[i] == SFS::WAIT; + return events[i].finished(); + }(); + if (b) { + events[i].reset(); + status[i] = SFS::WAIT; + } + return b; + } + void thread_fn(std::size_t const& /*tid*/) { + std::size_t waitms = wait_ms; + while (!done) { + auto const& [ts, idx] = get_work(); + + if (ts == SFP::WORK) { + waitms = wait_ms; + super(idx); + + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(waitms)); + waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10; + if (check_finished()) done = 1; + } + } + } + + bool check_finished() { + for (std::size_t i = 0; i < datas.size(); ++i) + if (status[i] != SFS::FIN) return false; + return true; + } + + std::pair get_work(std::size_t const& start = 0) { + std::unique_lock lk(work_); + for (std::size_t i = start; i < datas.size(); ++i) { + if (status[i] == SFS::BUSY) { + if (is_fn_finished(i)) status[i] = SFS::WAIT; + + } else if (status[i] == SFS::WAIT) { + ++step[i]; + + if (Super::is_finished(i)) { + status[i] = SFS::FIN; + continue; + } + + status[i] = SFS::BUSY; + return std::make_pair(SFP::WORK, i); + + } else if (status[i] == SFS::FIRST) { + status[i] = SFS::BUSY; + return std::make_pair(SFP::WORK, i); + } + } + + return std::make_pair(SFP::SKIP, 0); + } + + This& join(bool const& clear = false) { + if (!started) start(); + if (joined) return *this; + joined = true; + + for (auto& t : threads) t.join(); + if (clear) threads.clear(); + return *this; + } + + This& start() { + if (started) return *this; + started = 1; + for (std::size_t i = 0; i < n_threads; ++i) + threads.emplace_back([i = i, this]() { thread_fn(i); }); + return *this; + } + + std::size_t const n_threads = 1; + std::vector threads; + + std::mutex work_; + std::vector status; + std::vector thread_status; + std::vector& step = Super::data_step; + + private: + bool joined = false, started = false, done = false; }; } // namespace mkn::gpu diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index b24ca81..ed4e7d4 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -5,10 +5,13 @@ #include #include +#include "mkn/kul/dbg.hpp" #include "mkn/gpu/multi_launch.hpp" -std::uint32_t static constexpr NUM = 128 * 1024 * 1024; // ~ 1GB of doubles -std::size_t constexpr static C = 5; // ~ 5GB of doubles +using namespace std::chrono_literals; + +std::uint32_t static constexpr NUM = 128 * 1024; // ~ 1MB of doubles +std::size_t constexpr static C = 5; // ~ 5MB of doubles template using ManagedVector = std::vector>; @@ -20,6 +23,9 @@ struct A { std::uint32_t test() { using namespace mkn::gpu; using T = double; + + KUL_DBG_FUNC_ENTER; + std::vector> vecs(C, ManagedVector(NUM, 0)); for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); @@ -29,17 +35,63 @@ std::uint32_t test() { StreamLauncher{vecs} .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) - .host([&](auto i) mutable { vecs[i][0] += 1; }) + .host([&](auto i) mutable { + std::this_thread::sleep_for(200ms); + for (auto& e : vecs[i]) e += 1; + }) + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // + (); + + for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + + std::size_t val = 5; + for (auto const& vec : vecs) { + for (auto const& e : vec) + if (e != val) return 1; + ++val; + }; + + return 0; +} + +std::uint32_t test_threaded() { + using namespace mkn::gpu; + using T = 
double; + + KUL_DBG_FUNC_ENTER; + + std::vector> vecs(C, ManagedVector(NUM, 0)); + for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); + + ManagedVector datas(C); + for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data(); + auto views = datas.data(); + + using namespace std::chrono_literals; + + ThreadedStreamLauncher{vecs, 6} + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) + .host([&](auto i) mutable { + std::this_thread::sleep_for(200ms); + for (auto& e : vecs[i]) e += 1; + }) .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // (); for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + std::size_t val = 5; + for (auto const& vec : vecs) { + for (auto const& e : vec) + if (e != val) return 1; + ++val; + }; + return 0; } int main() { KOUT(NON) << __FILE__; - return test(); + return test() + test_threaded(); } From f1871c44150f279f76b01e9840db8a699ceba737 Mon Sep 17 00:00:00 2001 From: p Date: Sat, 27 Jul 2024 17:40:57 +0100 Subject: [PATCH 5/7] event streaming --- inc/mkn/gpu/cuda.hpp | 41 ++++++++++++++++++++++++++++++------ inc/mkn/gpu/multi_launch.hpp | 5 ++++- res/mkn/clang_cuda.yaml | 3 +++ test/any/async_streaming.cpp | 2 +- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index dd96528..df1812c 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -108,19 +108,46 @@ struct Stream { struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { /*MKN_GPU_ASSERT(result = cudaEventDestroy(event));*/ } + ~StreamEvent() { + if (start) { + MKN_GPU_ASSERT(result = cudaEventDestroy(start)) + } + if (stop) { + MKN_GPU_ASSERT(result = cudaEventDestroy(stop)) + } + } + + StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { + that.start = nullptr; + that.stop = nullptr; + } - auto& operator()() { return event; }; - void record() { MKN_GPU_ASSERT(result = cudaEventRecord(event, stream())); } - bool finished() const { return cudaEventQuery(event) == cudaSuccess; } + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()() { return stop; }; + void record() { + if (stage == 0) { + MKN_GPU_ASSERT(result = cudaEventRecord(start, stream())); + ++stage; + } else { + MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream())); + ++stage; + } + } + bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; } void reset() { - if (event) MKN_GPU_ASSERT(result = cudaEventDestroy(event)); - MKN_GPU_ASSERT(result = cudaEventCreate(&event)); + if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + MKN_GPU_ASSERT(result = cudaEventCreate(&start)); + if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); + MKN_GPU_ASSERT(result = cudaEventCreate(&stop)); + stage = 0; } Stream& stream; cudaError_t result; - cudaEvent_t event = nullptr; + cudaEvent_t start = nullptr, stop = nullptr; + std::uint16_t stage = 0; }; template diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 66626b8..e46425b 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -132,6 +132,8 @@ struct StreamLauncher { auto const& step = data_step[i]; assert(step < fns.size()); fns[step]->run(i); + assert(i < events.size()); + assert(step < fns.size()); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -222,6 +224,7 @@ 
struct ThreadedStreamLauncher : public StreamLauncher Date: Mon, 29 Jul 2024 23:18:45 +0200 Subject: [PATCH 6/7] set device --- inc/mkn/gpu/cpu.hpp | 6 +++--- inc/mkn/gpu/cuda.hpp | 2 ++ inc/mkn/gpu/multi_launch.hpp | 3 +-- inc/mkn/gpu/rocm.hpp | 5 ++++- test/any/async_streaming.cpp | 1 - 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 36de6f8..5936969 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -83,9 +83,9 @@ struct dim3 { std::size_t x = 1, y = 1, z = 1; }; -void setLimitMallocHeapSize(std::size_t const& /*bytes*/) { - // noop -} +void setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/ + +void setDevice(std::size_t const& /*dev*/) {} /*noop*/ auto supportsCooperativeLaunch(int const /*dev*/ = 0) { int supportsCoopLaunch = 0; diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index df1812c..7addcca 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -88,6 +88,8 @@ void setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); } +void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); } + auto supportsCooperativeLaunch(int const dev = 0) { int supportsCoopLaunch = 0; MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev)); diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index e46425b..57e7b55 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -133,7 +133,6 @@ struct StreamLauncher { assert(step < fns.size()); fns[step]->run(i); assert(i < events.size()); - assert(step < fns.size()); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -224,7 +223,7 @@ struct ThreadedStreamLauncher : public StreamLauncher Date: Sat, 3 Aug 2024 15:47:58 +0200 Subject: [PATCH 7/7] support barrier --- .github/workflows/build.yml | 2 +- inc/mkn/gpu/cli.hpp | 20 ++-------- inc/mkn/gpu/cpu.hpp | 11 ++++-- inc/mkn/gpu/cuda.hpp | 41 ++++++++++++++------ inc/mkn/gpu/multi_launch.hpp | 75 +++++++++++++++++++++++++----------- inc/mkn/gpu/rocm.hpp | 63 +++++++++++++++++++++++++----- mkn.yaml | 2 +- res/mkn/clang_cuda.yaml | 2 +- res/mkn/hipcc.yaml | 2 +- test/any/async_streaming.cpp | 17 +++----- 10 files changed, 157 insertions(+), 78 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 810b748..3c667f2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,4 +16,4 @@ jobs: run: | curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix chmod +x mkn - KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++17" test -W 9 + KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++20" test -W 9 diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp index bae675c..6a40948 100644 --- a/inc/mkn/gpu/cli.hpp +++ b/inc/mkn/gpu/cli.hpp @@ -32,33 +32,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef _MKN_GPU_CLI_HPP_ #define _MKN_GPU_CLI_HPP_ -#include -#include - #include "mkn/kul/env.hpp" +#include "mkn/kul/string.hpp" namespace mkn::gpu { template struct Cli { - // + constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS"; auto bx_threads() const { - char const* ENV = "MKN_GPU_BX_THREADS"; - if (mkn::kul::env::EXISTS(ENV)) { - return as(mkn::kul::env::GET(ENV)); - } + if (kul::env::EXISTS(MKN_GPU_BX_THREADS)) + return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS)); return dev.maxThreadsPerBlock; } - template - auto static as(std::string const& from) { - T t; - std::stringstream ss(from); - ss >> t; - return t; - } - Device const& dev; }; diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 5936969..bedac17 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -108,12 +108,17 @@ struct StreamEvent { ~StreamEvent() {} auto& operator()() { return event; }; - void record() { ; } - bool finished() const { return true; } - void reset() {} + auto& record() { + ++stage; + return *this; + } + auto& wait() { return *this; } + bool finished() const { return stage == 2; } + void reset() { stage = 0; } Stream stream; std::size_t event = 0; + std::uint16_t stage = 0; }; template diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index 7addcca..9278c6d 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "mkn/gpu/def.hpp" -// #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess)) +// #define MKN_GPU_ASSERT(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } @@ -110,14 +110,7 @@ struct Stream { struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { - if (start) { - MKN_GPU_ASSERT(result = cudaEventDestroy(start)) - } - if (stop) { - MKN_GPU_ASSERT(result = cudaEventDestroy(stop)) - } - } + ~StreamEvent() { clear(); } StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { that.start = nullptr; @@ -128,7 +121,7 @@ struct StreamEvent { StreamEvent& operator=(StreamEvent const&) = delete; auto& operator()() { return stop; }; - void record() { + auto& record() { if (stage == 0) { MKN_GPU_ASSERT(result = cudaEventRecord(start, stream())); ++stage; @@ -136,12 +129,25 @@ struct StreamEvent { MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream())); ++stage; } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); } bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; } void reset() { - if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + clear(); MKN_GPU_ASSERT(result = cudaEventCreate(&start)); - if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); MKN_GPU_ASSERT(result = cudaEventCreate(&stop)); stage = 0; } @@ -356,6 +362,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + cudaMemGetInfo(&free_t, &total_t); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... 
%f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 57e7b55..c75b820 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -31,20 +31,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_MULTI_LAUNCH_HPP_ #define _MKN_GPU_MULTI_LAUNCH_HPP_ -#include +#include #include -#include -#include -#include #include -#include +#include +#include +#include +#include +#include #include #include "mkn/gpu.hpp" namespace mkn::gpu { -enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT }; +enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER }; enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; template @@ -129,11 +130,14 @@ struct StreamLauncher { } void operator()(std::uint32_t const i) { - auto const& step = data_step[i]; + auto const step = data_step[i]; + assert(step < fns.size()); - fns[step]->run(i); assert(i < events.size()); - if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); + + // if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) events[i].stream.sync(); + fns[step]->run(i); + if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record().wait(); } bool is_finished() const { @@ -145,7 +149,7 @@ struct StreamLauncher { bool is_finished(std::uint32_t idx) const { return data_step[idx] == fns.size(); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { auto const& step = data_step[i]; if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true; @@ -181,6 +185,24 @@ struct AsyncStreamHostFunction : StreamFunction { Fn fn; }; +template +struct StreamBarrierFunction : StreamFunction { + using Super = StreamFunction; + using Super::strat; + + StreamBarrierFunction(Strat& strat) + : Super{strat, StreamFunctionMode::BARRIER}, + sync_point{std::ssize(strat.datas), on_completion} {} + + void run(std::uint32_t const /*i*/) override { [[maybe_unused]] auto ret = sync_point.arrive(); } + + std::function on_completion = [&]() { + for (auto& stat : strat.status) stat = SFS::WAIT; + }; + + std::barrier sync_point; +}; + template struct ThreadedStreamLauncher : public StreamLauncher> { using This = ThreadedStreamLauncher; @@ -192,8 +214,9 @@ struct ThreadedStreamLauncher : public StreamLauncher>(*this)); + return *this; + } + void operator()() { join(); } Super& super() { return *this; } void super(std::size_t const& idx) { return super()(idx); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { if (fns[step[i]]->mode == StreamFunctionMode::HOST_WAIT) return status[i] == SFS::WAIT; return events[i].finished(); @@ -222,8 +250,9 @@ struct ThreadedStreamLauncher : public StreamLauncher= wait_max_ms ? wait_max_ms : waitms + 10; - if (check_finished()) done = 1; + continue; } + + std::this_thread::sleep_for(std::chrono::milliseconds(waitms)); + waitms = waitms >= wait_max_ms ? 
wait_max_ms : waitms + 10; } } @@ -247,12 +275,12 @@ struct ThreadedStreamLauncher : public StreamLauncher get_work(std::size_t const& start = 0) { - std::unique_lock lk(work_); + std::scoped_lock lk(work_); for (std::size_t i = start; i < datas.size(); ++i) { if (status[i] == SFS::BUSY) { if (is_fn_finished(i)) status[i] = SFS::WAIT; - - } else if (status[i] == SFS::WAIT) { + } + if (status[i] == SFS::WAIT) { ++step[i]; if (Super::is_finished(i)) { @@ -268,7 +296,7 @@ struct ThreadedStreamLauncher : public StreamLauncher threads; std::mutex work_; diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 7059d5a..9917879 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -104,22 +104,56 @@ struct Stream { hipStream_t stream; }; +// + struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { /*MKN_GPU_ASSERT(result = hipEventDestroy(event));*/ + ~StreamEvent() { clear(); } + + StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { + that.start = nullptr; + that.stop = nullptr; } - auto& operator()() { return event; }; - void record() { MKN_GPU_ASSERT(result = hipEventRecord(event, stream())); } - bool finished() const { return hipEventQuery(event) == hipSuccess; } + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()() { return stop; }; + auto& record() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipEventRecord(start, stream())); + ++stage; + } else { + MKN_GPU_ASSERT(result = hipEventRecord(stop, stream())); + ++stage; + } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = hipEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = hipEventDestroy(stop)); + } + bool finished() const { return stage == 2 and hipEventQuery(stop) == hipSuccess; } void reset() { - if (event) MKN_GPU_ASSERT(result = hipEventDestroy(event)); - MKN_GPU_ASSERT(result = hipEventCreate(&event)); + clear(); + MKN_GPU_ASSERT(result = hipEventCreate(&start)); + MKN_GPU_ASSERT(result = hipEventCreate(&stop)); + stage = 0; } Stream& stream; hipError_t result; - hipEvent_t event = nullptr; + hipEvent_t start = nullptr, stop = nullptr; + std::uint16_t stage = 0; }; // https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 @@ -250,7 +284,7 @@ void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... arg if constexpr (_coop) { auto address_of = [](auto& a) { return (void*)&a; }; void* kernelArgs[] = {(address_of(params), ...)}; - hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s); + MKN_GPU_ASSERT(hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s)); } else { hipLaunchKernelGGL(f, g, b, ds, s, params...); } @@ -322,7 +356,7 @@ void fill(Container& c, T val) { // https://rocm-developer-tools.github.io/HIP/group__Device.html void inline prinfo(size_t dev = 0) { hipDeviceProp_t devProp; - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); KOUT(NON) << " System version " << devProp.major << "." 
<< devProp.minor; KOUT(NON) << " agent name " << devProp.name; KOUT(NON) << " cores " << devProp.multiProcessorCount; @@ -333,6 +367,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + MKN_GPU_ASSERT(hipMemGetInfo(&free_t, &total_t)); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... %f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/mkn.yaml b/mkn.yaml index 2d7b461..9f20fa4 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -1,4 +1,4 @@ -#! clean build test run -Op rocm -x res/mkn/hipcc -W +#! clean build test run -tOqp rocm -x res/mkn/hipcc -W name: mkn.gpu parent: headers diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 7f0aedd..5654326 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -12,7 +12,7 @@ # property: - cxx_flags: --std=c++17 -fPIC -fsized-deallocation -Wno-unknown-cuda-version + cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version cxx_cuda: -x cuda --cuda-gpu-arch="sm_61" -Xclang -fcuda-allow-variadic-functions # local: diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index 3cb9a0a..e123216 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -18,6 +18,6 @@ env: file: - type: cpp:cxx:cc archiver: ar -cr - compiler: hipcc -std=c++17 -fPIC + compiler: hipcc -std=c++20 -fPIC linker: hipcc diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index b79a700..fb0b825 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -39,10 +39,7 @@ std::uint32_t test() { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -54,7 +51,7 @@ std::uint32_t test() { return 0; } -std::uint32_t test_threaded() { +std::uint32_t test_threaded(std::size_t const& nthreads = 2) { using namespace mkn::gpu; using T = double; @@ -69,16 +66,14 @@ std::uint32_t test_threaded() { using namespace std::chrono_literals; - ThreadedStreamLauncher{vecs, 6} + ThreadedStreamLauncher{vecs, nthreads} .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) .host([&](auto i) mutable { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .barrier() + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -92,5 +87,5 @@ std::uint32_t test_threaded() { int main() { KOUT(NON) << __FILE__; - return test() + test_threaded(); + return test() + test_threaded() + test_threaded(6); }
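
A note on the ThreadedStreamLauncher worker loop: each thread repeatedly asks get_work() for a runnable (stream, step) pair, and when nothing is runnable it sleeps, stretching the sleep by 10ms per idle pass up to wait_max_ms; doing real work snaps the sleep back down. The switch from std::unique_lock to std::scoped_lock in get_work() fits this design, since nothing ever waits on a condition variable there. A host-only sketch of the backoff policy (the function name and the default values here are illustrative assumptions, not the library's):

    #include <atomic>
    #include <chrono>
    #include <cstddef>
    #include <functional>
    #include <thread>

    // Poll for work; idle passes sleep progressively longer, capped at wait_max_ms.
    inline void poll_worker(std::atomic<bool> const& done, std::function<bool()> try_work,
                            std::size_t const wait_ms = 25, std::size_t const wait_max_ms = 250) {
      std::size_t waitms = wait_ms;
      while (!done) {
        if (try_work()) {    // ran a step on some stream
          waitms = wait_ms;  // reset the backoff
          continue;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(waitms));
        waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10;
      }
    }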
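
The StreamEvent reworked in PATCH 5/7 replaces the single event with a start/stop pair driven by a stage counter: the first record() records the start event, the second records stop, and finished() only consults the stop event once both have been recorded. A standalone sketch of the same lifecycle against the plain CUDA runtime (asserts stand in for the library's MKN_GPU_ASSERT; the type name is illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cuda_runtime.h>

    struct TwoStageEvent {
      explicit TwoStageEvent(cudaStream_t s) : stream{s} {
        assert(cudaEventCreate(&start) == cudaSuccess);
        assert(cudaEventCreate(&stop) == cudaSuccess);
      }
      ~TwoStageEvent() {
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
      }
      void record() {  // call once before the async work, once after
        cudaEventRecord(stage == 0 ? start : stop, stream);
        ++stage;
      }
      bool finished() const {  // true only after both records and the stop event lands
        return stage == 2 && cudaEventQuery(stop) == cudaSuccess;
      }
      cudaStream_t stream{};
      cudaEvent_t start{}, stop{};
      std::uint16_t stage{0};
    };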
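
One detail worth flagging in the patch's move constructor: it transfers start and stop and nulls them in the donor so the destructor cannot double-destroy, but it does not carry stage across, so an event moved after a record() would report stage 0. As used in the launcher the events vector only grows before anything is recorded, so this is harmless there; for other uses, a move that carries the whole state would look like the following addition to the sketch above (not what the patch does):

    // Move support for the sketch above: steal the handles, carry the stage,
    // and null the donor so its destructor becomes a no-op.
    TwoStageEvent(TwoStageEvent&& that)
        : stream{that.stream}, start{that.start}, stop{that.stop}, stage{that.stage} {
      that.start = nullptr;
      that.stop = nullptr;
      that.stage = 0;
    }
    TwoStageEvent(TwoStageEvent const&) = delete;
    TwoStageEvent& operator=(TwoStageEvent const&) = delete;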
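
On PATCH 6/7's setDevice and the print_gpu_mem_used helpers: cudaSetDevice binds the calling host thread to a device, and cudaMemGetInfo reports free/total bytes for whichever device is current on that thread, which is why the 1048576.0 divisor (1024 * 1024) turns the byte counts into MiB. A compilable sketch of the same query:

    #include <cstddef>
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaSetDevice(0);  // per host thread: bind before creating streams or querying
      std::size_t free_b = 0, total_b = 0;
      cudaMemGetInfo(&free_b, &total_b);  // bytes, for the currently bound device
      std::printf("mem used %.1f MiB of %.1f MiB\n",
                  (total_b - free_b) / 1048576.0, total_b / 1048576.0);
    }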
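
The StreamBarrierFunction added in PATCH 7/7 is presumably why the build flags move to -std=c++20: it is built on std::barrier, sized to the number of data streams, with a completion function that flips every stream's status back to WAIT once the last one arrives. A self-contained sketch of that arrive-plus-completion shape (worker count and the message are illustrative):

    #include <barrier>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      constexpr std::ptrdiff_t n = 5;        // one expected arrival per data stream
      auto on_completion = []() noexcept {   // runs once, after the last arrival
        std::puts("all streams synced; release them back to WAIT");
      };
      std::barrier sync_point{n, on_completion};
      std::vector<std::jthread> workers;
      for (std::ptrdiff_t i = 0; i < n; ++i)
        workers.emplace_back([&] { [[maybe_unused]] auto token = sync_point.arrive(); });
    }  // jthreads join on destruction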
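
The reworked Cli::bx_threads() in PATCH 7/7 reads the MKN_GPU_BX_THREADS environment variable through the kul helpers and falls back to the device's maxThreadsPerBlock. The same behaviour using only the standard library (std::getenv and std::stoi standing in for kul::env and kul::String::INT32):

    #include <cstdlib>
    #include <string>

    inline int bx_threads(int const device_max_threads_per_block) {
      if (char const* env = std::getenv("MKN_GPU_BX_THREADS"))
        return std::stoi(env);  // the library parses via kul::String::INT32 instead
      return device_max_threads_per_block;
    }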
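
Finally, the rocm.hpp change wrapping hipLaunchCooperativeKernel in MKN_GPU_ASSERT matters because cooperative launches can fail at launch time: every block must be resident simultaneously for grid_sync() to be safe, and the runtime rejects grids that exceed device occupancy rather than hanging. A CUDA-side sketch with the failure checked (sizes are illustrative):

    #include <cassert>
    #include <cooperative_groups.h>
    #include <cuda_runtime.h>

    __global__ void bump_then_sync(float* x) {
      namespace cg = cooperative_groups;
      cg::grid_group grid = cg::this_grid();
      x[grid.thread_rank()] += 1.f;
      grid.sync();  // grid-wide barrier; only valid under a cooperative launch
    }

    int main() {
      float* d_x = nullptr;
      assert(cudaMalloc(&d_x, 256 * sizeof(float)) == cudaSuccess);
      void* args[] = {&d_x};
      // returns an error (instead of deadlocking) if 1x256 cannot be co-resident
      assert(cudaLaunchCooperativeKernel((void*)bump_then_sync, dim3(1), dim3(256),
                                         args, 0, nullptr) == cudaSuccess);
      assert(cudaDeviceSynchronize() == cudaSuccess);
      cudaFree(d_x);
    }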