From 4be3f10a95f260d4e3fb901e98a8336ac6af2478 Mon Sep 17 00:00:00 2001
From: Sebastian Keller <44800242+sekelle@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:15:04 +0100
Subject: [PATCH] HIP and Spack (#468)

* HIP compatibility, remove need for hipify-perl

* split GPU N-body code into additional units

handle gpu-direct with interface compile_definition
removed duplicate cartesian multipole implementation
---
 .gitlab/Dockerfile_hip_2                      |  10 +-
 CMakeLists.txt                                |  47 +-
 README.md                                     |   9 +-
 README_insitu.md                              |   1 -
 domain/CMakeLists.txt                         |  43 +-
 domain/README.md                              |   2 +-
 domain/include/cstone/CMakeLists.txt          |   4 +
 domain/include/cstone/cuda/cub.hpp            |  14 +
 domain/include/cstone/cuda/cuda_runtime.hpp   |  61 ++
 domain/include/cstone/cuda/cuda_utils.cuh     |   3 +-
 domain/include/cstone/cuda/device_vector.cu   |   5 +-
 domain/include/cstone/cuda/errorcheck.cuh     |   2 +-
 domain/include/cstone/cuda/gpu_config.cuh     |   3 +-
 domain/include/cstone/focus/rebalance_gpu.cu  |   9 +-
 .../include/cstone/focus/source_center_gpu.h  |   1 +
 .../include/cstone/halos/gather_halos_gpu.cu  |   1 +
 domain/include/cstone/primitives/mpi_cuda.cuh |   3 +-
 .../cstone/primitives/primitives_gpu.cu       |   5 +-
 domain/include/cstone/primitives/warpscan.cuh |  14 +-
 .../include/cstone/traversal/CMakeLists.txt   |   6 +-
 .../cstone/traversal/find_neighbors.cuh       |   3 +-
 domain/include/cstone/traversal/groups_gpu.cu | 161 ++++++
 .../traversal/{groups.cuh => groups_gpu.cuh}  |  92 +--
 domain/include/cstone/traversal/groups_gpu.h  |  90 +++
 domain/include/cstone/tree/csarray_gpu.cu     |   1 +
 domain/test/integration_mpi/CMakeLists.txt    |   3 -
 .../integration_mpi/exchange_halos_gpu.cpp    |   2 +-
 domain/test/performance/CMakeLists.txt        |   8 +-
 domain/test/performance/neighbor_driver.cu    |   1 -
 domain/test/performance/timing.cuh            |  19 +-
 domain/test/unit_cuda/traversal/groups.cu     |  25 +-
 main/src/analytical_solutions/CMakeLists.txt  |   7 +-
 main/src/propagator/CMakeLists.txt            |   3 -
 main/src/sphexa/CMakeLists.txt                |   8 -
 main/src/util/pm_reader.hpp                   |  63 +-
 ryoanji/CMakeLists.txt                        |  39 +-
 ryoanji/src/ryoanji/CMakeLists.txt            |   1 +
 ryoanji/src/ryoanji/interface/CMakeLists.txt  |   9 +-
 ryoanji/src/ryoanji/interface/ewald.cu        |   5 +-
 .../src/ryoanji/interface/multipole_holder.cu |  52 +-
 ryoanji/src/ryoanji/interface/treebuilder.cu  | 239 --------
 ryoanji/src/ryoanji/interface/treebuilder.cuh |  99 +++-
 ryoanji/src/ryoanji/nbody/CMakeLists.txt      |   8 +
 ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp |  21 -
 ryoanji/src/ryoanji/nbody/ewald.hpp           |   8 -
 ryoanji/src/ryoanji/nbody/kernel.hpp          | 546 +-----------------
 ryoanji/src/ryoanji/nbody/traversal.cuh       |  77 +--
 ryoanji/src/ryoanji/nbody/types.h             |  41 +-
 ryoanji/src/ryoanji/nbody/upsweep_gpu.cu      | 115 ++++
 ryoanji/src/ryoanji/nbody/upsweep_gpu.h       |  58 ++
 ryoanji/src/ryoanji/nbody/upwardpass.cuh      | 235 --------
 ryoanji/test/CMakeLists.txt                   |   9 +-
 ryoanji/test/demo.cu                          | 143 ++++-
 ryoanji/test/demo_mpi.cpp                     |   5 +-
 ryoanji/test/interface/CMakeLists.txt         |   4 -
 ryoanji/test/interface/global_upsweep_gpu.cpp |   2 +-
 ryoanji/test/nbody/ewald.cu                   |   2 +-
 ryoanji/test/nbody/ewald_cpu.cpp              |  14 +-
 ryoanji/test/nbody/kernel.cpp                 |  62 --
 ryoanji/test/nbody/traversal_cpu.cpp          |   4 -
 scripts/CMakeLists.txt                        |   6 +
 scripts/plot_power.py                         | 117 ++++
 scripts/spack/package.py                      |  54 --
 sph/include/sph/CMakeLists.txt                |   2 +-
 sph/include/sph/groups.cu                     |  52 --
 sph/include/sph/groups.hpp                    |  41 ++
 sph/include/sph/hydro_std/iad_gpu.cu          |   1 -
 .../sph/hydro_std/momentum_energy_gpu.cu      |  12 +-
 sph/include/sph/hydro_ve/av_switches_gpu.cu   |   1 -
 .../sph/hydro_ve/iad_divv_curlv_gpu.cu        |   1 -
 .../sph/hydro_ve/momentum_energy_gpu.cu       |  11 +-
 sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu  |   1 -
 sph/include/sph/hydro_ve/xmass_gpu.cu         |   9 +-
 sph/include/sph/particles_data.hpp            |   1 -
 sph/include/sph/particles_data_gpu.cuh        |   1 -
 sph/include/sph/sph_gpu.hpp                   |   7 +-
 sph/include/sph/util/device_math.cuh          |  40 --
 sph/include/sph/util/pinned_allocator.cuh     | 178 ------
 78 files changed, 1149 insertions(+), 1913 deletions(-)
 create mode 100644 domain/include/cstone/cuda/cub.hpp
 create mode 100644 domain/include/cstone/cuda/cuda_runtime.hpp
 create mode 100644 domain/include/cstone/traversal/groups_gpu.cu
 rename domain/include/cstone/traversal/{groups.cuh => groups_gpu.cuh} (71%)
 create mode 100644 domain/include/cstone/traversal/groups_gpu.h
 delete mode 100644 ryoanji/src/ryoanji/interface/treebuilder.cu
 create mode 100644 ryoanji/src/ryoanji/nbody/CMakeLists.txt
 create mode 100644 ryoanji/src/ryoanji/nbody/upsweep_gpu.cu
 create mode 100644 ryoanji/src/ryoanji/nbody/upsweep_gpu.h
 delete mode 100644 ryoanji/src/ryoanji/nbody/upwardpass.cuh
 create mode 100644 scripts/CMakeLists.txt
 create mode 100755 scripts/plot_power.py
 delete mode 100644 scripts/spack/package.py
 delete mode 100644 sph/include/sph/groups.cu
 delete mode 100644 sph/include/sph/util/device_math.cuh
 delete mode 100644 sph/include/sph/util/pinned_allocator.cuh

diff --git a/.gitlab/Dockerfile_hip_2 b/.gitlab/Dockerfile_hip_2
index cd3125949..99e205b50 100644
--- a/.gitlab/Dockerfile_hip_2
+++ b/.gitlab/Dockerfile_hip_2
@@ -93,8 +93,6 @@ RUN echo \
     && unset GCC_X86_64 \
     && cd /usr/local/games/SPH-EXA.git \
     && ls -la . \
-    && parallel -j+0 hipify-perl -inplace ::: \
-       `find . -name '*.h' -o -name '*.cuh' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cu'` \
     && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./domain/cmake/setup_GTest.cmake \
     && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./cmake/setup_GTest.cmake \
     && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./ryoanji/cmake/setup_GTest.cmake \
@@ -106,9 +104,11 @@ RUN echo \
        MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \
        cmake -S SPH-EXA.git -B build \
        -DCMAKE_CXX_COMPILER=mpicxx -DCMAKE_C_COMPILER=mpicc \
-       -DBUILD_TESTING=OFF -DBUILD_ANALYTICAL=OFF -DGPU_DIRECT=OFF \
-       -DCMAKE_BUILD_TYPE=Debug -DCMAKE_HIP_ARCHITECTURES=gfx90a \
-    && echo "## cmake --build + --install :" \
+       -DCMAKE_HIP_COMPILER=mpicxx -DCMAKE_HIP_COMPILER_FORCED=ON \
+       -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+       -DCSTONE_WITH_GPU_AWARE_MPI=ON \
+       -DBUILD_TESTING=OFF \
+    && echo "## cmake --build :" \
     && MPICH_CXX=/opt/rocm-5.2.3/llvm/bin/clang++ \
        MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \
        cmake --build build -j `grep processor /proc/cpuinfo | wc -l` -t sphexa-hip
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bbefe5a7..689003cc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,8 @@ list(APPEND CMAKE_MODULE_PATH
 option(BUILD_TESTING "build unit and integration tests" ON)
 option(BUILD_ANALYTICAL "build analytical solution" ON)
 
-option(GPU_DIRECT "Enable CUDA-aware MPI communication" OFF)
+option(SPH_EXA_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
+option(SPH_EXA_WITH_HIP "Enable building for AMD GPUs" ON)
 
 set(CSTONE_DIR ${PROJECT_SOURCE_DIR}/domain/include)
 set(CSTONE_TEST_DIR ${PROJECT_SOURCE_DIR}/domain/test)
@@ -46,18 +47,32 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-    find_package(CUDAToolkit)
-    set(CMAKE_CUDA_STANDARD 17)
+if(SPH_EXA_WITH_CUDA)
+    check_language(CUDA)
+    if(CMAKE_CUDA_COMPILER)
+        enable_language(CUDA)
+        find_package(CUDAToolkit)
+        set(CMAKE_CUDA_STANDARD 17)
+    else()
+        message(STATUS "No CUDA support")
+        set(SPH_EXA_WITH_CUDA OFF)
+    endif()
+endif()
+
+if(SPH_EXA_WITH_HIP)
+    check_language(HIP)
+    if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
+        enable_language(HIP)
+        find_package(hip)
+        set(CMAKE_HIP_STANDARD 17)
+    else()
+        message(STATUS "No HIP support")
+        set(SPH_EXA_WITH_HIP OFF)
+    endif()
 endif()
 
-check_language(HIP)
-if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
-    enable_language(HIP)
-    find_package(hip)
-    set(CMAKE_HIP_STANDARD 17)
+if(SPH_EXA_WITH_HIP AND SPH_EXA_WITH_CUDA)
+    message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
 endif()
 
 option(SPH_EXA_WITH_H5PART "Enable HDF5 IO using the H5Part library" ON)
@@ -81,15 +96,6 @@ elseif(INSITU STREQUAL "Ascent")
 	find_package(Ascent REQUIRED PATHS "$ENV{EBROOTASCENT}/lib/cmake/ascent")
 endif()
 
-option(SPH_EXA_WITH_FFTW "Enable use of the FFTW library" ON)
-if (SPH_EXA_WITH_FFTW)
-    find_package(FFTW)
-    if (NOT FFTW_FOUND)
-        message(STATUS "No FFTW support")
-        set(SPH_EXA_WITH_FFTW OFF)
-    endif ()
-endif ()
-
 option(SPH_EXA_WITH_GRACKLE "Enable radiative cooling with GRACKLE" OFF)
 if (SPH_EXA_WITH_GRACKLE)
     add_subdirectory(extern/grackle)
@@ -97,6 +103,7 @@ endif()
 
 add_subdirectory(domain)
 add_subdirectory(ryoanji)
+add_subdirectory(scripts)
 add_subdirectory(sph)
 add_subdirectory(physics)
 add_subdirectory(main)
diff --git a/README.md b/README.md
index 0973f02eb..e4c46986a 100644
--- a/README.md
+++ b/README.md
@@ -105,11 +105,12 @@ cd build
 CC=cc CXX=CC cmake -DCMAKE_CUDA_ARCHITECTURES=60 -S <GIT_SOURCE_DIR>
 
 ```
-Module and CMake configuration on LUMI
+Module and CMake configuration on LUMI (ROCm 6.2.2)
 ```shell
-module load CrayEnv buildtools/22.12 craype-accel-amd-gfx90a rocm cray-hdf5-parallel
-cd <GIT_SOURCE_DIR>; hipify-perl -inplace `find -name *.cu -o -name *.cuh` && find -name *.prehip -delete
-cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCMAKE_HIP_COMPILER_FORCED=ON -DGPU_DIRECT=<ON/OFF> -S <GIT_SOURCE_DIR>
+module swap PrgEnv-cray PrgEnv-gnu
+module load CrayEnv buildtools craype-accel-amd-gfx90a rocm cray-hdf5-parallel
+cd <GIT_SOURCE_DIR>;
+cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCSTONE_WITH_GPU_AWARE_MPI=ON -S <GIT_SOURCE_DIR>
 ```
 
 Build everything: ```make -j```
diff --git a/README_insitu.md b/README_insitu.md
index 256d791e5..1ee78b96a 100644
--- a/README_insitu.md
+++ b/README_insitu.md
@@ -92,7 +92,6 @@ module load ParaView/5.10.1-CrayGNU-21.09-EGL
     -DBUILD_ANALYTICAL:BOOL=OFF \
     -DBUILD_TESTING:BOOL=OFF \
     -DSPH_EXA_WITH_H5PART:BOOL=OFF \
-    -DSPH_EXA_WITH_FFTW:BOOL=OFF \
     -DCMAKE_CXX_COMPILER=CC \
     -DINSITU=Ascent
 
diff --git a/domain/CMakeLists.txt b/domain/CMakeLists.txt
index d2e468131..8a6aa808e 100644
--- a/domain/CMakeLists.txt
+++ b/domain/CMakeLists.txt
@@ -26,18 +26,41 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-    find_package(CUDAToolkit)
-    set(CMAKE_CUDA_STANDARD 17)
+option(CSTONE_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
+option(CSTONE_WITH_HIP "Enable building for AMD GPUs" ON)
+option(CSTONE_WITH_GPU_AWARE_MPI "Enable CUDA-aware MPI communication" OFF)
+
+if(GPU_DIRECT)
+    message(WARNING "Option GPU_DIRECT is deprecated and will be removed. Use -DCSTONE_WITH_GPU_AWARE_MPI=ON instead.")
+    set(CSTONE_WITH_GPU_AWARE_MPI ON)
+endif()
+
+if(CSTONE_WITH_CUDA)
+    check_language(CUDA)
+    if(CMAKE_CUDA_COMPILER)
+        enable_language(CUDA)
+        find_package(CUDAToolkit)
+        set(CMAKE_CUDA_STANDARD 17)
+    else()
+        message(STATUS "No CUDA support")
+        set(CSTONE_WITH_CUDA OFF)
+    endif()
+endif()
+
+if(CSTONE_WITH_HIP)
+    check_language(HIP)
+    if(CMAKE_HIP_COMPILER)
+        enable_language(HIP)
+        find_package(hip)
+        set(CMAKE_HIP_STANDARD 17)
+    else()
+        message(STATUS "No HIP support")
+        set(CSTONE_WITH_HIP OFF)
+    endif()
 endif()
 
-check_language(HIP)
-if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
-    enable_language(HIP)
-    find_package(hip)
-    set(CMAKE_HIP_STANDARD 17)
+if(CSTONE_WITH_HIP AND CSTONE_WITH_CUDA)
+    message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
 endif()
 
 add_subdirectory(include)
diff --git a/domain/README.md b/domain/README.md
index 8b886081e..8a18f03e3 100644
--- a/domain/README.md
+++ b/domain/README.md
@@ -39,7 +39,7 @@ CUDA version: 11.6 or later, HIP version 5.2 or later.
 
 Example CMake invocation:
 ```shell
-CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60,80,90 -DGPU_DIRECT=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
+CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60;80;90 -DCSTONE_WITH_GPU_AWARE_MPI=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
 ```
 
 GPU-direct (RDMA) MPI communication can be turned on or off by supplying `-D GPU_DIRECT=ON`. Default is `OFF`.
diff --git a/domain/include/cstone/CMakeLists.txt b/domain/include/cstone/CMakeLists.txt
index 24066e4b4..f5a58e76d 100644
--- a/domain/include/cstone/CMakeLists.txt
+++ b/domain/include/cstone/CMakeLists.txt
@@ -14,4 +14,8 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
                            $<TARGET_OBJECTS:traversal_obj>
                            $<TARGET_OBJECTS:source_center_gpu_obj>
                            $<TARGET_OBJECTS:gpu_utils_obj>)
+
+    if (CSTONE_WITH_GPU_AWARE_MPI)
+        target_compile_definitions(cstone_gpu INTERFACE CSTONE_HAVE_GPU_AWARE_MPI)
+    endif()
 endif ()
\ No newline at end of file
diff --git a/domain/include/cstone/cuda/cub.hpp b/domain/include/cstone/cuda/cub.hpp
new file mode 100644
index 000000000..870eeb023
--- /dev/null
+++ b/domain/include/cstone/cuda/cub.hpp
@@ -0,0 +1,14 @@
+
+#pragma once
+
+#ifdef __HIPCC__
+
+#include <hipcub/hipcub.hpp>
+
+namespace cub = hipcub;
+
+#else
+
+#include <cub/cub.cuh>
+
+#endif
diff --git a/domain/include/cstone/cuda/cuda_runtime.hpp b/domain/include/cstone/cuda/cuda_runtime.hpp
new file mode 100644
index 000000000..6a33bcde6
--- /dev/null
+++ b/domain/include/cstone/cuda/cuda_runtime.hpp
@@ -0,0 +1,61 @@
+/*! @file
+ * @brief CUDA/HIP runtime API compatiblity wrapper
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ */
+
+#pragma once
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+
+#include <hip/hip_runtime.h>
+
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaError_t hipError_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventDestroy hipEventDestroy
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaFree hipFree
+#define cudaFreeHost hipFreeHost
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorName hipGetErrorName
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipMallocHost
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAttachGlobal hipMemAttachGlobal
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
+#define cudaMemoryTypeDevice hipMemoryTypeDevice
+#define cudaMemoryTypeManaged hipMemoryTypeManaged
+#define cudaMemset hipMemset
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+
+#define GPU_SYMBOL HIP_SYMBOL
+
+#else
+
+#include <cuda_runtime.h>
+
+#define GPU_SYMBOL(x) x
+
+#endif
diff --git a/domain/include/cstone/cuda/cuda_utils.cuh b/domain/include/cstone/cuda/cuda_utils.cuh
index 9ff7809d9..c03042b03 100644
--- a/domain/include/cstone/cuda/cuda_utils.cuh
+++ b/domain/include/cstone/cuda/cuda_utils.cuh
@@ -7,7 +7,8 @@
 #pragma once
 
 #include <type_traits>
-#include <cuda_runtime.h>
+#include <vector>
+#include "cuda_runtime.hpp"
 
 #include "device_vector.h"
 #include "cuda_stubs.h"
diff --git a/domain/include/cstone/cuda/device_vector.cu b/domain/include/cstone/cuda/device_vector.cu
index 38082d1a9..850d6ef9b 100644
--- a/domain/include/cstone/cuda/device_vector.cu
+++ b/domain/include/cstone/cuda/device_vector.cu
@@ -8,6 +8,8 @@
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
 
+#include "cstone/cuda/cuda_runtime.hpp"
+#include "cstone/cuda/errorcheck.cuh"
 #include "cstone/util/noinit_thrust.cuh"
 
 #include "device_vector.h"
@@ -83,7 +85,7 @@ DeviceVector<T>::DeviceVector(const T* first, const T* last)
 {
     auto size = last - first;
     impl_->resize(size);
-    cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice);
+    checkGpuErrors(cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice));
 }
 
 template<class T>
@@ -174,6 +176,7 @@ template class DeviceVector<util::array<int, 2>>;
 template class DeviceVector<util::array<int, 3>>;
 template class DeviceVector<util::array<unsigned, 1>>;
 template class DeviceVector<util::array<uint64_t, 1>>;
+template class DeviceVector<util::array<uint64_t, 2>>;
 template class DeviceVector<util::array<unsigned, 2>>;
 template class DeviceVector<util::array<float, 3>>;
 template class DeviceVector<util::array<double, 3>>;
diff --git a/domain/include/cstone/cuda/errorcheck.cuh b/domain/include/cstone/cuda/errorcheck.cuh
index cd652942f..9784668ee 100644
--- a/domain/include/cstone/cuda/errorcheck.cuh
+++ b/domain/include/cstone/cuda/errorcheck.cuh
@@ -25,7 +25,7 @@
 #pragma once
 
 #include <cstdio>
-#include <cuda_runtime.h>
+#include "cuda_runtime.hpp"
 
 inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
 {
diff --git a/domain/include/cstone/cuda/gpu_config.cuh b/domain/include/cstone/cuda/gpu_config.cuh
index e7fd19343..f2b0713d0 100644
--- a/domain/include/cstone/cuda/gpu_config.cuh
+++ b/domain/include/cstone/cuda/gpu_config.cuh
@@ -32,8 +32,7 @@
 #pragma once
 
 #include <cstdint>
-#include <cuda_runtime.h>
-
+#include "cstone/cuda/cuda_runtime.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 
 namespace cstone
diff --git a/domain/include/cstone/focus/rebalance_gpu.cu b/domain/include/cstone/focus/rebalance_gpu.cu
index 972d074e0..d1e6587a1 100644
--- a/domain/include/cstone/focus/rebalance_gpu.cu
+++ b/domain/include/cstone/focus/rebalance_gpu.cu
@@ -28,8 +28,7 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
-#include <cub/cub.cuh>
-
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 #include "cstone/focus/rebalance.hpp"
 #include "cstone/focus/rebalance_gpu.h"
@@ -159,7 +158,7 @@ bool protectAncestorsGpu(const KeyType* prefixes,
     protectAncestorsKernel<<<iceil(numNodes, numThreads), numThreads>>>(prefixes, parents, nodeOps, numNodes);
 
     int numNodesModify;
-    checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, nodeOpSum, sizeof(int)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, GPU_SYMBOL(nodeOpSum), sizeof(int)));
 
     return numNodesModify == 0;
 }
@@ -197,7 +196,7 @@ ResolutionStatus enforceKeysGpu(const KeyType* forcedKeys,
     }
 
     int status;
-    checkGpuErrors(cudaMemcpyFromSymbol(&status, enforceKeyStatus_device, sizeof(ResolutionStatus)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&status, GPU_SYMBOL(enforceKeyStatus_device), sizeof(ResolutionStatus)));
     return static_cast<ResolutionStatus>(status);
 }
 
@@ -215,4 +214,4 @@ template ResolutionStatus enforceKeysGpu(const uint64_t* forcedKeys,
                                          const TreeNodeIndex* parents,
                                          TreeNodeIndex* nodeOps);
 
-} // namespace cstone
\ No newline at end of file
+} // namespace cstone
diff --git a/domain/include/cstone/focus/source_center_gpu.h b/domain/include/cstone/focus/source_center_gpu.h
index dc573f6ad..c69e4a4e9 100644
--- a/domain/include/cstone/focus/source_center_gpu.h
+++ b/domain/include/cstone/focus/source_center_gpu.h
@@ -30,6 +30,7 @@
 
 #pragma once
 
+#include "cstone/focus/source_center.hpp"
 #include "cstone/tree/definitions.h"
 
 namespace cstone
diff --git a/domain/include/cstone/halos/gather_halos_gpu.cu b/domain/include/cstone/halos/gather_halos_gpu.cu
index 1ffb74378..f646efda0 100644
--- a/domain/include/cstone/halos/gather_halos_gpu.cu
+++ b/domain/include/cstone/halos/gather_halos_gpu.cu
@@ -30,6 +30,7 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
+#include "cstone/cuda/cuda_runtime.hpp"
 #include "cstone/primitives/math.hpp"
 #include "cstone/primitives/stl.hpp"
 #include "cstone/util/array.hpp"
diff --git a/domain/include/cstone/primitives/mpi_cuda.cuh b/domain/include/cstone/primitives/mpi_cuda.cuh
index 8e44f9067..01a647caf 100644
--- a/domain/include/cstone/primitives/mpi_cuda.cuh
+++ b/domain/include/cstone/primitives/mpi_cuda.cuh
@@ -32,13 +32,12 @@
 #pragma once
 
 #include <vector>
-#include <cuda_runtime.h>
 
 #include "cstone/primitives/mpi_wrappers.hpp"
 #include "cstone/util/noinit_alloc.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 
-#ifdef USE_GPU_DIRECT
+#ifdef CSTONE_HAVE_GPU_AWARE_MPI
 constexpr inline bool useGpuDirect = true;
 #else
 constexpr inline bool useGpuDirect = false;
diff --git a/domain/include/cstone/primitives/primitives_gpu.cu b/domain/include/cstone/primitives/primitives_gpu.cu
index 5a13a6638..1d3181cfa 100644
--- a/domain/include/cstone/primitives/primitives_gpu.cu
+++ b/domain/include/cstone/primitives/primitives_gpu.cu
@@ -29,8 +29,6 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
-#include <cub/cub.cuh>
-
 #include <thrust/binary_search.h>
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
@@ -39,6 +37,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 #include "cstone/primitives/math.hpp"
 #include "cstone/util/array.hpp"
@@ -297,7 +296,7 @@ void sortByKeyGpu(KeyType* first, KeyType* last, ValueType* values, KeyType* key
     // Determine temporary device storage requirements
     void* d_tempStorage     = nullptr;
     size_t tempStorageBytes = 0;
-    cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements);
+    checkGpuErrors(cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements));
 
     // Allocate temporary storage
     checkGpuErrors(cudaMalloc(&d_tempStorage, tempStorageBytes));
diff --git a/domain/include/cstone/primitives/warpscan.cuh b/domain/include/cstone/primitives/warpscan.cuh
index bf269e70c..66be99028 100644
--- a/domain/include/cstone/primitives/warpscan.cuh
+++ b/domain/include/cstone/primitives/warpscan.cuh
@@ -205,8 +205,6 @@ __device__ __forceinline__ GpuConfig::ThreadMask lanemask_le()
  */
 __device__ __forceinline__ int inclusiveSegscan(int value, int distance)
 {
-    // distance should be less-equal the lane index
-    assert(distance <= (threadIdx.x & (GpuConfig::warpSize - 1)));
 #pragma unroll
     for (int i = 1; i < GpuConfig::warpSize; i *= 2)
     {
@@ -242,7 +240,6 @@ __device__ __forceinline__ int inclusiveSegscanInt(const int packedValue, const
     // distance = number of preceding lanes to include in scanned value
     // e.g. if distance = 0, then no preceding lane value will be added to scannedValue
     int distance = countLeadingZeros(flags & lanemask_le()) + laneIdx - (GpuConfig::warpSize - 1);
-    assert(distance >= 0);
     int scannedValue = inclusiveSegscan(value, imin(distance, laneIdx));
 
     // the lowest lane index for which packedValue was negative, warpSize if all were positive
@@ -299,4 +296,13 @@ __device__ __forceinline__ int spreadSeg8(int val)
     return shflSync(val, laneIdx >> 3) + (laneIdx & 7);
 }
 
-} // namespace cstone
\ No newline at end of file
+__device__ __forceinline__ float atomicMinFloat(float* addr, float value)
+{
+    float old;
+    old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
+                       : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+
+    return old;
+}
+
+} // namespace cstone
diff --git a/domain/include/cstone/traversal/CMakeLists.txt b/domain/include/cstone/traversal/CMakeLists.txt
index 55a184101..d9aa0a084 100644
--- a/domain/include/cstone/traversal/CMakeLists.txt
+++ b/domain/include/cstone/traversal/CMakeLists.txt
@@ -1,8 +1,8 @@
 if(CMAKE_HIP_COMPILER)
-    set_source_files_properties(collisions_gpu.cu PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(collisions_gpu.cu groups_gpu.cu PROPERTIES LANGUAGE HIP)
 endif()
 
 if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
-    add_library(traversal_obj OBJECT collisions_gpu.cu)
+    add_library(traversal_obj OBJECT collisions_gpu.cu groups_gpu.cu)
     target_include_directories(traversal_obj PRIVATE ${PROJECT_SOURCE_DIR}/include)
-endif()
\ No newline at end of file
+endif()
diff --git a/domain/include/cstone/traversal/find_neighbors.cuh b/domain/include/cstone/traversal/find_neighbors.cuh
index c7899197a..8c0ae7c3f 100644
--- a/domain/include/cstone/traversal/find_neighbors.cuh
+++ b/domain/include/cstone/traversal/find_neighbors.cuh
@@ -1,8 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/domain/include/cstone/traversal/groups_gpu.cu b/domain/include/cstone/traversal/groups_gpu.cu
new file mode 100644
index 000000000..c0cf61148
--- /dev/null
+++ b/domain/include/cstone/traversal/groups_gpu.cu
@@ -0,0 +1,161 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*! @file
+ * @brief Particle target grouping
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ */
+
+#include <stdexcept>
+
+#include "cstone/cuda/device_vector.h"
+#include "cstone/primitives/math.hpp"
+#include "cstone/traversal/groups_gpu.cuh"
+
+namespace cstone
+{
+
+//! @brief simple-fixed width group targets
+__global__ static void
+fixedGroupsKernel(LocalIndex first, LocalIndex last, unsigned groupSize, LocalIndex* groups, unsigned numGroups)
+{
+    LocalIndex tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    LocalIndex groupIdx = tid / groupSize;
+    if (groupIdx >= numGroups) { return; }
+
+    if (tid == groupIdx * groupSize)
+    {
+        LocalIndex scan  = first + groupIdx * groupSize;
+        groups[groupIdx] = scan;
+        if (groupIdx + 1 == numGroups) { groups[numGroups] = last; }
+    }
+}
+
+void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData<GpuTag>& groups)
+{
+    LocalIndex numBodies = last - first;
+    LocalIndex numGroups = iceil(numBodies, groupSize);
+    groups.data.resize(numGroups + 1);
+    fixedGroupsKernel<<<iceil(numBodies, 256), 256>>>(first, last, groupSize, rawPtr(groups.data), numGroups);
+
+    groups.firstBody  = first;
+    groups.lastBody   = last;
+    groups.numGroups  = numGroups;
+    groups.groupStart = rawPtr(groups.data);
+    groups.groupEnd   = rawPtr(groups.data) + 1;
+}
+
+//! @brief convenience wrapper for groupSplitsKernel
+template<unsigned groupSize, class Tc, class T, class KeyType>
+void computeGroupSplitsImpl(
+    LocalIndex first,
+    LocalIndex last,
+    const Tc* x,
+    const Tc* y,
+    const Tc* z,
+    const T* h,
+    const KeyType* leaves,
+    TreeNodeIndex numLeaves,
+    const LocalIndex* layout,
+    const Box<Tc> box,
+    float tolFactor,
+    DeviceVector<util::array<GpuConfig::ThreadMask, groupSize / GpuConfig::warpSize>>& splitMasks,
+    DeviceVector<LocalIndex>& numSplitsPerGroup,
+    DeviceVector<LocalIndex>& groups)
+{
+    LocalIndex numParticles   = last - first;
+    LocalIndex numFixedGroups = iceil(numParticles, groupSize);
+    unsigned numThreads       = 256;
+    unsigned gridSize         = numFixedGroups * GpuConfig::warpSize;
+
+    splitMasks.resize(numFixedGroups);
+
+    numSplitsPerGroup.reserve(numFixedGroups * 1.1);
+    numSplitsPerGroup.resize(numFixedGroups);
+
+    groupSplitsKernel<groupSize><<<iceil(gridSize, numThreads), numThreads>>>(
+        first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor, rawPtr(splitMasks),
+        rawPtr(numSplitsPerGroup), numFixedGroups);
+
+    groups.reserve(numFixedGroups * 1.1);
+    groups.resize(numFixedGroups + 1);
+    exclusiveScanGpu(rawPtr(numSplitsPerGroup), rawPtr(numSplitsPerGroup) + numFixedGroups + 1, rawPtr(groups));
+    LocalIndex newNumGroups;
+    memcpyD2H(rawPtr(groups) + groups.size() - 1, 1, &newNumGroups);
+
+    auto& newGroupSizes = numSplitsPerGroup;
+    newGroupSizes.resize(newNumGroups);
+
+    makeSplitsKernel<<<numFixedGroups, numThreads>>>(rawPtr(splitMasks), rawPtr(groups), numFixedGroups,
+                                                     rawPtr(newGroupSizes));
+
+    groups.resize(newNumGroups + 1);
+    exclusiveScanGpu(rawPtr(newGroupSizes), rawPtr(newGroupSizes) + newNumGroups + 1, rawPtr(groups), first);
+    memcpyH2D(&last, 1, rawPtr(groups) + groups.size() - 1);
+}
+
+template<class Tc, class T, class KeyType>
+void computeGroupSplits(LocalIndex first,
+                        LocalIndex last,
+                        const Tc* x,
+                        const Tc* y,
+                        const Tc* z,
+                        const T* h,
+                        const KeyType* leaves,
+                        TreeNodeIndex numLeaves,
+                        const LocalIndex* layout,
+                        const Box<Tc> box,
+                        unsigned groupSize,
+                        float tolFactor,
+                        DeviceVector<LocalIndex>& numSplitsPerGroup,
+                        DeviceVector<LocalIndex>& groups)
+{
+    if (groupSize == GpuConfig::warpSize)
+    {
+        DeviceVector<util::array<GpuConfig::ThreadMask, 1>> splitMasks;
+        computeGroupSplitsImpl<GpuConfig::warpSize>(first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor,
+                                                    splitMasks, numSplitsPerGroup, groups);
+    }
+    else if (groupSize == 2 * GpuConfig::warpSize)
+    {
+        DeviceVector<util::array<GpuConfig::ThreadMask, 2>> splitMasks;
+        computeGroupSplitsImpl<2 * GpuConfig::warpSize>(first, last, x, y, z, h, leaves, numLeaves, layout, box,
+                                                        tolFactor, splitMasks, numSplitsPerGroup, groups);
+    }
+    else { throw std::runtime_error("Unsupported spatial group size\n"); }
+}
+
+#define COMPUTE_GROUP_SPLITS(Tc, T, KeyType)                                                                           \
+    template void computeGroupSplits(LocalIndex first, LocalIndex last, const Tc* x, const Tc* y, const Tc* z,         \
+                                     const T* h, const KeyType* leaves, TreeNodeIndex numLeaves,                       \
+                                     const LocalIndex* layout, const Box<Tc> box, unsigned groupSize, float tolFactor, \
+                                     DeviceVector<LocalIndex>& numSplitsPerGroup, DeviceVector<LocalIndex>& groups);
+
+COMPUTE_GROUP_SPLITS(double, double, uint64_t);
+COMPUTE_GROUP_SPLITS(double, float, uint64_t);
+COMPUTE_GROUP_SPLITS(float, float, uint64_t);
+
+} // namespace cstone
diff --git a/domain/include/cstone/traversal/groups.cuh b/domain/include/cstone/traversal/groups_gpu.cuh
similarity index 71%
rename from domain/include/cstone/traversal/groups.cuh
rename to domain/include/cstone/traversal/groups_gpu.cuh
index 319af4710..4dc6c85b7 100644
--- a/domain/include/cstone/traversal/groups.cuh
+++ b/domain/include/cstone/traversal/groups_gpu.cuh
@@ -1,7 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2023 CSCS, ETH Zurich, University of Basel, University of Zurich
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -23,56 +23,24 @@
  */
 
 /*! @file
- * @brief Particle target grouping
+ * @brief Particle target grouping.
  *
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ *
+ * Separate header to allow inclusion in library as well as in unit testing of device-side functions
  */
 
-#pragma once
-
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/cuda/device_vector.h"
 #include "cstone/cuda/gpu_config.cuh"
-#include "cstone/primitives/math.hpp"
 #include "cstone/primitives/primitives_gpu.h"
 #include "cstone/primitives/warpscan.cuh"
 #include "cstone/sfc/sfc.hpp"
-#include "cstone/traversal/groups.hpp"
+#include "cstone/traversal/groups_gpu.h"
+#include "cstone/util/array.hpp"
 
 namespace cstone
 {
 
-//! @brief simple-fixed width group targets
-__global__ static void
-fixedGroupsKernel(LocalIndex first, LocalIndex last, unsigned groupSize, LocalIndex* groups, unsigned numGroups)
-{
-    LocalIndex tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    LocalIndex groupIdx = tid / groupSize;
-    if (groupIdx >= numGroups) { return; }
-
-    if (tid == groupIdx * groupSize)
-    {
-        LocalIndex scan  = first + groupIdx * groupSize;
-        groups[groupIdx] = scan;
-        if (groupIdx + 1 == numGroups) { groups[numGroups] = last; }
-    }
-}
-
-static void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData<GpuTag>& groups)
-{
-    LocalIndex numBodies = last - first;
-    LocalIndex numGroups = iceil(numBodies, groupSize);
-    groups.data.resize(numGroups + 1);
-    fixedGroupsKernel<<<iceil(numBodies, 256), 256>>>(first, last, groupSize, rawPtr(groups.data), numGroups);
-
-    groups.firstBody  = first;
-    groups.lastBody   = last;
-    groups.numGroups  = numGroups;
-    groups.groupStart = rawPtr(groups.data);
-    groups.groupEnd   = rawPtr(groups.data) + 1;
-}
-
 /*! @brief Computes splitting patterns of target particle groups
  *
  * @tparam    T            float or double
@@ -263,52 +231,4 @@ __global__ void groupSplitsKernel(LocalIndex first,
     }
 }
 
-//! @brief convenience wrapper for groupSplitsKernel
-template<unsigned groupSize, class Tc, class T, class KeyType>
-void computeGroupSplits(LocalIndex first,
-                        LocalIndex last,
-                        const Tc* x,
-                        const Tc* y,
-                        const Tc* z,
-                        const T* h,
-                        const KeyType* leaves,
-                        TreeNodeIndex numLeaves,
-                        const LocalIndex* layout,
-                        const Box<Tc> box,
-                        float tolFactor,
-                        DeviceVector<util::array<GpuConfig::ThreadMask, groupSize / GpuConfig::warpSize>>& splitMasks,
-                        DeviceVector<LocalIndex>& numSplitsPerGroup,
-                        DeviceVector<LocalIndex>& groups)
-{
-    LocalIndex numParticles   = last - first;
-    LocalIndex numFixedGroups = iceil(numParticles, groupSize);
-    unsigned numThreads       = 256;
-    unsigned gridSize         = numFixedGroups * GpuConfig::warpSize;
-
-    splitMasks.resize(numFixedGroups);
-
-    numSplitsPerGroup.reserve(numFixedGroups * 1.1);
-    numSplitsPerGroup.resize(numFixedGroups);
-
-    groupSplitsKernel<groupSize><<<iceil(gridSize, numThreads), numThreads>>>(
-        first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor, rawPtr(splitMasks),
-        rawPtr(numSplitsPerGroup), numFixedGroups);
-
-    groups.reserve(numFixedGroups * 1.1);
-    groups.resize(numFixedGroups + 1);
-    exclusiveScanGpu(rawPtr(numSplitsPerGroup), rawPtr(numSplitsPerGroup) + numFixedGroups + 1, rawPtr(groups));
-    LocalIndex newNumGroups;
-    memcpyD2H(rawPtr(groups) + groups.size() - 1, 1, &newNumGroups);
-
-    auto& newGroupSizes = numSplitsPerGroup;
-    newGroupSizes.resize(newNumGroups);
-
-    makeSplitsKernel<<<numFixedGroups, numThreads>>>(rawPtr(splitMasks), rawPtr(groups), numFixedGroups,
-                                                     rawPtr(newGroupSizes));
-
-    groups.resize(newNumGroups + 1);
-    exclusiveScanGpu(rawPtr(newGroupSizes), rawPtr(newGroupSizes) + newNumGroups + 1, rawPtr(groups), first);
-    memcpyH2D(&last, 1, rawPtr(groups) + groups.size() - 1);
-}
-
 } // namespace cstone
diff --git a/domain/include/cstone/traversal/groups_gpu.h b/domain/include/cstone/traversal/groups_gpu.h
new file mode 100644
index 000000000..be50bd61c
--- /dev/null
+++ b/domain/include/cstone/traversal/groups_gpu.h
@@ -0,0 +1,90 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*! @file
+ * @brief Spatial target particle grouping
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ */
+
+#pragma once
+
+#include "cstone/cuda/device_vector.h"
+#include "cstone/sfc/box.hpp"
+#include "cstone/traversal/groups.hpp"
+
+namespace cstone
+{
+
+/*! @brief set up fixed-size particle groups
+ * @param[in]  first      first local particle index
+ * @param[in]  last       last local particle index
+ * @param[in]  groupSize  number of particles per group
+ * @param[out] groups     groups with fixed size @p groupSize
+ */
+void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData<GpuTag>& groups);
+
+/*!* @brief Compute groups of particles with a maximum size and distance between consecutive particles limited
+ *
+ * @tparam groupSize                integer
+ * @tparam Tc                       float or double
+ * @tparam T
+ * @tparam KeyType
+ * @param[in]  first                index of first particle in @p x,y,z,h buffers assigned to local domain
+ * @param[in]  last                 index of ilast particle in @p x,y,z,h buffers assigned to local domain
+ * @param[in]  x                    x coordinates
+ * @param[in]  y                    y coordinates
+ * @param[in]  z                    z coordinates
+ * @param[in]  h                    smoothin lengths
+ * @param[in]  leaves               cornerstone leaf array, size is @numLeaves + 1
+ * @param[in]  numLeaves            number of leaves in @p leaves
+ * @param[in]  layout               element i stores the particle index of the first particle of leaf cell i
+ * @param[in]  box                  global coordinate bounding box
+ * @param[in]  groupSize            maximum number of particles per group
+ * @param[in]  tolFactor            tolerance factor for maximum distance between consecutive particles in group
+ * @param[-]   splitMasks           temporary scratch usage
+ * @param[-]   numSplitsPerGroup    temporary scratch usage
+ * @param[out] groups               particle groups stored as indices into coordinate arrays
+ *
+ * This function first creates groups of fixed size @param groupSize. The resulting groups will be split into smaller
+ * groups until no distance between consecutive particles is bigger than @p tolFactor times edge length of the smallest
+ * leaf cell of any particle in the group. Edge length is computed as the cubic root of the cell volume.
+ */
+template<class Tc, class T, class KeyType>
+extern void computeGroupSplits(LocalIndex first,
+                               LocalIndex last,
+                               const Tc* x,
+                               const Tc* y,
+                               const Tc* z,
+                               const T* h,
+                               const KeyType* leaves,
+                               TreeNodeIndex numLeaves,
+                               const LocalIndex* layout,
+                               const Box<Tc> box,
+                               unsigned groupSize,
+                               float tolFactor,
+                               DeviceVector<LocalIndex>& numSplitsPerGroup,
+                               DeviceVector<LocalIndex>& groups);
+
+} // namespace cstone
diff --git a/domain/include/cstone/tree/csarray_gpu.cu b/domain/include/cstone/tree/csarray_gpu.cu
index a3cb6fdca..47c477ff6 100644
--- a/domain/include/cstone/tree/csarray_gpu.cu
+++ b/domain/include/cstone/tree/csarray_gpu.cu
@@ -36,6 +36,7 @@
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
+#include "cstone/cuda/cuda_runtime.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 #include "cstone/primitives/math.hpp"
 #include "csarray.hpp"
diff --git a/domain/test/integration_mpi/CMakeLists.txt b/domain/test/integration_mpi/CMakeLists.txt
index 5aae73878..e7382b6c2 100644
--- a/domain/test/integration_mpi/CMakeLists.txt
+++ b/domain/test/integration_mpi/CMakeLists.txt
@@ -48,9 +48,6 @@ function(addCstoneGpuMpiTest source exename testname ranks)
         target_compile_definitions(${exename} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
         set_target_properties(${exename} PROPERTIES LINKER_LANGUAGE CXX)
     endif ()
-    if (GPU_DIRECT)
-        target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT)
-    endif ()
 endfunction()
 
 if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
diff --git a/domain/test/integration_mpi/exchange_halos_gpu.cpp b/domain/test/integration_mpi/exchange_halos_gpu.cpp
index f8066394a..0330fd553 100644
--- a/domain/test/integration_mpi/exchange_halos_gpu.cpp
+++ b/domain/test/integration_mpi/exchange_halos_gpu.cpp
@@ -66,7 +66,7 @@ void gpuDirect(int rank)
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
-#ifdef USE_GPU_DIRECT
+#ifdef CSTONE_HAVE_GPU_AWARE_MPI
 TEST(HaloExchange, gpuDirect)
 #else
 TEST(HaloExchange, DISABLED_gpuDirect)
diff --git a/domain/test/performance/CMakeLists.txt b/domain/test/performance/CMakeLists.txt
index e2b4138b9..ed3f47747 100644
--- a/domain/test/performance/CMakeLists.txt
+++ b/domain/test/performance/CMakeLists.txt
@@ -7,15 +7,15 @@ cstone_add_performance_test(scan.cpp scan_perf)
 # only scan.cpp provides some coverage beyond the unit tests
 cstone_add_test(scan_perf EXECUTABLE scan_perf RANKS 1)
 
-if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
+#if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
+if(CMAKE_CUDA_COMPILER) # disabled for HIP
     # uses the OpenMP implementation of thrust::sort_by_key
     cstone_add_performance_test(hilbert.cpp hilbert_perf)
-    target_compile_definitions(hilbert_perf PRIVATE THRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_OMP)
     if (CMAKE_CUDA_COMPILER)
         target_link_libraries(hilbert_perf PRIVATE CUDA::cudart)
     else ()
-        target_compile_definitions(hilbert_perf PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
-        target_link_libraries(hilbert_perf PRIVATE hip::host)
+        #set_source_files_properties(hilbert.cpp PROPERTIES LANGUAGE HIP)
+        target_link_libraries(hilbert_perf PRIVATE OpenMP::OpenMP_CXX hip::host)
     endif ()
 endif ()
 
diff --git a/domain/test/performance/neighbor_driver.cu b/domain/test/performance/neighbor_driver.cu
index 3407bb7a9..0cd0da221 100644
--- a/domain/test/performance/neighbor_driver.cu
+++ b/domain/test/performance/neighbor_driver.cu
@@ -38,7 +38,6 @@
 #include "cstone/cuda/cuda_utils.cuh"
 #include "cstone/cuda/thrust_util.cuh"
 #include "cstone/findneighbors.hpp"
-#include "cstone/primitives/math.hpp"
 
 #include "cstone/traversal/find_neighbors.cuh"
 
diff --git a/domain/test/performance/timing.cuh b/domain/test/performance/timing.cuh
index 5f65f670d..bb93fd013 100644
--- a/domain/test/performance/timing.cuh
+++ b/domain/test/performance/timing.cuh
@@ -32,6 +32,7 @@
 #pragma once
 
 #include <chrono>
+#include "cstone/cuda/errorcheck.cuh"
 
 #ifdef __CUDACC__
 
@@ -66,21 +67,21 @@ template<class F>
 float timeGpu(F&& f)
 {
     hipEvent_t start, stop;
-    hipEventCreate(&start);
-    hipEventCreate(&stop);
+    checkGpuErrors(hipEventCreate(&start));
+    checkGpuErrors(hipEventCreate(&stop));
 
-    hipEventRecord(start, hipStreamDefault);
+    checkGpuErrors(hipEventRecord(start, hipStreamDefault));
 
     f();
 
-    hipEventRecord(stop, hipStreamDefault);
-    hipEventSynchronize(stop);
+    checkGpuErrors(hipEventRecord(stop, hipStreamDefault));
+    checkGpuErrors(hipEventSynchronize(stop));
 
     float t0;
-    hipEventElapsedTime(&t0, start, stop);
+    checkGpuErrors(hipEventElapsedTime(&t0, start, stop));
 
-    hipEventDestroy(start);
-    hipEventDestroy(stop);
+    checkGpuErrors(hipEventDestroy(start));
+    checkGpuErrors(hipEventDestroy(stop));
 
     return t0;
 }
@@ -94,4 +95,4 @@ float timeCpu(F&& f)
     f();
     auto t1 = std::chrono::high_resolution_clock::now();
     return std::chrono::duration<float>(t1 - t0).count();
-}
\ No newline at end of file
+}
diff --git a/domain/test/unit_cuda/traversal/groups.cu b/domain/test/unit_cuda/traversal/groups.cu
index ac7ba29d9..824d33a60 100644
--- a/domain/test/unit_cuda/traversal/groups.cu
+++ b/domain/test/unit_cuda/traversal/groups.cu
@@ -1,8 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -38,9 +37,10 @@
 #include <thrust/host_vector.h>
 #include <thrust/sequence.h>
 
-#include "cstone/cuda/device_vector.h"
 #include "cstone/cuda/thrust_util.cuh"
-#include "cstone/traversal/groups.cuh"
+#include "cstone/primitives/math.hpp"
+#include "cstone/traversal/groups_gpu.cuh"
+#include "cstone/traversal/groups_gpu.h"
 #include "cstone/tree/cs_util.hpp"
 
 using namespace cstone;
@@ -51,15 +51,13 @@ using SplitType             = util::array<GpuConfig::ThreadMask, nwt>;
 
 TEST(TargetGroups, t0)
 {
-    using T              = double;
-    LocalIndex numGroups = 4, groupSize = 8, first = 4, last = 34;
+    LocalIndex groupSize = 8, first = 4, last = 34;
 
-    thrust::device_vector<LocalIndex> groups(numGroups + 1);
-    fixedGroupsKernel<<<iceil(last - first, 64), 64>>>(first, last, groupSize, rawPtr(groups),
-                                                       iceil(last - first, groupSize));
+    GroupData<GpuTag> groups;
+    computeFixedGroups(first, last, groupSize, groups);
 
-    thrust::host_vector<LocalIndex> hgroups = groups;
-    thrust::host_vector<LocalIndex> ref     = std::vector<LocalIndex>{4, 12, 20, 28, 34};
+    std::vector<LocalIndex> hgroups = toHost(groups.data);
+    std::vector<LocalIndex> ref{4, 12, 20, 28, 34};
     EXPECT_EQ(hgroups, ref);
 }
 
@@ -276,12 +274,11 @@ TEST(TargetGroups, groupVolumes)
     }
 
     {
-        DeviceVector<SplitType> S;
         DeviceVector<LocalIndex> temp, groups;
 
         float tolFactor = std::sqrt(3.0) / distCrit * 1.01;
-        computeGroupSplits<groupSize>(first, last, rawPtr(x), rawPtr(y), rawPtr(z), rawPtr(h), rawPtr(d_leaves),
-                                      nNodes(leaves), rawPtr(d_layout), box, tolFactor, S, temp, groups);
+        computeGroupSplits(first, last, rawPtr(x), rawPtr(y), rawPtr(z), rawPtr(h), rawPtr(d_leaves), nNodes(leaves),
+                           rawPtr(d_layout), box, groupSize, tolFactor, temp, groups);
 
         std::vector<LocalIndex> h_groups = toHost(groups);
         EXPECT_EQ(h_groups.size(), 4);
diff --git a/main/src/analytical_solutions/CMakeLists.txt b/main/src/analytical_solutions/CMakeLists.txt
index e088a7d97..56370dea3 100644
--- a/main/src/analytical_solutions/CMakeLists.txt
+++ b/main/src/analytical_solutions/CMakeLists.txt
@@ -1,5 +1,8 @@
+# temporary fix for hip build
+if(NOT DEFINED CMAKE_INSTALL_BINDIR)
+  set(CMAKE_INSTALL_BINDIR "bin")
+endif(NOT DEFINED CMAKE_INSTALL_BINDIR)
 
 add_subdirectory(sedov_solution)
 
-install(FILES compare_solutions.py TYPE BIN)
-install(FILES compare_noh.py TYPE BIN)
+install(PROGRAMS compare_noh.py compare_solutions.py DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/main/src/propagator/CMakeLists.txt b/main/src/propagator/CMakeLists.txt
index 2a7a359d2..42be9843e 100644
--- a/main/src/propagator/CMakeLists.txt
+++ b/main/src/propagator/CMakeLists.txt
@@ -13,9 +13,6 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
             ${SPH_DIR} ${RYOANJI_DIR} ${MPI_CXX_INCLUDE_PATH})
     target_link_libraries(propagator_gpu PRIVATE ${MPI_CXX_LIBRARIES} cstone_gpu ryoanji sph_gpu util OpenMP::OpenMP_CXX)
     enableGrackle(propagator_gpu)
-    if (GPU_DIRECT)
-        target_compile_definitions(propagator_gpu PRIVATE USE_GPU_DIRECT)
-    endif ()
 endif ()
 
 if (CMAKE_CUDA_COMPILER)
diff --git a/main/src/sphexa/CMakeLists.txt b/main/src/sphexa/CMakeLists.txt
index 7617764ea..501243d92 100644
--- a/main/src/sphexa/CMakeLists.txt
+++ b/main/src/sphexa/CMakeLists.txt
@@ -16,12 +16,6 @@ function(enableInSituViz exename)
     endif()
 endfunction()
 
-function(enableGpuDirect exename)
-    if(GPU_DIRECT)
-        target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT)
-    endif()
-endfunction()
-
 set(exename sphexa)
 add_executable(${exename} sphexa.cpp)
 target_include_directories(${exename} PRIVATE ${SPH_EXA_INCLUDE_DIRS})
@@ -40,7 +34,6 @@ if(CMAKE_CUDA_COMPILER)
     target_link_libraries(${exename}-cuda PRIVATE ryoanji sph_gpu io sim_init_gpu propagator_gpu observables_gpu OpenMP::OpenMP_CXX
         ${MPI_CXX_LIBRARIES} CUDA::cudart)
     enableInSituViz(${exename}-cuda)
-    enableGpuDirect(${exename}-cuda)
     enableGrackle(${exename}-cuda)
     install(TARGETS ${exename}-cuda RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
@@ -52,7 +45,6 @@ elseif(CMAKE_HIP_COMPILER)
         ${MPI_CXX_LIBRARIES} hip::host)
     set_target_properties(${exename}-hip PROPERTIES LINKER_LANGUAGE CXX)
     enableInSituViz(${exename}-hip)
-    enableGpuDirect(${exename}-hip)
     enableGrackle(${exename}-hip)
     install(TARGETS ${exename}-hip RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()
diff --git a/main/src/util/pm_reader.hpp b/main/src/util/pm_reader.hpp
index 81b1b1cbc..052982811 100644
--- a/main/src/util/pm_reader.hpp
+++ b/main/src/util/pm_reader.hpp
@@ -1,3 +1,33 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*! @file
+ * @brief Cray power measurement counter reading
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com
+ */
+
 #pragma once
 
 #include <cstdio>
@@ -33,25 +63,26 @@ class PmReader
     {
     }
 
-    void addCounters(const std::string& pmRoot, int numRanksPerNode_)
+    void addCounters(const std::string& pmRoot, int numRanksPerNode)
     {
-        numRanksPerNode   = numRanksPerNode_;
-        using CounterDesc = std::tuple<std::string, std::string, std::function<bool(int, int)>>;
-        std::vector<CounterDesc> countersToAdd{
-            {"node", pmRoot + "/energy", [](int r, int numRanksPerNode) { return r % numRanksPerNode == 0; }},
-            {"acc", pmRoot + "/accel" + std::to_string(rank_ % numRanksPerNode) + "_energy",
-             [](int, int) { return true; }}};
-
-        for (auto& c : countersToAdd)
+        numRanksPerNode_ = numRanksPerNode;
+        // energy per compute node, only the first rank per node reads the node energy counter
+        {
+            std::string path   = pmRoot + "/energy";
+            bool        enable = (rank_ % numRanksPerNode == 0) && std::filesystem::exists(path);
+            pmCounters.emplace_back("node", path, std::vector<PmType>{}, std::vector<PmType>{}, enable);
+        }
+        // energy per accelerator
         {
-            bool enable = std::filesystem::exists(get<1>(c)) && get<2>(c)(rank_, numRanksPerNode);
-            pmCounters.emplace_back(get<0>(c), get<1>(c), std::vector<PmType>{}, std::vector<PmType>{}, enable);
+            std::string path   = pmRoot + "/accel" + std::to_string(rank_ % numRanksPerNode) + "_energy";
+            bool        enable = std::filesystem::exists(path);
+            pmCounters.emplace_back("acc", path, std::vector<PmType>{}, std::vector<PmType>{}, enable);
         }
     }
 
     void start()
     {
-        numStartCalled++;
+        numStartCalled_++;
         readPm();
     }
 
@@ -83,15 +114,15 @@ class PmReader
 
             ar->addStep(0, pmValues.size(), outFile + ar->suffix());
             ar->stepAttribute("numRanks", &numRanks, 1);
-            ar->stepAttribute("numRanksPerNode", &numRanks, 1);
-            ar->stepAttribute("numIterations", &numStartCalled, 1);
+            ar->stepAttribute("numRanksPerNode", &numRanksPerNode_, 1);
+            ar->stepAttribute("numIterations", &numStartCalled_, 1);
             ar->writeField(pmName, pmValuesRebased.data(), pmValuesRebased.size());
             ar->writeField(pmName + "_timeStamps", pmTimeStampsRebased.data(), pmTimeStampsRebased.size());
             ar->closeStep();
             pmValues.clear();
             pmTimeStamps.clear();
         }
-        numStartCalled = 0;
+        numStartCalled_ = 0;
     }
 
 private:
@@ -107,7 +138,7 @@ class PmReader
         }
     }
 
-    int rank_, numRanksPerNode{0}, numStartCalled{0};
+    int rank_, numRanksPerNode_{0}, numStartCalled_{0};
 
     //                     name         filepath      counter reading      time-stamp reading  enabled
     std::vector<std::tuple<std::string, std::string, std::vector<PmType>, std::vector<PmType>, bool>> pmCounters;
diff --git a/ryoanji/CMakeLists.txt b/ryoanji/CMakeLists.txt
index c16122971..4e17e1782 100644
--- a/ryoanji/CMakeLists.txt
+++ b/ryoanji/CMakeLists.txt
@@ -17,20 +17,35 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-    find_package(CUDAToolkit)
-    set(CMAKE_CUDA_STANDARD 17)
-else()
-    message(STATUS "No CUDA support")
+option(RYOANJI_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
+option(RYOANJI_WITH_HIP "Enable building for AMD GPUs" ON)
+
+if(RYOANJI_WITH_CUDA)
+    check_language(CUDA)
+    if(CMAKE_CUDA_COMPILER)
+        enable_language(CUDA)
+        find_package(CUDAToolkit)
+        set(CMAKE_CUDA_STANDARD 17)
+    else()
+        message(STATUS "No CUDA support")
+        set(RYOANJI_WITH_CUDA OFF)
+    endif()
 endif()
 
-check_language(HIP)
-if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
-    enable_language(HIP)
-    find_package(hip)
-    set(CMAKE_HIP_STANDARD 17)
+if(RYOANJI_WITH_HIP)
+    check_language(HIP)
+    if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
+        enable_language(HIP)
+        find_package(hip)
+        set(CMAKE_HIP_STANDARD 17)
+    else()
+        message(STATUS "No HIP support")
+        set(RYOANJI_WITH_HIP OFF)
+    endif()
+endif()
+
+if(RYOANJI_WITH_HIP AND RYOANJI_WITH_CUDA)
+    message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
 endif()
 
 add_subdirectory(src)
diff --git a/ryoanji/src/ryoanji/CMakeLists.txt b/ryoanji/src/ryoanji/CMakeLists.txt
index ac8e8fa7e..2d51c3d19 100644
--- a/ryoanji/src/ryoanji/CMakeLists.txt
+++ b/ryoanji/src/ryoanji/CMakeLists.txt
@@ -1,2 +1,3 @@
 
 add_subdirectory(interface)
+add_subdirectory(nbody)
\ No newline at end of file
diff --git a/ryoanji/src/ryoanji/interface/CMakeLists.txt b/ryoanji/src/ryoanji/interface/CMakeLists.txt
index 9abbd1a8e..fb23d9276 100644
--- a/ryoanji/src/ryoanji/interface/CMakeLists.txt
+++ b/ryoanji/src/ryoanji/interface/CMakeLists.txt
@@ -1,13 +1,8 @@
 if(CMAKE_HIP_COMPILER)
-    set_source_files_properties(ewald.cu treebuilder.cu multipole_holder.cu PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(ewald.cu multipole_holder.cu PROPERTIES LANGUAGE HIP)
 endif()
 
 if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
-    add_library(cstone_tree treebuilder.cu)
-    target_include_directories(cstone_tree PUBLIC ${PROJECT_SOURCE_DIR}/src)
-    target_include_directories(cstone_tree PUBLIC ${CSTONE_DIR})
-    target_link_libraries(cstone_tree cstone_gpu)
-
     # workaround for compiling MPI with nvcc on cray systems
     if (MPI_CXX_INCLUDE_PATH STREQUAL "" AND DEFINED ENV{CRAY_MPICH2_DIR} OR DEFINED ENV{CRAY_MPICH_DIR})
         message(STATUS "Applying MPI include workaround on CRAY")
@@ -17,5 +12,5 @@ if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
 
     add_library(ryoanji ewald.cu multipole_holder.cu)
     target_include_directories(ryoanji PUBLIC ${PROJECT_SOURCE_DIR}/src ${CSTONE_DIR} ${MPI_CXX_INCLUDE_PATH})
-    target_link_libraries(ryoanji PUBLIC cstone_gpu ${MPI_CXX_LIBRARIES})
+    target_link_libraries(ryoanji PUBLIC cstone_gpu ryoanji_kernels ${MPI_CXX_LIBRARIES})
 endif()
diff --git a/ryoanji/src/ryoanji/interface/ewald.cu b/ryoanji/src/ryoanji/interface/ewald.cu
index c384fb224..8f7a55a12 100644
--- a/ryoanji/src/ryoanji/interface/ewald.cu
+++ b/ryoanji/src/ryoanji/interface/ewald.cu
@@ -4,8 +4,7 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
-#include <cub/cub.cuh>
-
+#include "cstone/cuda/cub.hpp"
 #include "cstone/traversal/groups.hpp"
 #include "ryoanji/nbody/ewald.hpp"
 
@@ -84,7 +83,7 @@ void computeGravityEwaldGpu(const cstone::Vec3<Tc>& rootCenter, const MType& Mro
     computeGravityEwaldKernel<<<numBlocks, numThreads>>>(grp, x, y, z, m, G, ugrav, ax, ay, az, ewaldParamsGpu);
 
     float totalPotential;
-    checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalEwaldPotentialGlob, sizeof(float)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalEwaldPotentialGlob), sizeof(float)));
     checkGpuErrors(cudaFree(ewaldParamsGpu));
 
     *ugravTot += 0.5 * G * totalPotential;
diff --git a/ryoanji/src/ryoanji/interface/multipole_holder.cu b/ryoanji/src/ryoanji/interface/multipole_holder.cu
index 4e6729258..f2d0ea3f9 100644
--- a/ryoanji/src/ryoanji/interface/multipole_holder.cu
+++ b/ryoanji/src/ryoanji/interface/multipole_holder.cu
@@ -1,8 +1,8 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -34,10 +34,11 @@
 #include "cstone/cuda/cuda_utils.cuh"
 #include "cstone/cuda/thrust_util.cuh"
 #include "cstone/primitives/math.hpp"
+#include "cstone/traversal/groups_gpu.h"
 #include "cstone/util/reallocate.hpp"
 #include "ryoanji/nbody/cartesian_qpole.hpp"
 #include "ryoanji/nbody/direct.cuh"
-#include "ryoanji/nbody/upwardpass.cuh"
+#include "ryoanji/nbody/upsweep_gpu.h"
 #include "ryoanji/nbody/upsweep_cpu.hpp"
 #include "ryoanji/nbody/traversal.cuh"
 #include "multipole_holder.cuh"
@@ -56,13 +57,11 @@ public:
                                    const Th* h, const cstone::FocusedOctree<KeyType, Tf, cstone::GpuTag>& focusTree,
                                    const cstone::LocalIndex* layout, const cstone::Box<Tc>& box)
     {
-        cstone::DeviceVector<util::array<GpuConfig::ThreadMask, TravConfig::nwt>> S;
-
         auto  d_leaves  = focusTree.treeLeavesAcc();
         float tolFactor = 2.0f;
-        cstone::computeGroupSplits<TravConfig::targetSize>(first, last, x, y, z, h, d_leaves.data(),
-                                                           d_leaves.size() - 1, layout, box, tolFactor, S,
-                                                           traversalStack_, groups_.data);
+        cstone::computeGroupSplits(first, last, x, y, z, h, d_leaves.data(), d_leaves.size() - 1, layout, box,
+                                   TravConfig::targetSize, tolFactor, traversalStack_, groups_.data);
+
         groups_.firstBody  = first;
         groups_.lastBody   = last;
         groups_.numGroups  = groups_.data.size() - 1;
@@ -75,9 +74,7 @@ public:
                  const cstone::FocusedOctree<KeyType, Tf, cstone::GpuTag>& focusTree, const cstone::LocalIndex* layout,
                  MType* multipoles)
     {
-        constexpr int numThreads = UpsweepConfig::numThreads;
-        octree_                  = focusTree.octreeViewAcc();
-
+        octree_ = focusTree.octreeViewAcc();
         resize(octree_.numLeafNodes);
 
         auto globalCenters = focusTree.globalExpansionCenters();
@@ -85,9 +82,8 @@ public:
         layout_  = layout;
         centers_ = focusTree.expansionCentersAcc().data();
 
-        computeLeafMultipoles<<<(octree_.numLeafNodes - 1) / numThreads + 1, numThreads>>>(
-            x, y, z, m, octree_.leafToInternal + octree_.numInternalNodes, octree_.numLeafNodes, layout_, centers_,
-            rawPtr(multipoles_));
+        computeLeafMultipoles(x, y, z, m, octree_.leafToInternal + octree_.numInternalNodes, octree_.numLeafNodes,
+                              layout_, centers_, rawPtr(multipoles_));
 
         std::vector<TreeNodeIndex> levelRange(cstone::maxTreeLevel<KeyType>{} + 2);
         memcpyD2H(octree_.levelRange, levelRange.size(), levelRange.data());
@@ -97,11 +93,10 @@ public:
         for (int level = numLevels - 1; level >= 0; level--)
         {
             int numCellsLevel = levelRange[level + 1] - levelRange[level];
-            int numBlocks     = (numCellsLevel - 1) / numThreads + 1;
             if (numCellsLevel)
             {
-                upsweepMultipoles<<<numBlocks, numThreads>>>(levelRange[level], levelRange[level + 1],
-                                                             octree_.childOffsets, centers_, rawPtr(multipoles_));
+                upsweepMultipoles(levelRange[level], levelRange[level + 1], octree_.childOffsets, centers_,
+                                  rawPtr(multipoles_));
             }
         }
 
@@ -122,19 +117,12 @@ public:
         for (int level = numLevels - 1; level >= 0; level--)
         {
             int numCellsLevel = levelRange[level + 1] - levelRange[level];
-            int numBlocks     = (numCellsLevel - 1) / numThreads + 1;
             if (numCellsLevel)
             {
-                upsweepMultipoles<<<numBlocks, numThreads>>>(levelRange[level], levelRange[level + 1],
-                                                             octree_.childOffsets, centers_, rawPtr(multipoles_));
+                upsweepMultipoles(levelRange[level], levelRange[level + 1], octree_.childOffsets, centers_,
+                                  rawPtr(multipoles_));
             }
         }
-
-        if (IsSpherical<MType>{})
-        {
-            normalize<<<cstone::iceil(octree_.numNodes, numThreads), numThreads>>>(octree_.numNodes,
-                                                                                   rawPtr(multipoles_));
-        }
     }
 
     float compute(GroupView grp, const Tc* x, const Tc* y, const Tc* z, const Tm* m, const Th* h, Tc G, int numShells,
@@ -153,14 +141,14 @@ public:
                                                         G, numShells, Vec3<Tc>{box.lx(), box.ly(), box.lz()},
                                                         (Ta*)nullptr, ax, ay, az, (int*)rawPtr(traversalStack_));
         float totalPotential;
-        checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalPotentialGlob, sizeof(float)));
+        checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalPotentialGlob), sizeof(float)));
         return 0.5f * Tc(G) * totalPotential;
     }
 
     util::array<uint64_t, 5> readStats() const
     {
         typename BhStats::type stats[BhStats::numStats];
-        checkGpuErrors(cudaMemcpyFromSymbol(stats, bhStats, BhStats::numStats * sizeof(BhStats::type)));
+        checkGpuErrors(cudaMemcpyFromSymbol(stats, GPU_SYMBOL(bhStats), BhStats::numStats * sizeof(BhStats::type)));
 
         auto sumP2P   = stats[BhStats::sumP2P];
         auto maxP2P   = stats[BhStats::maxP2P];
@@ -247,14 +235,6 @@ const MType* MultipoleHolder<Tc, Th, Tm, Ta, Tf, KeyType, MType>::deviceMultipol
     return impl_->deviceMultipoles();
 }
 
-#define MHOLDER_SPH(Tc, Th, Tm, Ta, Tf, KeyType, MVal)                                                                 \
-    template class MultipoleHolder<Tc, Th, Tm, Ta, Tf, KeyType, SphericalMultipole<MVal, 4>>
-
-MHOLDER_SPH(double, double, double, double, double, uint64_t, double);
-MHOLDER_SPH(double, double, float, double, double, uint64_t, float);
-MHOLDER_SPH(double, float, float, float, double, uint64_t, float);
-MHOLDER_SPH(float, float, float, float, float, uint64_t, float);
-
 #define MHOLDER_CART(Tc, Th, Tm, Ta, Tf, KeyType, MVal)                                                                \
     template class MultipoleHolder<Tc, Th, Tm, Ta, Tf, KeyType, CartesianQuadrupole<MVal>>
 
diff --git a/ryoanji/src/ryoanji/interface/treebuilder.cu b/ryoanji/src/ryoanji/interface/treebuilder.cu
deleted file mode 100644
index dec98f9f5..000000000
--- a/ryoanji/src/ryoanji/interface/treebuilder.cu
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*! @file
- * @brief  Build a tree for Ryoanji with the cornerstone framework
- *
- * @author Sebastian Keller <sebastian.f.keller@gmail.com>
- */
-
-#include <vector>
-
-#include <thrust/execution_policy.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-#include <thrust/gather.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-
-#include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/cuda/thrust_util.cuh"
-#include "cstone/sfc/sfc_gpu.h"
-#include "cstone/tree/octree_gpu.h"
-#include "cstone/tree/update_gpu.cuh"
-
-#include "ryoanji/nbody/types.h"
-#include "ryoanji/interface/treebuilder.cuh"
-
-namespace ryoanji
-{
-
-template<class KeyType>
-class TreeBuilder<KeyType>::Impl
-{
-public:
-    Impl();
-
-    Impl(unsigned ncrit);
-
-    template<class T>
-    cstone::TreeNodeIndex update(T* x, T* y, T* z, size_t numBodies, const cstone::Box<T>& box);
-
-    int extract(int2* h_levelRange);
-
-    const LocalIndex* layout() const { return rawPtr(d_layout_); }
-
-    const TreeNodeIndex* childOffsets() const { return rawPtr(octreeGpuData_.childOffsets); }
-
-    const TreeNodeIndex* leafToInternal() const
-    {
-        return rawPtr(octreeGpuData_.leafToInternal) + octreeGpuData_.numInternalNodes;
-    }
-
-    const TreeNodeIndex* internalToLeaf() const { return rawPtr(octreeGpuData_.internalToLeaf); }
-
-    TreeNodeIndex numLeafNodes() const { return octreeGpuData_.numLeafNodes; }
-
-private:
-    unsigned bucketSize_;
-
-    thrust::device_vector<KeyType>  d_tree_;
-    thrust::device_vector<unsigned> d_counts_;
-
-    thrust::device_vector<KeyType>               tmpTree_;
-    thrust::device_vector<cstone::TreeNodeIndex> workArray_;
-
-    cstone::OctreeData<KeyType, cstone::GpuTag> octreeGpuData_;
-    thrust::device_vector<cstone::LocalIndex>   d_layout_;
-};
-
-template<class KeyType>
-TreeBuilder<KeyType>::Impl::Impl()
-    : bucketSize_(64)
-{
-}
-
-template<class KeyType>
-TreeBuilder<KeyType>::Impl::Impl(unsigned ncrit)
-    : bucketSize_(ncrit)
-{
-}
-
-template<class KeyType>
-template<class T>
-cstone::TreeNodeIndex TreeBuilder<KeyType>::Impl::update(T* x, T* y, T* z, size_t numBodies,
-                                                         const cstone::Box<T>& csBox)
-{
-    thrust::device_vector<KeyType> d_keys(numBodies);
-    thrust::device_vector<int>     d_ordering(numBodies);
-    thrust::device_vector<T>       tmp(numBodies);
-
-    cstone::computeSfcKeysGpu(x, y, z, cstone::sfcKindPointer(rawPtr(d_keys)), numBodies, csBox);
-
-    thrust::sequence(d_ordering.begin(), d_ordering.end(), 0);
-    thrust::sort_by_key(thrust::device, d_keys.begin(), d_keys.end(), d_ordering.begin());
-
-    thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), x, tmp.begin());
-    thrust::copy(tmp.begin(), tmp.end(), x);
-    thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), y, tmp.begin());
-    thrust::copy(tmp.begin(), tmp.end(), y);
-    thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), z, tmp.begin());
-    thrust::copy(tmp.begin(), tmp.end(), z);
-
-    if (d_tree_.size() == 0)
-    {
-        // initial guess on first call. use previous tree as guess on subsequent calls
-        d_tree_   = std::vector<KeyType>{0, cstone::nodeRange<KeyType>(0)};
-        d_counts_ = std::vector<unsigned>{unsigned(numBodies)};
-    }
-
-    while (!cstone::updateOctreeGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), bucketSize_, d_tree_, d_counts_,
-                                    tmpTree_, workArray_))
-        ;
-
-    octreeGpuData_.resize(cstone::nNodes(d_tree_));
-    cstone::buildOctreeGpu(rawPtr(d_tree_), octreeGpuData_.data());
-
-    return octreeGpuData_.numInternalNodes + octreeGpuData_.numLeafNodes;
-}
-
-template<class KeyType>
-int TreeBuilder<KeyType>::Impl::extract(int2* h_levelRange)
-{
-    d_layout_.resize(d_counts_.size() + 1);
-    thrust::copy(d_counts_.begin(), d_counts_.end(), d_layout_.begin());
-    thrust::exclusive_scan(d_layout_.data(), d_layout_.data() + d_layout_.size(), d_layout_.data());
-
-    std::vector<int> cs_levelRange = toHost(octreeGpuData_.levelRange);
-
-    int numLevels = 0;
-    for (int level = 0; level <= cstone::maxTreeLevel<KeyType>{}; ++level)
-    {
-        if (cs_levelRange[level] == cs_levelRange[level + 1])
-        {
-            numLevels = level - 1;
-            break;
-        }
-
-        h_levelRange[level].x = cs_levelRange[level];
-        h_levelRange[level].y = cs_levelRange[level + 1];
-    }
-
-    return numLevels;
-}
-
-template<class KeyType>
-TreeBuilder<KeyType>::TreeBuilder()
-    : impl_(new Impl())
-{
-}
-
-template<class KeyType>
-TreeBuilder<KeyType>::TreeBuilder(unsigned ncrit)
-    : impl_(new Impl(ncrit))
-{
-}
-
-template<class KeyType>
-TreeBuilder<KeyType>::~TreeBuilder() = default;
-
-template<class KeyType>
-template<class T>
-int TreeBuilder<KeyType>::update(T* x, T* y, T* z, size_t numBodies, const cstone::Box<T>& box)
-{
-    return impl_->update(x, y, z, numBodies, box);
-}
-
-template<class KeyType>
-int TreeBuilder<KeyType>::extract(int2* h_levelRange)
-{
-    return impl_->extract(h_levelRange);
-}
-
-template<class KeyType>
-const LocalIndex* TreeBuilder<KeyType>::layout() const
-{
-    return impl_->layout();
-}
-
-template<class KeyType>
-const TreeNodeIndex* TreeBuilder<KeyType>::childOffsets() const
-{
-    return impl_->childOffsets();
-}
-
-template<class KeyType>
-const TreeNodeIndex* TreeBuilder<KeyType>::leafToInternal() const
-{
-    return impl_->leafToInternal();
-}
-
-template<class KeyType>
-const TreeNodeIndex* TreeBuilder<KeyType>::internalToLeaf() const
-{
-    return impl_->internalToLeaf();
-}
-
-template<class KeyType>
-TreeNodeIndex TreeBuilder<KeyType>::numLeafNodes() const
-{
-    return impl_->numLeafNodes();
-}
-
-template<class KeyType>
-unsigned TreeBuilder<KeyType>::maxTreeLevel() const
-{
-    return cstone::maxTreeLevel<KeyType>{};
-}
-
-template class TreeBuilder<uint32_t>;
-template class TreeBuilder<uint64_t>;
-
-template int TreeBuilder<uint32_t>::update(float*, float*, float*, size_t, const cstone::Box<float>&);
-template int TreeBuilder<uint64_t>::update(float*, float*, float*, size_t, const cstone::Box<float>&);
-template int TreeBuilder<uint32_t>::update(double*, double*, double*, size_t, const cstone::Box<double>&);
-template int TreeBuilder<uint64_t>::update(double*, double*, double*, size_t, const cstone::Box<double>&);
-
-} // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/interface/treebuilder.cuh b/ryoanji/src/ryoanji/interface/treebuilder.cuh
index fce91f02c..9042a0a2c 100644
--- a/ryoanji/src/ryoanji/interface/treebuilder.cuh
+++ b/ryoanji/src/ryoanji/interface/treebuilder.cuh
@@ -31,7 +31,17 @@
 
 #pragma once
 
-#include <memory>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/gather.h>
+
+#include "cstone/cuda/cuda_utils.cuh"
+#include "cstone/cuda/thrust_util.cuh"
+#include "cstone/primitives/primitives_gpu.h"
+#include "cstone/sfc/sfc_gpu.h"
+#include "cstone/tree/octree_gpu.h"
+#include "cstone/tree/update_gpu.cuh"
 
 #include "ryoanji/nbody/types.h"
 
@@ -42,12 +52,8 @@ template<class KeyType>
 class TreeBuilder
 {
 public:
-    TreeBuilder();
-
-    ~TreeBuilder();
-
     //! @brief initialize with the desired maximum particles per leaf cell
-    TreeBuilder(unsigned ncrit);
+    TreeBuilder(unsigned ncrit) : bucketSize_(ncrit) {}
 
     /*! @brief construct an octree from body coordinates
      *
@@ -62,29 +68,74 @@ public:
      * Note: x,y,z arrays will be sorted in SFC order to match be consistent with the cell body offsets of the tree
      */
     template<class T>
-    int update(T* x, T* y, T* z, size_t numBodies, const cstone::Box<T>& box);
+    int update(T* x, T* y, T* z, size_t numBodies, const cstone::Box<T>& box)
+    {
+        thrust::device_vector<KeyType> d_keys(numBodies), d_keys_tmp(numBodies);
+        thrust::device_vector<int>     d_ordering(numBodies), d_values_tmp(numBodies);
+        thrust::device_vector<T>       tmp(numBodies);
 
-    /*! @brief extract the octree level range from the previous update call
-     *
-     * @param[out] h_levelRange  indices of the first node at each subdivison level
-     * @return     the maximum subdivision level in the output tree
-     */
-    int extract(int2* h_levelRange);
+        cstone::computeSfcKeysGpu(x, y, z, cstone::sfcKindPointer(rawPtr(d_keys)), numBodies, box);
+
+        cstone::sequenceGpu(rawPtr(d_ordering), d_ordering.size(), 0);
+        cstone::sortByKeyGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), rawPtr(d_ordering), rawPtr(d_keys_tmp),
+                             rawPtr(d_values_tmp));
+
+        thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), x, tmp.begin());
+        thrust::copy(tmp.begin(), tmp.end(), x);
+        thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), y, tmp.begin());
+        thrust::copy(tmp.begin(), tmp.end(), y);
+        thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), z, tmp.begin());
+        thrust::copy(tmp.begin(), tmp.end(), z);
+
+        if (d_tree_.size() == 0)
+        {
+            // initial guess on first call. use previous tree as guess on subsequent calls
+            d_tree_   = std::vector<KeyType>{0, cstone::nodeRange<KeyType>(0)};
+            d_counts_ = std::vector<unsigned>{unsigned(numBodies)};
+        }
+
+        while (!cstone::updateOctreeGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), bucketSize_, d_tree_, d_counts_,
+                                        tmpTree_, workArray_))
+            ;
 
-    const LocalIndex*    layout() const;
-    const TreeNodeIndex* childOffsets() const;
-    const TreeNodeIndex* leafToInternal() const;
-    const TreeNodeIndex* internalToLeaf() const;
+        octreeGpuData_.resize(cstone::nNodes(d_tree_));
+        cstone::buildOctreeGpu(rawPtr(d_tree_), octreeGpuData_.data());
 
-    TreeNodeIndex numLeafNodes() const;
-    unsigned      maxTreeLevel() const;
+        d_layout_.resize(d_counts_.size() + 1);
+        cstone::fillGpu(rawPtr(d_layout_), rawPtr(d_layout_) + 1, LocalIndex(0));
+        cstone::inclusiveScanGpu(rawPtr(d_counts_), rawPtr(d_counts_) + d_counts_.size(), rawPtr(d_layout_) + 1);
+
+        levelRange_host_ = toHost(octreeGpuData_.levelRange);
+        return octreeGpuData_.numInternalNodes + octreeGpuData_.numLeafNodes;
+    }
+
+    const LocalIndex*    layout() const { return rawPtr(d_layout_); }
+    const KeyType*       nodeKeys() const { return rawPtr(octreeGpuData_.prefixes); }
+    const TreeNodeIndex* childOffsets() const { return rawPtr(octreeGpuData_.childOffsets); }
+    const TreeNodeIndex* leafToInternal() const
+    {
+        return cstone::leafToInternal(octreeGpuData_).data();
+    }
+    const TreeNodeIndex* internalToLeaf() const { return rawPtr(octreeGpuData_.internalToLeaf); }
+    //! @brief return host-resident octree level cell ranges
+    const TreeNodeIndex* levelRange() const { return levelRange_host_.data(); }
+
+    TreeNodeIndex numLeafNodes() const { return octreeGpuData_.numLeafNodes; }
+    unsigned      maxTreeLevel() const { return cstone::maxTreeLevel<KeyType>{}; }
 
 private:
-    class Impl;
-    std::unique_ptr<Impl> impl_;
-};
+    unsigned bucketSize_;
 
-extern template class TreeBuilder<uint32_t>;
-extern template class TreeBuilder<uint64_t>;
+    thrust::device_vector<KeyType>  d_tree_;
+    thrust::device_vector<unsigned> d_counts_;
+
+    thrust::device_vector<KeyType>               tmpTree_;
+    thrust::device_vector<cstone::TreeNodeIndex> workArray_;
+
+    cstone::OctreeData<KeyType, cstone::GpuTag> octreeGpuData_;
+    thrust::device_vector<cstone::LocalIndex>   d_layout_;
+
+    std::vector<TreeNodeIndex> levelRange_host_;
+};
 
 } // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/CMakeLists.txt b/ryoanji/src/ryoanji/nbody/CMakeLists.txt
new file mode 100644
index 000000000..eedc0c5ec
--- /dev/null
+++ b/ryoanji/src/ryoanji/nbody/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(CMAKE_HIP_COMPILER)
+    set_source_files_properties(upsweep_gpu.cu PROPERTIES LANGUAGE HIP)
+endif()
+
+if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
+    add_library(ryoanji_kernels upsweep_gpu.cu)
+    target_include_directories(ryoanji_kernels PUBLIC ${PROJECT_SOURCE_DIR}/src ${CSTONE_DIR})
+endif()
diff --git a/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp b/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp
index ecd9c135a..671e955fa 100644
--- a/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp
+++ b/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp
@@ -50,11 +50,6 @@ struct IsCartesian : public stl::integral_constant<size_t, MType{}.size() == Car
 {
 };
 
-template<>
-struct ExpansionOrder<CartesianQuadrupole<float>{}.size()> : stl::integral_constant<size_t, 2>
-{
-};
-
 //! @brief CartesianQuadrupole index names
 struct Cqi
 {
@@ -256,20 +251,4 @@ HOST_DEVICE_FUN void M2M(int begin, int end, const Vec4<T>& Xout, const Vec4<T>*
     }
 }
 
-template<class MType, std::enable_if_t<IsCartesian<MType>{}, int> = 0>
-HOST_DEVICE_FUN MType normalize(const MType& multipole)
-{
-    return multipole;
-}
-
-template<class T>
-std::ostream& operator<<(std::ostream& os, const CartesianQuadrupole<T>& M)
-{
-    os << "m:  " << M[Cqi::mass] << " tr: " << M[Cqi::trace] << std::endl
-       << "xx: " << M[Cqi::qxx] << " xy: " << M[Cqi::qxy] << " xz: " << M[Cqi::qxz] << std::endl
-       << "yy: " << M[Cqi::qyy] << " yz: " << M[Cqi::qyz] << " zz: " << M[Cqi::qzz] << std::endl;
-
-    return os;
-}
-
 } // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/ewald.hpp b/ryoanji/src/ryoanji/nbody/ewald.hpp
index aa9183d3d..f61d633ef 100644
--- a/ryoanji/src/ryoanji/nbody/ewald.hpp
+++ b/ryoanji/src/ryoanji/nbody/ewald.hpp
@@ -213,14 +213,6 @@ EwaldParameters<Tc, Tmm> ewaldInitParameters(const CartesianQuadrupole<Tmm>& Mro
     return params;
 }
 
-template<class Tc, class Tmm>
-EwaldParameters<Tc, Tmm> ewaldInitParameters(const SphericalMultipole<Tmm, 4>& /*Mroot*/,
-                                             const Vec3<Tc>& /*Mroot_center*/, int /*numReplicaShells*/, double /*L*/,
-                                             double /*lCut*/, double /*hCut*/, double /*alpha_scale*/)
-{
-    throw std::runtime_error("Ewald periodic gravity correction not implemented for spherical multipoles\n");
-}
-
 //! @brief real space Ewald contribution to the potential and acceleration of a single particle @p r
 template<class T1, class T2>
 HOST_DEVICE_FUN Vec4<T1> computeEwaldRealSpace(Vec3<T1> r, const EwaldParameters<T1, T2>& params)
diff --git a/ryoanji/src/ryoanji/nbody/kernel.hpp b/ryoanji/src/ryoanji/nbody/kernel.hpp
index d992c888c..eefeef3e7 100644
--- a/ryoanji/src/ryoanji/nbody/kernel.hpp
+++ b/ryoanji/src/ryoanji/nbody/kernel.hpp
@@ -26,7 +26,7 @@
 #pragma once
 
 /*! @file
- * @brief  EXA-FMM multipole kernels
+ * @brief  Particle-2-particle (P2P) kernel
  *
  * @author Rio Yokota <rioyokota@gsic.titech.ac.jp>
  */
@@ -54,453 +54,6 @@ HOST_DEVICE_FUN HOST_DEVICE_INLINE double inverseSquareRoot(double x)
 #endif
 }
 
-template<int nx, int ny, int nz>
-struct Index
-{
-    static constexpr int      I = Index<nx, ny + 1, nz - 1>::I + 1;
-    static constexpr uint64_t F = Index<nx, ny, nz - 1>::F * nz;
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz - 1>::power(dX) * dX[2];
-    }
-};
-
-template<int nx, int ny>
-struct Index<nx, ny, 0>
-{
-    static constexpr int      I = Index<nx + 1, 0, ny - 1>::I + 1;
-    static constexpr uint64_t F = Index<nx, ny - 1, 0>::F * ny;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3<T>& dX)
-    {
-        return Index<nx, ny - 1, 0>::power(dX) * dX[1];
-    }
-};
-
-template<int nx>
-struct Index<nx, 0, 0>
-{
-    static constexpr int      I = Index<0, 0, nx - 1>::I + 1;
-    static constexpr uint64_t F = Index<nx - 1, 0, 0>::F * nx;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3<T>& dX)
-    {
-        return Index<nx - 1, 0, 0>::power(dX) * dX[0];
-    }
-};
-
-template<>
-struct Index<0, 0, 0>
-{
-    static constexpr int      I = 0;
-    static constexpr uint64_t F = 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3<T>&)
-    {
-        return T(1.0);
-    }
-};
-
-template<int n>
-struct DerivativeTerm
-{
-    static constexpr int c = 1 - 2 * n;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE void invR(T* invRN, const T& invR2)
-    {
-        DerivativeTerm<n - 1>::invR(invRN, invR2);
-        invRN[n] = c * invRN[n - 1] * invR2;
-    }
-};
-
-template<>
-struct DerivativeTerm<0>
-{
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE void invR(T*, const T&)
-    {
-    }
-};
-
-template<int depth, int nx, int ny, int nz, int flag>
-struct DerivativeSum
-{
-    static constexpr int cx = nx * (nx - 1) / 2;
-    static constexpr int cy = ny * (ny - 1) / 2;
-    static constexpr int cz = nz * (nz - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cx * DerivativeSum<depth + 1, nx - 2, ny, nz, (nx > 3) * 4 + 3>::loop(invRN, dX) +
-               cy * DerivativeSum<depth + 1, nx, ny - 2, nz, (ny > 3) * 2 + 5>::loop(invRN, dX) +
-               cz * DerivativeSum<depth + 1, nx, ny, nz - 2, (nz > 3) + 6>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 6>
-{
-    static constexpr int cx = nx * (nx - 1) / 2;
-    static constexpr int cy = ny * (ny - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cx * DerivativeSum<depth + 1, nx - 2, ny, nz, (nx > 3) * 4 + 2>::loop(invRN, dX) +
-               cy * DerivativeSum<depth + 1, nx, ny - 2, nz, (ny > 3) * 2 + 4>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 5>
-{
-    static constexpr int cx = nx * (nx - 1) / 2;
-    static constexpr int cz = nz * (nz - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cx * DerivativeSum<depth + 1, nx - 2, ny, nz, (nx > 3) * 4 + 1>::loop(invRN, dX) +
-               cz * DerivativeSum<depth + 1, nx, ny, nz - 2, (nz > 3) + 4>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 4>
-{
-    static constexpr int cx = nx * (nx - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cx * DerivativeSum<depth + 1, nx - 2, ny, nz, (nx > 3) * 4>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 3>
-{
-    static constexpr int cy = ny * (ny - 1) / 2;
-    static constexpr int cz = nz * (nz - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cy * DerivativeSum<depth + 1, nx, ny - 2, nz, (ny > 3) * 2 + 1>::loop(invRN, dX) +
-               cz * DerivativeSum<depth + 1, nx, ny, nz - 2, (nz > 3) + 2>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 2>
-{
-    static constexpr int cy = ny * (ny - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cy * DerivativeSum<depth + 1, nx, ny - 2, nz, (ny > 3) * 2>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 1>
-{
-    static constexpr int cz = nz * (nz - 1) / 2;
-    static constexpr int n  = nx + ny + nz;
-    static constexpr int d  = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d +
-               cz * DerivativeSum<depth + 1, nx, ny, nz - 2, (nz > 3)>::loop(invRN, dX);
-    }
-};
-
-template<int depth, int nx, int ny, int nz>
-struct DerivativeSum<depth, nx, ny, nz, 0>
-{
-    static constexpr int n = nx + ny + nz;
-    static constexpr int d = depth > 0 ? depth : 1;
-
-    template<class T>
-    static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3<T>& dX)
-    {
-        return Index<nx, ny, nz>::power(dX) * invRN[n + depth] / d;
-    }
-};
-
-template<int nx, int ny, int nz, int kx = nx, int ky = ny, int kz = nz>
-struct MultipoleSum
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3<T>& dX, const MType& M)
-    {
-        return MultipoleSum<nx, ny, nz, kx, ky, kz - 1>::kernel(dX, M) + Index<nx - kx, ny - ky, nz - kz>::power(dX) /
-                                                                             Index<nx - kx, ny - ky, nz - kz>::F *
-                                                                             M[Index<kx, ky, kz>::I];
-    }
-};
-
-template<int nx, int ny, int nz, int kx, int ky>
-struct MultipoleSum<nx, ny, nz, kx, ky, 0>
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3<T>& dX, const MType& M)
-    {
-        return MultipoleSum<nx, ny, nz, kx, ky - 1, nz>::kernel(dX, M) +
-               Index<nx - kx, ny - ky, nz>::power(dX) / Index<nx - kx, ny - ky, nz>::F * M[Index<kx, ky, 0>::I];
-    }
-};
-
-template<int nx, int ny, int nz, int kx>
-struct MultipoleSum<nx, ny, nz, kx, 0, 0>
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3<T>& dX, const MType& M)
-    {
-        return MultipoleSum<nx, ny, nz, kx - 1, ny, nz>::kernel(dX, M) +
-               Index<nx - kx, ny, nz>::power(dX) / Index<nx - kx, ny, nz>::F * M[Index<kx, 0, 0>::I];
-    }
-};
-
-template<int nx, int ny, int nz>
-struct MultipoleSum<nx, ny, nz, 0, 0, 0>
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3<T>& dX, const MType& M)
-    {
-        return Index<nx, ny, nz>::power(dX) / Index<nx, ny, nz>::F * M[Index<0, 0, 0>::I];
-    }
-};
-
-template<int nx, int ny, int nz>
-struct Kernels
-{
-    static constexpr int n    = nx + ny + nz;
-    static constexpr int x    = nx > 0;
-    static constexpr int y    = ny > 0;
-    static constexpr int z    = nz > 0;
-    static constexpr int flag = (nx > 1) * 4 + (ny > 1) * 2 + (nz > 1);
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3<T>& dX)
-    {
-        Kernels<nx, ny + 1, nz - 1>::P2M(M, dX);
-        M[Index<nx, ny, nz>::I] = Index<nx, ny, nz>::power(dX) / Index<nx, ny, nz>::F * M[0];
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3<T>& dX, const MType& MJ)
-    {
-        Kernels<nx, ny + 1, nz - 1>::M2M(MI, dX, MJ);
-        MI[Index<nx, ny, nz>::I] += MultipoleSum<nx, ny, nz>::kernel(dX, MJ);
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4<T>& TRG, T* invRN, const Vec3<T>& dX, const MType& M)
-    {
-        Kernels<nx, ny + 1, nz - 1>::M2P(TRG, invRN, dX, M);
-        T C = DerivativeSum<0, nx, ny, nz, flag>::loop(invRN, dX);
-        TRG[0] -= M[Index<nx, ny, nz>::I] * C;
-        TRG[1] += M[Index<(nx - 1) * x, ny, nz>::I] * C * x;
-        TRG[2] += M[Index<nx, (ny - 1) * y, nz>::I] * C * y;
-        TRG[3] += M[Index<nx, ny, (nz - 1) * z>::I] * C * z;
-    }
-};
-
-template<int nx, int ny>
-struct Kernels<nx, ny, 0>
-{
-    static constexpr int n    = nx + ny;
-    static constexpr int x    = nx > 0;
-    static constexpr int y    = ny > 0;
-    static constexpr int flag = (nx > 1) * 4 + (ny > 1) * 2;
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3<T>& dX)
-    {
-        Kernels<nx + 1, 0, ny - 1>::P2M(M, dX);
-        M[Index<nx, ny, 0>::I] = Index<nx, ny, 0>::power(dX) / Index<nx, ny, 0>::F * M[0];
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3<T>& dX, const MType& MJ)
-    {
-        Kernels<nx + 1, 0, ny - 1>::M2M(MI, dX, MJ);
-        MI[Index<nx, ny, 0>::I] += MultipoleSum<nx, ny, 0>::kernel(dX, MJ);
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4<T>& TRG, T* invRN, const Vec3<T>& dX, const MType& M)
-    {
-        Kernels<nx + 1, 0, ny - 1>::M2P(TRG, invRN, dX, M);
-        T C = DerivativeSum<0, nx, ny, 0, flag>::loop(invRN, dX);
-        TRG[0] -= M[Index<nx, ny, 0>::I] * C;
-        TRG[1] += M[Index<(nx - 1) * x, ny, 0>::I] * C * x;
-        TRG[2] += M[Index<nx, (ny - 1) * y, 0>::I] * C * y;
-    }
-};
-
-template<int nx>
-struct Kernels<nx, 0, 0>
-{
-    static constexpr int n    = nx;
-    static constexpr int flag = (nx > 1) * 4;
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3<T>& dX)
-    {
-        Kernels<0, 0, nx - 1>::P2M(M, dX);
-        M[Index<nx, 0, 0>::I] = Index<nx, 0, 0>::power(dX) / Index<nx, 0, 0>::F * M[0];
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3<T>& dX, const MType& MJ)
-    {
-        Kernels<0, 0, nx - 1>::M2M(MI, dX, MJ);
-        MI[Index<nx, 0, 0>::I] += MultipoleSum<nx, 0, 0>::kernel(dX, MJ);
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4<T>& TRG, T* invRN, const Vec3<T> dX, const MType& M)
-    {
-        Kernels<0, 0, nx - 1>::M2P(TRG, invRN, dX, M);
-        const float C = DerivativeSum<0, nx, 0, 0, flag>::loop(invRN, dX);
-        TRG[0] -= M[Index<nx, 0, 0>::I] * C;
-        TRG[1] += M[Index<nx - 1, 0, 0>::I] * C;
-    }
-};
-
-template<>
-struct Kernels<0, 0, 2>
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3<T>& dX)
-    {
-        Kernels<0, 1, 1>::P2M(M, dX);
-        M[Index<0, 0, 2>::I] = Index<0, 0, 2>::power(dX) / Index<0, 0, 2>::F * M[0];
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3<T>& dX, const MType& MJ)
-    {
-        Kernels<0, 1, 1>::M2M(MI, dX, MJ);
-        MI[Index<0, 0, 2>::I] += MultipoleSum<0, 0, 2>::kernel(dX, MJ);
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4<T>& TRG, T* invRN, const Vec3<T>& dX, const MType& M)
-    {
-        TRG[0] -= invRN[0] + invRN[1] * (M[4] + M[7] + M[9]) +
-                  invRN[2] * (M[4] * dX[0] * dX[0] + M[5] * dX[0] * dX[1] + M[6] * dX[0] * dX[2] +
-                              M[7] * dX[1] * dX[1] + M[8] * dX[1] * dX[2] + M[9] * dX[2] * dX[2]);
-        TRG[1] += dX[0] * invRN[1];
-        TRG[2] += dX[1] * invRN[1];
-        TRG[3] += dX[2] * invRN[1];
-    }
-};
-
-template<>
-struct Kernels<0, 0, 0>
-{
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& /*M*/, const Vec3<T>& /*dX*/)
-    {
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3<T>& dX, const MType& MJ)
-    {
-        MI[Index<0, 0, 0>::I] += MultipoleSum<0, 0, 0>::kernel(dX, MJ);
-    }
-
-    template<class T, class MType>
-    static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4<T>& TRG, T* invRN, const Vec3<T>&, const MType&)
-    {
-        TRG[0] -= invRN[0];
-    }
-};
-
-/*! @brief calculate multipole from particles
- *
- * @param[in]  begin    first particle index of bodyPos
- * @param[in]  end      last particles index of bodyPos
- * @param[in]  Xout     expansion (com) center of the output multipole
- * @param[in]  bodyPos  body position and mass
- * @param[out] Mout     output multipole to add contributions to
- *
- */
-template<class T, class Tm, class MType, std::enable_if_t<IsSpherical<MType>{}, int> = 0>
-HOST_DEVICE_FUN DEVICE_INLINE void P2M(const T* x, const T* y, const T* z, const Tm* m, int begin, int end,
-                                       const Vec4<T>& Xout, MType& Mout)
-{
-    constexpr int P = ExpansionOrder<MType{}.size()>{};
-
-    Mout = 0;
-    for (int i = begin; i < end; i++)
-    {
-        Vec4<T> body = {x[i], y[i], z[i], T(m[i])};
-        Vec3<T> dX   = makeVec3(Xout - body);
-        MType   M;
-        M[0] = body[3];
-        Kernels<0, 0, P - 1>::P2M(M, dX);
-        Mout += M;
-    }
-}
-
-/*! @brief Combine multipoles into a single multipole
- *
- * @tparam      T        float or double
- * @tparam      MType    Spherical multipole, quadrupole or octopole
- * @param[in]   begin    first index into @p sourceCenter and @p Multipole to aggregate
- * @param[in]   end      last index
- * @param[in]   Xout     the expansion (com) center of the output multipole
- * @param[in]   Xsrc     input multipole expansion (com) centers
- * @param[in]   Msrc     input multipoles
- * @param[out]  Mout     the aggregated output multipole
- */
-template<class T, class MType, std::enable_if_t<IsSpherical<MType>{}, int> = 0>
-HOST_DEVICE_FUN DEVICE_INLINE void M2M(int begin, int end, const Vec4<T>& Xout, const Vec4<T>* Xsrc, const MType* Msrc,
-                                       MType& Mout)
-{
-    constexpr int P = ExpansionOrder<MType{}.size()>{};
-
-    Mout = 0;
-    for (int i = begin; i < end; i++)
-    {
-        const MType& Mi = Msrc[i];
-        Vec4<T>      Xi = Xsrc[i];
-        Vec3<T>      dX = makeVec3(Xout - Xi);
-        Kernels<0, 0, P - 1>::M2M(Mout, dX, Mi);
-    }
-}
-
 /*! @brief interaction between two particles
  *
  * @param acc     acceleration to add to
@@ -534,101 +87,4 @@ HOST_DEVICE_FUN DEVICE_INLINE Vec4<Ta> P2P(Vec4<Ta> acc, const Vec3<Tc>& pos_i,
     return acc;
 }
 
-/*! @brief apply a spherial multipole to a particle
- *
- * @param acc     acceleration to add to
- * @param pos_i   target particle coordinate
- * @param pos_j   multipole source coordinate
- * @param M       the multipole
- * @param EPS2    plummer softening parameter
- * @return        input acceleration plus contribution from this call
- */
-template<class Ta, class Tc, class MType, std::enable_if_t<IsSpherical<MType>{}, int> = 0>
-HOST_DEVICE_FUN DEVICE_INLINE Vec4<Ta> M2P(Vec4<Ta> acc, const Vec3<Tc>& pos_i, const Vec3<Tc>& pos_j, MType& M)
-{
-    constexpr int P = ExpansionOrder<MType{}.size()>{};
-
-    Vec3<Tc> dX    = pos_i - pos_j;
-    Tc       R2    = norm2(dX);
-    Tc       invR  = inverseSquareRoot(R2);
-    Tc       invR2 = invR * invR;
-
-    Tc invRN[P];
-    invRN[0] = M[0] * invR;
-    DerivativeTerm<P - 1>::invR(invRN, invR2);
-
-    auto M0 = M[0];
-    M[0]    = 1;
-    Kernels<0, 0, P - 1>::M2P(acc, invRN, dX, M);
-    M[0] = M0;
-
-    return acc;
-}
-
-//! @brief computes the center of mass for the bodies in the specified range
-template<class T>
-HOST_DEVICE_FUN DEVICE_INLINE Vec4<T> setCenter(const int begin, const int end, const T* x, const T* y, const T* z,
-                                                const T* m)
-{
-    assert(begin <= end);
-
-    Vec4<T> center{0, 0, 0, 0};
-    for (int i = begin; i < end; i++)
-    {
-        T weight = m[i];
-
-        center[0] += weight * x[i];
-        center[1] += weight * y[i];
-        center[2] += weight * z[i];
-        center[3] += weight;
-    }
-
-    T invM = (center[3] != 0.0f) ? 1.0f / center[3] : 0.0f;
-    center[0] *= invM;
-    center[1] *= invM;
-    center[2] *= invM;
-
-    return center;
-}
-
-//! @brief computes the center of mass for the bodies in the specified range
-template<class T>
-HOST_DEVICE_FUN DEVICE_INLINE Vec4<T> setCenter(const int begin, const int end, const Vec4<T>* posGlob)
-{
-    assert(begin <= end);
-
-    Vec4<T> center{0, 0, 0, 0};
-    for (int i = begin; i < end; i++)
-    {
-        Vec4<T> pos    = posGlob[i];
-        T       weight = pos[3];
-
-        center[0] += weight * pos[0];
-        center[1] += weight * pos[1];
-        center[2] += weight * pos[2];
-        center[3] += weight;
-    }
-
-    T invM = (center[3] != 0.0f) ? 1.0f / center[3] : 0.0f;
-    center[0] *= invM;
-    center[1] *= invM;
-    center[2] *= invM;
-
-    return center;
-}
-
-template<class MType, std::enable_if_t<IsSpherical<MType>{}, int> = 0>
-HOST_DEVICE_FUN MType normalize(const MType& multipole)
-{
-    using T = typename MType::value_type;
-    MType M = multipole;
-
-    T mass = M[0];
-    T invM = (mass != T(0.0)) ? T(1.0) / mass : T(0.0);
-    M *= invM;
-    M[0] = mass;
-
-    return M;
-}
-
 } // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/traversal.cuh b/ryoanji/src/ryoanji/nbody/traversal.cuh
index cbdf9c1ce..a22c3cf66 100644
--- a/ryoanji/src/ryoanji/nbody/traversal.cuh
+++ b/ryoanji/src/ryoanji/nbody/traversal.cuh
@@ -35,7 +35,7 @@
 #include <algorithm>
 #include "cstone/cuda/gpu_config.cuh"
 #include "cstone/primitives/warpscan.cuh"
-#include "cstone/traversal/groups.cuh"
+#include "cstone/traversal/groups.hpp"
 #include "cartesian_qpole.hpp"
 #include "kernel.hpp"
 
@@ -563,79 +563,4 @@ __global__ __launch_bounds__(TravConfig::numThreads) void traverse(
     }
 }
 
-/*! @brief Compute approximate body accelerations with Barnes-Hut
- *
- * @param[in]    firstBody      index of first body in @p bodyPos to compute acceleration for
- * @param[in]    lastBody       index (exclusive) of last body in @p bodyPos to compute acceleration for
- * @param[in]    x,y,z,m,h      bodies, in SFC order and as referenced by sourceCells
- * @param[in]    G              gravitational constant
- * @param[in]    numShells      number of periodic shells in each dimension to include
- * @param[in]    box            coordinate bounding box
- * @param[inout] p              body potential to add to, on device
- * @param[inout] ax,ay,az       body acceleration to add to
- * @param[in]    childOffsets   location (index in [0:numTreeNodes]) of first child of each cell, 0 indicates a leaf
- * @param[in]    internalToLeaf for each cell in [0:numTreeNodes], stores the leaf cell (cstone) index in [0:numLeaves]
- *                              if the cell is not a leaf, the value is negative
- * @param[in]    layout         for each leaf cell in [0:numLeaves], stores the index of the first body in the cell
- * @param[in]    sourceCenter   x,y,z center and square MAC radius of each cell in [0:numTreeNodes]
- * @param[in]    Multipole      cell multipoles, on device
- * @return                      P2P and M2P interaction statistics
- */
-template<class Tc, class Th, class Tm, class Ta, class Tf, class MType>
-auto computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z, const Tm* m,
-                         const Th* h, Tc G, int numShells, const cstone::Box<Tc>& box, Ta* p, Ta* ax, Tc* ay, Tc* az,
-                         const TreeNodeIndex* childOffsets, const TreeNodeIndex* internalToLeaf,
-                         const LocalIndex* layout, const Vec4<Tf>* sourceCenter, const MType* Multipole)
-{
-    constexpr int numWarpsPerBlock = TravConfig::numThreads / GpuConfig::warpSize;
-
-    cstone::GroupData<cstone::GpuTag> groups;
-    cstone::computeFixedGroups(firstBody, lastBody, TravConfig::targetSize, groups);
-
-    LocalIndex numBodies  = lastBody - firstBody;
-    int        numTargets = (numBodies - 1) / TravConfig::targetSize + 1;
-    int        numBlocks  = (numTargets - 1) / numWarpsPerBlock + 1;
-    numBlocks             = std::min(numBlocks, TravConfig::maxNumActiveBlocks);
-
-    printf("launching %d blocks\n", numBlocks);
-
-    const int                  poolSize = TravConfig::memPerWarp * numWarpsPerBlock * numBlocks;
-    thrust::device_vector<int> globalPool(poolSize);
-
-    resetTraversalCounters<<<1, 1>>>();
-    auto t0 = std::chrono::high_resolution_clock::now();
-    traverse<<<numBlocks, TravConfig::numThreads>>>(
-        groups.view(), 1, x, y, z, m, h, childOffsets, internalToLeaf, layout, sourceCenter, Multipole, G, numShells,
-        {box.lx(), box.ly(), box.lz()}, p, ax, ay, az, thrust::raw_pointer_cast(globalPool.data()));
-    kernelSuccess("traverse");
-
-    auto   t1 = std::chrono::high_resolution_clock::now();
-    double dt = std::chrono::duration<double>(t1 - t0).count();
-
-    typename BhStats::type stats[BhStats::numStats];
-    checkGpuErrors(cudaMemcpyFromSymbol(stats, bhStats, BhStats::numStats * sizeof(BhStats::type)));
-
-    auto sumP2P = stats[BhStats::sumP2P];
-    auto maxP2P = stats[BhStats::maxP2P];
-    auto sumM2P = stats[BhStats::sumM2P];
-    auto maxM2P = stats[BhStats::maxM2P];
-
-    float totalPotential;
-    checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalPotentialGlob, sizeof(float)));
-
-    util::array<Tc, 5> interactions;
-    interactions[0] = Tc(sumP2P) / Tc(numBodies);
-    interactions[1] = Tc(maxP2P);
-    interactions[2] = Tc(sumM2P) / Tc(numBodies);
-    interactions[3] = Tc(maxM2P);
-    interactions[4] = totalPotential;
-
-    Tc flops = (interactions[0] * 20.0 + interactions[2] * 2.0 * powf(ExpansionOrder<MType{}.size()>{}, 3)) *
-               Tc(numBodies) / dt / 1e12;
-
-    fprintf(stdout, "Traverse             : %.7f s (%.7f TFlops)\n", dt, flops);
-
-    return interactions;
-}
-
 } // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/types.h b/ryoanji/src/ryoanji/nbody/types.h
index aca3acfa8..89e152b45 100644
--- a/ryoanji/src/ryoanji/nbody/types.h
+++ b/ryoanji/src/ryoanji/nbody/types.h
@@ -24,7 +24,7 @@
  */
 
 /*! @file
- * @brief  Ryoanji multipole and related types
+ * @brief  helper types
  *
  * @author Rio Yokota <rioyokota@gsic.titech.ac.jp>
  */
@@ -47,43 +47,4 @@ using Vec4 = cstone::Vec4<T>;
 using TreeNodeIndex = cstone::TreeNodeIndex;
 using LocalIndex    = cstone::LocalIndex;
 
-template<size_t P>
-struct TermSize : public stl::integral_constant<size_t, P*(P + 1) * (P + 2) / 6>
-{
-};
-
-template<class T, size_t P>
-using SphericalMultipole = util::array<T, TermSize<P>{}>;
-
-template<class MType>
-struct IsSpherical
-    : public stl::integral_constant<size_t, MType{}.size() == TermSize<2>{} || MType{}.size() == TermSize<4>{}>
-{
-};
-
-template<int ArraySize, class = void>
-struct ExpansionOrder
-{
-};
-
-template<>
-struct ExpansionOrder<TermSize<1>{}> : stl::integral_constant<size_t, 1>
-{
-};
-
-template<>
-struct ExpansionOrder<TermSize<2>{}> : stl::integral_constant<size_t, 2>
-{
-};
-
-template<>
-struct ExpansionOrder<TermSize<3>{}> : stl::integral_constant<size_t, 3>
-{
-};
-
-template<>
-struct ExpansionOrder<TermSize<4>{}> : stl::integral_constant<size_t, 4>
-{
-};
-
 } // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu b/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu
new file mode 100644
index 000000000..b4c970a90
--- /dev/null
+++ b/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu
@@ -0,0 +1,115 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*! @file
+ * @brief  Upsweep for multipole and source center computation
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ */
+
+#include "cstone/cuda/cuda_runtime.hpp"
+#include "cstone/primitives/math.hpp"
+
+#include "ryoanji/nbody/cartesian_qpole.hpp"
+#include "ryoanji/nbody/kernel.hpp"
+
+#include "upsweep_gpu.h"
+
+namespace ryoanji
+{
+
+struct UpsweepConfig
+{
+    static constexpr int numThreads = 256;
+};
+
+template<class Tc, class Tm, class Tf, class MType>
+__global__ void computeLeafMultipolesKernel(const Tc* x, const Tc* y, const Tc* z, const Tm* m,
+                                            const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves,
+                                            const LocalIndex* layout, const Vec4<Tf>* centers, MType* multipoles)
+{
+    unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (leafIdx < numLeaves)
+    {
+        TreeNodeIndex i = leafToInternal[leafIdx];
+        P2M(x, y, z, m, layout[leafIdx], layout[leafIdx + 1], centers[i], multipoles[i]);
+    }
+}
+
+template<class Tc, class Tm, class Tf, class MType>
+void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m, const TreeNodeIndex* leafToInternal,
+                           TreeNodeIndex numLeaves, const LocalIndex* layout, const Vec4<Tf>* centers,
+                           MType* multipoles)
+{
+    constexpr int numThreads = UpsweepConfig::numThreads;
+    if (numLeaves)
+    {
+        computeLeafMultipolesKernel<<<cstone::iceil(numLeaves, numThreads), numThreads>>>(
+            x, y, z, m, leafToInternal, numLeaves, layout, centers, multipoles);
+    }
+}
+
+#define COMPUTE_LEAF_MULTIPOLES(Tc, Tm, Tf, MType)                                                                     \
+    template void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m,                            \
+                                        const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves,                  \
+                                        const LocalIndex* layout, const Vec4<Tf>* centers, MType* multipoles)
+
+COMPUTE_LEAF_MULTIPOLES(double, double, double, CartesianQuadrupole<double>);
+COMPUTE_LEAF_MULTIPOLES(double, float, double, CartesianQuadrupole<float>);
+COMPUTE_LEAF_MULTIPOLES(float, float, float, CartesianQuadrupole<float>);
+
+template<class T, class MType>
+__global__ void upsweepMultipolesKernel(TreeNodeIndex firstCell, TreeNodeIndex lastCell,
+                                        const TreeNodeIndex* childOffsets, const Vec4<T>* centers, MType* multipoles)
+{
+    const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell;
+    if (cellIdx >= lastCell) return;
+
+    TreeNodeIndex firstChild = childOffsets[cellIdx];
+
+    // firstChild is zero if the cell is a leaf
+    if (firstChild) { M2M(firstChild, firstChild + 8, centers[cellIdx], centers, multipoles, multipoles[cellIdx]); }
+}
+
+template<class T, class MType>
+void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets,
+                       const Vec4<T>* centers, MType* multipoles)
+{
+    constexpr int numThreads = UpsweepConfig::numThreads;
+    if (lastCell > firstCell)
+    {
+        upsweepMultipolesKernel<<<cstone::iceil(lastCell - firstCell, numThreads), numThreads>>>(
+            firstCell, lastCell, childOffsets, centers, multipoles);
+    }
+}
+
+#define UPSWEEP_MULTIPOLES(T, MType)                                                                                   \
+    template void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell,                                   \
+                                    const TreeNodeIndex* childOffsets, const Vec4<T>* centers, MType* multipoles)
+
+UPSWEEP_MULTIPOLES(double, CartesianQuadrupole<double>);
+UPSWEEP_MULTIPOLES(double, CartesianQuadrupole<float>);
+UPSWEEP_MULTIPOLES(float, CartesianQuadrupole<float>);
+
+} // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/upsweep_gpu.h b/ryoanji/src/ryoanji/nbody/upsweep_gpu.h
new file mode 100644
index 000000000..6f379fb6d
--- /dev/null
+++ b/ryoanji/src/ryoanji/nbody/upsweep_gpu.h
@@ -0,0 +1,58 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*! @file
+ * @brief  Upsweep for multipole and source center computation
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ */
+
+#pragma once
+
+#include "ryoanji/nbody/types.h"
+#include "kernel.hpp"
+
+namespace ryoanji
+{
+
+template<class Tc, class Tm, class Tf, class MType>
+extern void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m,
+                                  const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves,
+                                  const LocalIndex* layout, const Vec4<Tf>* centers, MType* multipoles);
+
+/*! @brief perform multipole upward sweep for one tree level
+ *
+ * launch config: one thread per cell of the current level
+ *
+ * @param[in]  firstCell        first cell to process
+ * @param[in]  lastCell         last cell to process
+ * @param[in]  childOffsets     cell index of first child of each node
+ * @param[in]  centers          source expansion (mass) centers
+ * @param[out] multipoles       output multipole of each cell
+ */
+template<class T, class MType>
+extern void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets,
+                              const Vec4<T>* centers, MType* multipoles);
+
+} // namespace ryoanji
diff --git a/ryoanji/src/ryoanji/nbody/upwardpass.cuh b/ryoanji/src/ryoanji/nbody/upwardpass.cuh
deleted file mode 100644
index 3dd145ded..000000000
--- a/ryoanji/src/ryoanji/nbody/upwardpass.cuh
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*! @file
- * @brief  Upsweep for multipole and source center computation
- *
- * @author Rio Yokota <rioyokota@gsic.titech.ac.jp>
- * @author Sebastian Keller <sebastian.f.keller@gmail.com>
- */
-
-#pragma once
-
-#include <chrono>
-
-#include "cstone/cuda/gpu_config.cuh"
-
-#include "kernel.hpp"
-
-namespace ryoanji
-{
-
-struct UpsweepConfig
-{
-    static constexpr int numThreads = 256;
-};
-
-template<class Tc, class Tm, class Tf, class MType>
-__global__ void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m,
-                                      const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves,
-                                      const LocalIndex* layout, const Vec4<Tf>* centers, MType* multipoles)
-{
-    unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (leafIdx < numLeaves)
-    {
-        TreeNodeIndex i = leafToInternal[leafIdx];
-        P2M(x, y, z, m, layout[leafIdx], layout[leafIdx + 1], centers[i], multipoles[i]);
-    }
-}
-
-/*! @brief perform multipole upward sweep for one tree level
- *
- * launch config: one thread per cell of the current level
- *
- * @param[in]  firstCell        first cell to process
- * @param[in]  lastCell         last cell to process
- * @param[in]  childOffsets     cell index of first child of each node
- * @param[in]  centers          source expansion (mass) centers
- * @param[out] multipoles       output multipole of each cell
- */
-template<class T, class MType>
-__global__ void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets,
-                                  const Vec4<T>* centers, MType* multipoles)
-{
-    const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell;
-    if (cellIdx >= lastCell) return;
-
-    TreeNodeIndex firstChild = childOffsets[cellIdx];
-
-    // firstChild is zero if the cell is a leaf
-    if (firstChild) { M2M(firstChild, firstChild + 8, centers[cellIdx], centers, multipoles, multipoles[cellIdx]); }
-}
-
-template<class Tc, class Tm, class Th, class Tf>
-__global__ void computeLeafCenters(const Tc* x, const Tc* y, const Tc* z, const Tm* m, const Th* h,
-                                   const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves,
-                                   const LocalIndex* layout, Vec4<Tf>* centers, Vec4<Tc>* cellXmin, Vec4<Tc>* cellXmax)
-{
-    unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const Tc huge = Tc(1e10);
-    Vec4<Tc> Xmin{+huge, +huge, +huge, +huge};
-    Vec4<Tc> Xmax{-huge, -huge, -huge, -huge};
-    Vec4<Tc> center;
-
-    if (leafIdx < numLeaves)
-    {
-        TreeNodeIndex begin = layout[leafIdx];
-        TreeNodeIndex end   = layout[leafIdx + 1];
-        center              = setCenter(begin, end, x, y, z, m);
-        for (int i = begin; i < end; i++)
-        {
-            Vec4<Tc> pos = {x[i], y[i], z[i], h[i]};
-            Xmin         = min(Xmin, pos);
-            Xmax         = max(Xmax, pos);
-        }
-        TreeNodeIndex cellIdx = leafToInternal[leafIdx];
-        centers[cellIdx]      = center;
-        cellXmin[cellIdx]     = Xmin;
-        cellXmax[cellIdx]     = Xmax;
-    }
-}
-
-/*! @brief perform source expansion center upward sweep for one tree level
- *
- * launch config: one thread per cell of the current level
- *
- * @param[in]  firstCell        first cell to process
- * @param[in]  lastCell         last cell to process
- * @param[in]  childOffsets     cell index of first child of each node
- * @param[out] centers          source expansion (mass) centers
- * @param[out] cellXmin         minimum coordinate of any body in the cell
- * @param[out] cellXmax         maximum coordinate of any body in the cell
- */
-template<class T>
-__global__ void upsweepCenters(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets,
-                               Vec4<T>* centers, Vec4<T>* cellXmin, Vec4<T>* cellXmax)
-{
-    const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell;
-    if (cellIdx >= lastCell) return;
-
-    const T huge = T(1e10);
-    Vec4<T> Xmin{+huge, +huge, +huge, +huge};
-    Vec4<T> Xmax{-huge, -huge, -huge, -huge};
-    Vec4<T> center;
-
-    TreeNodeIndex firstChild = childOffsets[cellIdx];
-
-    // firstChild is zero if the cell is a leaf
-    if (firstChild)
-    {
-        TreeNodeIndex begin = firstChild;
-        TreeNodeIndex end   = firstChild + 8;
-        center              = setCenter(begin, end, centers);
-        for (int i = begin; i < end; i++)
-        {
-            Xmin = min(Xmin, cellXmin[i]);
-            Xmax = max(Xmax, cellXmax[i]);
-        }
-        centers[cellIdx]  = center;
-        cellXmin[cellIdx] = Xmin;
-        cellXmax[cellIdx] = Xmax;
-    }
-}
-
-//! @brief calculate the squared MAC radius of each cell, store as the 4-th member sourceCenter
-template<class T>
-__global__ void setMAC(int numCells, T invTheta, Vec4<T>* sourceCenter, const Vec4<T>* cellXmin,
-                       const Vec4<T>* cellXmax)
-{
-    int cellIdx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (cellIdx >= numCells) { return; }
-
-    Vec3<T> Xmin = makeVec3(cellXmin[cellIdx]);
-    Vec3<T> Xmax = makeVec3(cellXmax[cellIdx]);
-    T       Hmax = cellXmax[cellIdx][3];
-    Vec3<T> Xi   = makeVec3(sourceCenter[cellIdx]);
-    Vec3<T> X    = (Xmax + Xmin) * T(0.5);
-    Vec3<T> R    = (Xmax - Xmin) * T(0.5);
-    Vec3<T> dX   = X - Xi;
-    T       s    = sqrt(norm2(dX));
-    T       l    = T(2) * max(R);
-    T       MAC  = max(l * invTheta + s, 2 * Hmax + s + l * 0.5);
-    T       MAC2 = MAC * MAC;
-
-    sourceCenter[cellIdx][3] = MAC2;
-}
-
-template<class MType>
-__global__ void normalize(int numCells, MType* multipoles)
-{
-    using T     = typename MType::value_type;
-    int cellIdx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (cellIdx >= numCells) { return; }
-
-    multipoles[cellIdx] = normalize(multipoles[cellIdx]);
-}
-
-template<class T, class MType>
-void upsweep(int numSources, int numLeaves, int numLevels, T theta, const int2* levelRange, const T* x, const T* y,
-             const T* z, const T* m, const T* h, const LocalIndex* layout, const TreeNodeIndex* childOffsets,
-             const TreeNodeIndex* leafToInternal, Vec4<T>* centers, MType* Multipole)
-{
-    constexpr int numThreads = UpsweepConfig::numThreads;
-
-    thrust::device_vector<Vec4<T>> d_cellXminmax(2 * numSources);
-    Vec4<T>*                       cellXmin = rawPtr(d_cellXminmax);
-    Vec4<T>*                       cellXmax = cellXmin + numSources;
-
-    auto t0 = std::chrono::high_resolution_clock::now();
-
-    computeLeafCenters<<<(numLeaves - 1) / numThreads + 1, numThreads>>>(x, y, z, m, h, leafToInternal, numLeaves,
-                                                                         layout, centers, cellXmin, cellXmax);
-    for (int level = numLevels - 1; level >= 1; level--)
-    {
-        int numCellsLevel = levelRange[level].y - levelRange[level].x;
-        int numBlocks     = (numCellsLevel - 1) / numThreads + 1;
-        upsweepCenters<<<numBlocks, numThreads>>>(levelRange[level].x, levelRange[level].y, childOffsets, centers,
-                                                  cellXmin, cellXmax);
-    }
-
-    computeLeafMultipoles<<<(numLeaves - 1) / numThreads + 1, numThreads>>>(x, y, z, m, leafToInternal, numLeaves,
-                                                                            layout, centers, Multipole);
-    for (int level = numLevels - 1; level >= 1; level--)
-    {
-        int numCellsLevel = levelRange[level].y - levelRange[level].x;
-        int numBlocks     = (numCellsLevel - 1) / numThreads + 1;
-        upsweepMultipoles<<<numBlocks, numThreads>>>(levelRange[level].x, levelRange[level].y, childOffsets, centers,
-                                                     Multipole);
-    }
-
-    kernelSuccess("upwardPass");
-
-    int numBlocks = (numSources - 1) / numThreads + 1;
-    setMAC<<<numBlocks, numThreads>>>(numSources, T(1.0) / theta, centers, cellXmin, cellXmax);
-    normalize<<<numBlocks, numThreads>>>(numSources, Multipole);
-
-    auto   t1 = std::chrono::high_resolution_clock::now();
-    double dt = std::chrono::duration<double>(t1 - t0).count();
-
-    fprintf(stdout, "Upward pass          : %.7f s\n", dt);
-}
-
-} // namespace ryoanji
diff --git a/ryoanji/test/CMakeLists.txt b/ryoanji/test/CMakeLists.txt
index 4c1514d93..b7e6b3684 100644
--- a/ryoanji/test/CMakeLists.txt
+++ b/ryoanji/test/CMakeLists.txt
@@ -10,8 +10,9 @@ endif()
 
 if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
     add_executable(ryoanji_demo demo.cu)
-    target_link_libraries(ryoanji_demo PUBLIC cstone_tree)
+    target_link_libraries(ryoanji_demo PUBLIC ryoanji_kernels cstone_gpu)
     set_target_properties(ryoanji_demo PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_include_directories(ryoanji_demo PRIVATE ${RYOANJI_TEST_INCLUDE_DIRS})
     # set_source_files_properties(demo.cu PROPERTIES COMPILE_FLAGS "-Xptxas -v")
     install(TARGETS ryoanji_demo RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}/ryoanji/ryoanji_demo)
 endif ()
@@ -24,10 +25,6 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
 
     target_link_libraries(${testname} PRIVATE cstone_gpu ryoanji)
 
-    if (GPU_DIRECT)
-        target_compile_definitions(${testname} PRIVATE USE_GPU_DIRECT)
-    endif ()
-
     if (CMAKE_HIP_COMPILER)
         target_link_libraries(${testname} PRIVATE hip::host)
         target_compile_definitions(${testname} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
@@ -70,7 +67,7 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
             nbody/direct.cu
             test_main.cpp)
     target_include_directories(${testname} PRIVATE ${PROJECT_SOURCE_DIR}/src)
-    target_link_libraries(${testname} PUBLIC OpenMP::OpenMP_CXX cstone_tree ryoanji GTest::gtest_main)
+    target_link_libraries(${testname} PUBLIC OpenMP::OpenMP_CXX ryoanji GTest::gtest_main)
     if (CMAKE_HIP_COMPILER)
         set_target_properties(${testname} PROPERTIES LINKER_LANGUAGE CXX)
     endif ()
diff --git a/ryoanji/test/demo.cu b/ryoanji/test/demo.cu
index fbf620943..c9a5ad422 100644
--- a/ryoanji/test/demo.cu
+++ b/ryoanji/test/demo.cu
@@ -1,8 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -23,12 +22,14 @@
  * SOFTWARE.
  */
 
-#include <chrono>
-#include <numeric>
+/*! @file
+ * @brief  Single-GPU demonstrator app for the Ryoanji N-body library
+ *
+ * @author Sebastian Keller <sebastian.f.keller@gmail.com>
+ * @author Rio Yokota <rioyokota@gsic.titech.ac.jp>
+ */
 
-#ifdef __HIPCC__
-#include <hip/hip_runtime.h>
-#endif
+#include <chrono>
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
@@ -36,21 +37,36 @@
 #include "cstone/cuda/cuda_utils.cuh"
 #include "cstone/cuda/gpu_config.cuh"
 #include "cstone/cuda/thrust_util.cuh"
+#include "cstone/focus/source_center_gpu.h"
+#include "cstone/traversal/groups_gpu.h"
+#include "cstone/util/array.hpp"
 
 #include "nbody/dataset.hpp"
 #include "ryoanji/interface/treebuilder.cuh"
 #include "ryoanji/nbody/types.h"
 #include "ryoanji/nbody/traversal.cuh"
 #include "ryoanji/nbody/direct.cuh"
-#include "ryoanji/nbody/upwardpass.cuh"
+#include "ryoanji/nbody/upsweep_gpu.h"
 
 using namespace ryoanji;
 
+template<class Tc, class Th, class Tm, class Ta, class Tf, class MType>
+util::array<Tc, 5> computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z,
+                                       const Tm* m, const Th* h, Tc G, int numShells, const cstone::Box<Tc>& box, Ta* p,
+                                       Ta* ax, Tc* ay, Tc* az, const TreeNodeIndex* childOffsets,
+                                       const TreeNodeIndex* internalToLeaf, const LocalIndex* layout,
+                                       const Vec4<Tf>* sourceCenter, const MType* Multipole);
+
+template<class KeyType, class T, class MType>
+void upsweep(int numSources, int numLeaves, int numLevels, float theta, const TreeNodeIndex* levelRange, const T* x,
+             const T* y, const T* z, const T* m, const cstone::Box<T>& box, const LocalIndex* layout,
+             const KeyType* prefixes, const TreeNodeIndex* childOffsets, const TreeNodeIndex* leafToInternal,
+             Vec4<T>* centers, MType* Multipole);
+
 int main(int argc, char** argv)
 {
-    constexpr int P     = 4;
     using T             = float;
-    using MultipoleType = SphericalMultipole<T, P>;
+    using MultipoleType = CartesianQuadrupole<T>;
 
     int power     = argc > 1 ? std::stoi(argv[1]) : 17;
     int directRef = argc > 2 ? std::stoi(argv[2]) : 1;
@@ -65,7 +81,6 @@ int main(int argc, char** argv)
 
     fprintf(stdout, "--- BH Parameters ---------------\n");
     fprintf(stdout, "numBodies            : %lu\n", numBodies);
-    fprintf(stdout, "P                    : %d\n", P);
     fprintf(stdout, "theta                : %f\n", theta);
     fprintf(stdout, "ncrit                : %d\n", ncrit);
 
@@ -77,17 +92,17 @@ int main(int argc, char** argv)
 
     cstone::Box<T> box(-boxSize, boxSize);
 
-    TreeBuilder<uint64_t> treeBuilder;
+    TreeBuilder<uint64_t> treeBuilder(ncrit);
     int                   numSources = treeBuilder.update(rawPtr(d_x), rawPtr(d_y), rawPtr(d_z), numBodies, box);
 
-    std::vector<int2> levelRange(treeBuilder.maxTreeLevel() + 1);
-    int               highestLevel = treeBuilder.extract(levelRange.data());
+    const TreeNodeIndex* levelRange   = treeBuilder.levelRange();
+    int                  highestLevel = treeBuilder.maxTreeLevel();
 
     thrust::device_vector<Vec4<T>>       sourceCenter(numSources);
     thrust::device_vector<MultipoleType> Multipole(numSources);
 
-    upsweep(numSources, treeBuilder.numLeafNodes(), highestLevel, theta, levelRange.data(), rawPtr(d_x), rawPtr(d_y),
-            rawPtr(d_z), rawPtr(d_m), rawPtr(d_h), treeBuilder.layout(), treeBuilder.childOffsets(),
+    upsweep(numSources, treeBuilder.numLeafNodes(), highestLevel, theta, levelRange, rawPtr(d_x), rawPtr(d_y),
+            rawPtr(d_z), rawPtr(d_m), box, treeBuilder.layout(), treeBuilder.nodeKeys(), treeBuilder.childOffsets(),
             treeBuilder.leafToInternal(), rawPtr(sourceCenter), rawPtr(Multipole));
 
     thrust::device_vector<T> d_p(numBodies, 0), d_ax(numBodies, 0), d_ay(numBodies, 0), d_az(numBodies, 0);
@@ -103,7 +118,7 @@ int main(int argc, char** argv)
 
     auto   t1    = std::chrono::high_resolution_clock::now();
     double dt    = std::chrono::duration<double>(t1 - t0).count();
-    double flops = (interactions[0] * 20 + interactions[2] * 2 * pow(P, 3)) * numBodies / dt / 1e12;
+    double flops = (interactions[0] * 23 + interactions[2] * 65) * numBodies / dt / 1e12;
 
     fprintf(stdout, "--- Total runtime ----------------\n");
     fprintf(stdout, "Total BH            : %.7f s (%.7f TFlops)\n", dt, flops);
@@ -119,7 +134,7 @@ int main(int argc, char** argv)
     t1 = std::chrono::high_resolution_clock::now();
     dt = std::chrono::duration<double>(t1 - t0).count();
 
-    flops = std::pow((2 * numShells + 1), 3) * 24. * numBodies * numBodies / dt / 1e12;
+    flops = std::pow((2 * numShells + 1), 3) * 23. * numBodies * numBodies / dt / 1e12;
     fprintf(stdout, "Total Direct         : %.7f s (%.7f TFlops)\n", dt, flops);
 
     thrust::host_vector<T> h_p  = d_p;
@@ -165,3 +180,95 @@ int main(int argc, char** argv)
 
     return 0;
 }
+
+/*! @brief Compute approximate body accelerations with Barnes-Hut
+ *
+ * @param[in]    firstBody      index of first body in @p bodyPos to compute acceleration for
+ * @param[in]    lastBody       index (exclusive) of last body in @p bodyPos to compute acceleration for
+ * @param[in]    x,y,z,m,h      bodies, in SFC order and as referenced by sourceCells
+ * @param[in]    G              gravitational constant
+ * @param[in]    numShells      number of periodic shells in each dimension to include
+ * @param[in]    box            coordinate bounding box
+ * @param[inout] p              body potential to add to, on device
+ * @param[inout] ax,ay,az       body acceleration to add to
+ * @param[in]    childOffsets   location (index in [0:numTreeNodes]) of first child of each cell, 0 indicates a leaf
+ * @param[in]    internalToLeaf for each cell in [0:numTreeNodes], stores the leaf cell (cstone) index in [0:numLeaves]
+ *                              if the cell is not a leaf, the value is negative
+ * @param[in]    layout         for each leaf cell in [0:numLeaves], stores the index of the first body in the cell
+ * @param[in]    sourceCenter   x,y,z center and square MAC radius of each cell in [0:numTreeNodes]
+ * @param[in]    Multipole      cell multipoles, on device
+ * @return                      P2P and M2P interaction statistics
+ */
+template<class Tc, class Th, class Tm, class Ta, class Tf, class MType>
+util::array<Tc, 5> computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z,
+                                       const Tm* m, const Th* h, Tc G, int numShells, const cstone::Box<Tc>& box, Ta* p,
+                                       Ta* ax, Tc* ay, Tc* az, const TreeNodeIndex* childOffsets,
+                                       const TreeNodeIndex* internalToLeaf, const LocalIndex* layout,
+                                       const Vec4<Tf>* sourceCenter, const MType* Multipole)
+{
+    constexpr int numWarpsPerBlock = TravConfig::numThreads / GpuConfig::warpSize;
+
+    cstone::GroupData<cstone::GpuTag> groups;
+    cstone::computeFixedGroups(firstBody, lastBody, TravConfig::targetSize, groups);
+
+    LocalIndex numBodies  = lastBody - firstBody;
+    int        numTargets = (numBodies - 1) / TravConfig::targetSize + 1;
+    int        numBlocks  = (numTargets - 1) / numWarpsPerBlock + 1;
+    numBlocks             = std::min(numBlocks, TravConfig::maxNumActiveBlocks);
+
+    printf("launching %d blocks\n", numBlocks);
+
+    const int                  poolSize = TravConfig::memPerWarp * numWarpsPerBlock * numBlocks;
+    thrust::device_vector<int> globalPool(poolSize);
+
+    resetTraversalCounters<<<1, 1>>>();
+    traverse<<<numBlocks, TravConfig::numThreads>>>(
+        groups.view(), 1, x, y, z, m, h, childOffsets, internalToLeaf, layout, sourceCenter, Multipole, G, numShells,
+        {box.lx(), box.ly(), box.lz()}, p, ax, ay, az, thrust::raw_pointer_cast(globalPool.data()));
+    kernelSuccess("traverse");
+
+    typename BhStats::type stats[BhStats::numStats];
+    checkGpuErrors(cudaMemcpyFromSymbol(stats, GPU_SYMBOL(bhStats), BhStats::numStats * sizeof(BhStats::type)));
+
+    auto sumP2P = stats[BhStats::sumP2P];
+    auto maxP2P = stats[BhStats::maxP2P];
+    auto sumM2P = stats[BhStats::sumM2P];
+    auto maxM2P = stats[BhStats::maxM2P];
+
+    float totalPotential;
+    checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalPotentialGlob), sizeof(float)));
+
+    util::array<Tc, 5> interactions;
+    interactions[0] = Tc(sumP2P) / Tc(numBodies);
+    interactions[1] = Tc(maxP2P);
+    interactions[2] = Tc(sumM2P) / Tc(numBodies);
+    interactions[3] = Tc(maxM2P);
+    interactions[4] = totalPotential;
+
+    return interactions;
+}
+
+template<class KeyType, class T, class MType>
+void upsweep(int numSources, int numLeaves, int numLevels, float theta, const TreeNodeIndex* levelRange, const T* x,
+             const T* y, const T* z, const T* m, const cstone::Box<T>& box, const LocalIndex* layout,
+             const KeyType* prefixes, const TreeNodeIndex* childOffsets, const TreeNodeIndex* leafToInternal,
+             Vec4<T>* centers, MType* Multipole)
+{
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    cstone::computeLeafSourceCenterGpu(x, y, z, m, leafToInternal, numLeaves, layout, centers);
+    cstone::upsweepCentersGpu(cstone::maxTreeLevel<KeyType>{}, levelRange, childOffsets, centers);
+
+    computeLeafMultipoles(x, y, z, m, leafToInternal, numLeaves, layout, centers, Multipole);
+    for (int level = numLevels - 1; level >= 1; level--)
+    {
+        upsweepMultipoles(levelRange[level], levelRange[level + 1], childOffsets, centers, Multipole);
+    }
+
+    cstone::setMacGpu(prefixes, numSources, centers, 1.f / theta, box);
+
+    auto   t1 = std::chrono::high_resolution_clock::now();
+    double dt = std::chrono::duration<double>(t1 - t0).count();
+
+    fprintf(stdout, "Upward pass          : %.7f s\n", dt);
+}
diff --git a/ryoanji/test/demo_mpi.cpp b/ryoanji/test/demo_mpi.cpp
index 1211f9f1f..fcf972e6d 100644
--- a/ryoanji/test/demo_mpi.cpp
+++ b/ryoanji/test/demo_mpi.cpp
@@ -47,8 +47,7 @@ using namespace ryoanji;
 template<class T, class KeyType>
 void ryoanjiTest(int thisRank, int numRanks, size_t numParticlesGlobal)
 {
-    constexpr int P                = 4;
-    using MultipoleType            = SphericalMultipole<T, P>;
+    using MultipoleType            = CartesianQuadrupole<T>;
     size_t   numParticles          = numParticlesGlobal / numRanks;
     unsigned bucketSizeFocus       = 64;
     unsigned numGlobalNodesPerRank = 100;
@@ -114,7 +113,7 @@ void ryoanjiTest(int thisRank, int numRanks, size_t numParticlesGlobal)
     mpiAllreduce(&totalPotential, &totalPotentialGlobal, 1, MPI_SUM);
 
     auto [numP2P, maxP2P, numM2P, maxM2P, maxStack] = multipoleHolder.readStats();
-    double flops                                    = (numP2P * 23 + numM2P * 2 * pow(P, 3)) / dt / 1e12;
+    double flops                                    = (numP2P * 23 + numM2P * 65) / dt / 1e12;
 
     for (int rank = 0; rank < numRanks; ++rank)
     {
diff --git a/ryoanji/test/interface/CMakeLists.txt b/ryoanji/test/interface/CMakeLists.txt
index baf0a304c..ca2fac30b 100644
--- a/ryoanji/test/interface/CMakeLists.txt
+++ b/ryoanji/test/interface/CMakeLists.txt
@@ -17,10 +17,6 @@ function(addRyoanjiGpuMpiTest source exename testname ranks)
         target_link_libraries(${exename} PRIVATE cstone_gpu ryoanji)
     endif()
 
-    if (GPU_DIRECT)
-        target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT)
-    endif ()
-
     if (CMAKE_HIP_COMPILER)
         target_link_libraries(${exename} PRIVATE hip::host)
         target_compile_definitions(${exename} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
diff --git a/ryoanji/test/interface/global_upsweep_gpu.cpp b/ryoanji/test/interface/global_upsweep_gpu.cpp
index 4ec180057..3103b37b9 100644
--- a/ryoanji/test/interface/global_upsweep_gpu.cpp
+++ b/ryoanji/test/interface/global_upsweep_gpu.cpp
@@ -46,7 +46,7 @@ using namespace ryoanji;
 template<class T, class KeyType>
 static int multipoleHolderTest(int thisRank, int numRanks)
 {
-    using MultipoleType              = SphericalMultipole<T, 4>;
+    using MultipoleType              = CartesianQuadrupole<T>;
     const LocalIndex numParticles    = 1000 * numRanks;
     unsigned         bucketSize      = 64;
     unsigned         bucketSizeLocal = 16;
diff --git a/ryoanji/test/nbody/ewald.cu b/ryoanji/test/nbody/ewald.cu
index 962d57c48..8f5afe514 100644
--- a/ryoanji/test/nbody/ewald.cu
+++ b/ryoanji/test/nbody/ewald.cu
@@ -12,7 +12,7 @@
 #include "cstone/cuda/cuda_utils.cuh"
 #include "cstone/cuda/thrust_util.cuh"
 #include "cstone/focus/source_center.hpp"
-#include "cstone/traversal/groups.cuh"
+#include "cstone/traversal/groups_gpu.cu"
 
 #include "dataset.hpp"
 #include "ryoanji/nbody/cartesian_qpole.hpp"
diff --git a/ryoanji/test/nbody/ewald_cpu.cpp b/ryoanji/test/nbody/ewald_cpu.cpp
index 71ed1480e..3ed6dba46 100644
--- a/ryoanji/test/nbody/ewald_cpu.cpp
+++ b/ryoanji/test/nbody/ewald_cpu.cpp
@@ -47,6 +47,16 @@ const int TEST_RNG_SEED = 42;
 const int verbose = 0;
 #define V(level) if ((level) == verbose)
 
+template<class T>
+std::ostream& operator<<(std::ostream& os, const CartesianQuadrupole<T>& M)
+{
+    os << "m:  " << M[Cqi::mass] << " tr: " << M[Cqi::trace] << std::endl
+       << "xx: " << M[Cqi::qxx] << " xy: " << M[Cqi::qxy] << " xz: " << M[Cqi::qxz] << std::endl
+       << "yy: " << M[Cqi::qyy] << " yz: " << M[Cqi::qyz] << " zz: " << M[Cqi::qzz] << std::endl;
+
+    return os;
+}
+
 template<class T, class KeyType_>
 class GridCoordinates
 {
@@ -184,10 +194,6 @@ makeTestTree(Coords& coordinates, cstone::Box<T> box, float mass_scale, float th
     std::vector<MultipoleType> multipoles(octree.numNodes);
     computeLeafMultipoles(x, y, z, masses.data(), toInternal, layout.data(), centers.data(), multipoles.data());
     upsweepMultipoles(octree.levelRange, octree.childOffsets.data(), centers.data(), multipoles.data());
-    for (size_t i = 0; i < multipoles.size(); ++i)
-    {
-        multipoles[i] = ryoanji::normalize(multipoles[i]);
-    }
 
     T totalMass = std::accumulate(masses.begin(), masses.end(), 0.0);
     EXPECT_NEAR(totalMass, multipoles[0][ryoanji::Cqi::mass], 1e-6);
diff --git a/ryoanji/test/nbody/kernel.cpp b/ryoanji/test/nbody/kernel.cpp
index 29d830033..841b2c582 100644
--- a/ryoanji/test/nbody/kernel.cpp
+++ b/ryoanji/test/nbody/kernel.cpp
@@ -92,65 +92,3 @@ TEST(Gravity, P2PselfInteraction)
     auto self = P2P(acc, pos_i, pos_i, 1.0, h, h);
     EXPECT_EQ(self, acc);
 }
-
-TEST(Multipole, P2M)
-{
-    int numBodies = 1023;
-
-    std::vector<double> x(numBodies);
-    std::vector<double> y(numBodies);
-    std::vector<double> z(numBodies);
-    std::vector<double> m(numBodies);
-    std::vector<double> h(numBodies);
-
-    ryoanji::makeCubeBodies(x.data(), y.data(), z.data(), m.data(), h.data(), numBodies);
-
-    CartesianQuadrupole<double>      cartesianQuadrupole;
-    cstone::SourceCenterType<double> csCenter =
-        cstone::massCenter<double>(x.data(), y.data(), z.data(), m.data(), 0, numBodies);
-    P2M(x.data(), y.data(), z.data(), m.data(), 0, numBodies, csCenter, cartesianQuadrupole);
-
-    Vec4<double> centerMass = ryoanji::setCenter(0, numBodies, x.data(), y.data(), z.data(), m.data());
-
-    ryoanji::SphericalMultipole<double, 4> multipole;
-    std::fill(multipole.begin(), multipole.end(), 0.0);
-
-    ryoanji::P2M(x.data(), y.data(), z.data(), m.data(), 0, numBodies, centerMass, multipole);
-
-    EXPECT_NEAR(multipole[0], cartesianQuadrupole[Cqi::mass], 1e-6);
-
-    EXPECT_NEAR(centerMass[0], csCenter[0], 1e-6);
-    EXPECT_NEAR(centerMass[1], csCenter[1], 1e-6);
-    EXPECT_NEAR(centerMass[2], csCenter[2], 1e-6);
-    EXPECT_NEAR(centerMass[3], cartesianQuadrupole[Cqi::mass], 1e-6);
-
-    // compare M2P results on a test target
-    {
-        Vec3<double> testTarget{-8, -8, -8};
-
-        Vec4<double> accM2P{0, 0, 0, 0};
-        accM2P = ryoanji::M2P(accM2P, testTarget, util::makeVec3(centerMass), multipole);
-        // printf("test acceleration: %f %f %f %f\n", acc[0], acc[1], acc[2], acc[3]);
-
-        // cstone is less precise
-        // float ax = 0;
-        // float ay = 0;
-        // float az = 0;
-        // cstone::multipole2particle(
-        //    testTarget[0], testTarget[1], testTarget[2], cstoneMultipole, eps2, &ax, &ay, &az);
-        // printf("cstone test acceleration: %f %f %f\n", ax, ay, az);
-
-        Vec4<double> accP2P{0, 0, 0, 0};
-        for (int i = 0; i < numBodies; ++i)
-        {
-            accP2P = P2P(accP2P, testTarget, Vec3<double>{x[i], y[i], z[i]}, m[i], 0.0, 0.0);
-        }
-        // printf("direct acceleration: %f %f %f\n", axd, ayd, azd);
-
-        // compare ryoanji against the direct sum reference
-        EXPECT_NEAR(accM2P[0], accP2P[0], 3e-5);
-        EXPECT_NEAR(accM2P[1], accP2P[1], 1e-5);
-        EXPECT_NEAR(accM2P[2], accP2P[2], 1e-5);
-        EXPECT_NEAR(accM2P[3], accP2P[3], 1e-5);
-    }
-}
diff --git a/ryoanji/test/nbody/traversal_cpu.cpp b/ryoanji/test/nbody/traversal_cpu.cpp
index aea8b12aa..2c40e69de 100644
--- a/ryoanji/test/nbody/traversal_cpu.cpp
+++ b/ryoanji/test/nbody/traversal_cpu.cpp
@@ -88,10 +88,6 @@ TEST(Gravity, TreeWalk)
     std::vector<MultipoleType> multipoles(octree.numNodes);
     computeLeafMultipoles(x, y, z, masses.data(), toInternal, layout.data(), centers.data(), multipoles.data());
     upsweepMultipoles(octree.levelRange, octree.childOffsets.data(), centers.data(), multipoles.data());
-    for (size_t i = 0; i < multipoles.size(); ++i)
-    {
-        multipoles[i] = ryoanji::normalize(multipoles[i]);
-    }
 
     T totalMass = std::accumulate(masses.begin(), masses.end(), 0.0);
     EXPECT_TRUE(std::abs(totalMass - multipoles[0][ryoanji::Cqi::mass]) < 1e-6);
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
new file mode 100644
index 000000000..21ce10e94
--- /dev/null
+++ b/scripts/CMakeLists.txt
@@ -0,0 +1,6 @@
+# temporary fix for hip build
+if(NOT DEFINED CMAKE_INSTALL_BINDIR)
+  set(CMAKE_INSTALL_BINDIR "bin")
+endif(NOT DEFINED CMAKE_INSTALL_BINDIR)
+
+install(PROGRAMS plot_power.py set_parms.py DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/scripts/plot_power.py b/scripts/plot_power.py
new file mode 100755
index 000000000..8d5b9216b
--- /dev/null
+++ b/scripts/plot_power.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+"""
+SPH-EXA
+Copyright (c) 2024 CSCS, ETH Zurich
+
+Usage examples:
+    Run a simulation on a system that provides power counters (Cray) with profiling turned on
+    $ sphexa --init evrard --glass 50c.h5 -n 200 -s 10 --profile
+    $ plot_power.py profile.h5
+"""
+
+__author__ = "Sebastian Keller (sebastian.f.keller@gmail.com)"
+
+import numpy as np
+import h5py
+import sys
+
+import matplotlib
+import matplotlib.pyplot as plt
+
+font = {'size': 16}
+matplotlib.rc('font', **font)
+
+
+def convertToIntervals(data):
+    """ Convert numRanks x numInterations x numMeasurmentsPerIterations points to intervals """
+    return data[:, :, 1:] - data[:, :, :-1]
+
+
+def convertBarToLine(data):
+    """ Convert a series bar plots data points to equivalent line plot data """
+    return np.append(np.array([0]), np.repeat(data, 2))[:-1]
+
+
+def removeZeros(data):
+    """ Remove all elements in axis 0 where the sum across axes 1 and 2 is zero """
+    return data[np.sum(data, axis=(1, 2)) != 0.0]
+
+
+def loadField(fname, what, stepnr):
+    """ Load /Step#stepnr/what and /Step#stepnr/what_timeStams from file and reshape into
+        numRanks x numIterations x numMeasurementsPerIteration
+    """
+    h5file = h5py.File(fname, "r")
+    h5step = h5file["/Step#" + str(stepnr)]
+    numRanks = h5step.attrs["numRanks"][0]
+    numIterations = h5step.attrs["numIterations"][0]
+    raw = np.array(h5step[what])
+    numPoints = len(raw)
+    ret = np.reshape(raw, (numRanks, numIterations, int(numPoints / numRanks / numIterations)))
+    return removeZeros(ret)
+
+
+def loadEnergy(fname, what, stepnr):
+    """ Load what and what_timeStamps from file """
+
+    what_ts = what + "_timeStamps"
+
+    dt = 1e-6 * loadField(fname, what_ts, stepnr)
+    # average across ranks
+    dt_avg = np.average(convertToIntervals(dt), axis=0)
+    # scan to get total time and repeat data points to get flat lines within intervals
+    timeline = convertBarToLine(np.cumsum(dt_avg.flatten()))
+
+    energy = loadField(fname, what, stepnr)
+    # average across ranks
+    energy_avg = np.average(convertToIntervals(energy), axis=0)
+    power = (energy_avg / dt_avg)
+    # repeat to get flat lines within intervals
+    powerline = np.repeat(power.flatten(), 2)
+
+    print("Loaded energy data \"{0}\" with dimensions".format(what), energy.shape,
+          "= (numCounters x numIterations x numMeasurementsPerIteration)")
+
+    fields = {}
+    fields["x"] = timeline
+    fields["y"] = powerline
+    fields["numCounters"] = energy.shape[0]
+
+    return fields
+
+
+def plotModulePower(fname):
+    modulePower = loadEnergy(fname, "acc", 2)
+    numAccelerators = modulePower["numCounters"]
+
+    fig = plt.figure(figsize=(15, 11))
+    plt.title("SPH-EXA, power consumption, averaged over %d accelerators" % numAccelerators)
+    plt.xlabel("runtime [s]", fontsize=16)
+    plt.ylabel("average module power [W]", fontsize=16)
+    plt.plot(modulePower["x"], modulePower["y"], linewidth=2, color="g", label="module power")
+    plt.savefig("sphexa-accel-power.png", bbox_inches="tight")
+
+
+def plotNodePower(fname):
+    nodePower = loadEnergy(fname, "node", 1)
+    numNodes = nodePower["numCounters"]
+
+    fig = plt.figure(figsize=(15, 11))
+    plt.title("SPH-EXA, power consumption, averaged over %d nodes" % numNodes)
+    plt.xlabel("runtime [s]", fontsize=16)
+    plt.ylabel("average node power [W]", fontsize=16)
+
+    plt.axhline(y=np.max(nodePower["y"]), color="b", linestyle="--", linewidth=1,
+                label="%dW max measurement" % np.max(nodePower["y"]))
+
+    simlabel = "simulation, (average power per node for a %d-node simulation)" % numNodes
+    plt.plot(nodePower["x"], nodePower["y"], linewidth=2, color="g", label=simlabel)
+
+    plt.legend(loc="lower left")
+    plt.savefig("sphexa-node-power.png", bbox_inches="tight")
+
+
+if __name__ == "__main__":
+    plotModulePower(sys.argv[1])
+    plotNodePower(sys.argv[1])
diff --git a/scripts/spack/package.py b/scripts/spack/package.py
deleted file mode 100644
index 231f6ec45..000000000
--- a/scripts/spack/package.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2021 CSCS, ETH Zurich
-#               2021 University of Basel
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-# ----------------------------------------------------------------------------
-
-from spack import *
-
-
-class SphExaMiniApp(CMakePackage, CudaPackage):
-    """
-    The SPH-EXA mini-app is a scientific research code; its goal is to scale
-    the Smoothed Particle Hydrodynamics (SPH) method to enable Tier-0 and
-    Exascale simulations: https://hpc.dmi.unibas.ch/en/research/sph-exa/
-    Example: spack install sph-exa-mini-app+cuda %gcc@9.3.0.21.05
-    """
-    homepage = "https://github.com/unibas-dmi-hpc/SPH-EXA_mini-app"
-    git = "https://github.com/unibas-dmi-hpc/SPH-EXA_mini-app.git"
-    maintainers = ['jgphpc', 'sebkelle1']
-    version('develop', branch='develop', submodules=True, preferred=True)
-    depends_on('cmake@3.20.2:', type='build')
-    depends_on('mpi')
-    depends_on('cuda@10:', when='+cuda')
-    variant('mpi', default=True, description='Enable MPI support')
-    variant('cuda', default=False, description='Enable CUDA support')
-    sanity_check_is_file = [join_path('bin', 'sedov')]
-
-    def cmake_args(self):
-        # Add arguments other than CMAKE_INSTALL_PREFIX and CMAKE_BUILD_TYPE
-        args = ['-DCMAKE_VERBOSE_MAKEFILE=ON']
-        args.append(f'-DCMAKE_CXX_COMPILER={self.compiler.cxx}')
-        # -DUSE_CUDA set in src/sedov/CMakeLists.txt target_compile_definitions
-        if '+cuda' in self.spec:
-            sanity_check_is_file = [join_path('bin', 'sedov-cuda')]
-
-        return args
diff --git a/sph/include/sph/CMakeLists.txt b/sph/include/sph/CMakeLists.txt
index 76d68beca..64ec7e99c 100644
--- a/sph/include/sph/CMakeLists.txt
+++ b/sph/include/sph/CMakeLists.txt
@@ -3,7 +3,7 @@ add_subdirectory(hydro_turb)
 add_subdirectory(hydro_ve)
 
 set(TIMESTEP_GPU_KERNELS
-        groups.cu positions_gpu.cu ts_groups.cu update_h_gpu.cu
+        positions_gpu.cu ts_groups.cu update_h_gpu.cu
 )
 
 if (CMAKE_HIP_COMPILER)
diff --git a/sph/include/sph/groups.cu b/sph/include/sph/groups.cu
deleted file mode 100644
index 06e231e22..000000000
--- a/sph/include/sph/groups.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-/*! @file
- * @brief Compute groups of target particles that a) fit in a warp and b) have compact bounding boxes
- *
- * @author Sebastian Keller <sebastian.f.keller@gmail.com>
- */
-
-#include "thrust/device_vector.h"
-
-#include "cstone/traversal/find_neighbors.cuh"
-#include "cstone/traversal/groups.cuh"
-
-#include "sph/sph_gpu.hpp"
-#include "sph/particles_data.hpp"
-
-namespace sph
-{
-
-using cstone::GpuConfig;
-using cstone::TravConfig;
-
-/*! @brief Compute groups of SFC-consecutive particles with compact bounding volumes
- *
- * @tparam     Dataset
- * @param[in]  startIndex   first particle index to include in groups
- * @param[in]  endIndex     last particle index in include in groups
- * @param[in]  d            Dataset with particle x,y,z,h arrays
- * @param[in]  box          global coordinate bounding box
- * @param[out] groups       output particle groups
- */
-template<class Dataset>
-void computeSpatialGroups(size_t startIndex, size_t endIndex, Dataset& d,
-                          const cstone::Box<typename Dataset::RealType>& box, GroupData<cstone::GpuTag>& groups)
-{
-    cstone::DeviceVector<util::array<GpuConfig::ThreadMask, TravConfig::nwt>> S;
-
-    float tolFactor = 2.0f;
-    cstone::computeGroupSplits<TravConfig::targetSize>(startIndex, endIndex, rawPtr(d.devData.x), rawPtr(d.devData.y),
-                                                       rawPtr(d.devData.z), rawPtr(d.devData.h), d.treeView.leaves,
-                                                       d.treeView.numLeafNodes, d.treeView.layout, box, tolFactor, S,
-                                                       d.devData.traversalStack, groups.data);
-
-    groups.firstBody  = startIndex;
-    groups.lastBody   = endIndex;
-    groups.numGroups  = groups.data.size() - 1;
-    groups.groupStart = rawPtr(groups.data);
-    groups.groupEnd   = rawPtr(groups.data) + 1;
-}
-
-template void computeSpatialGroups(size_t, size_t, sphexa::ParticlesData<cstone::GpuTag>&,
-                                   const cstone::Box<SphTypes::CoordinateType>&, GroupData<cstone::GpuTag>&);
-
-} // namespace sph
diff --git a/sph/include/sph/groups.hpp b/sph/include/sph/groups.hpp
index e55eef642..c6e4b12b9 100644
--- a/sph/include/sph/groups.hpp
+++ b/sph/include/sph/groups.hpp
@@ -1,3 +1,27 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /*! @file
  * @brief Target particle group configuration
  *
@@ -8,11 +32,28 @@
 
 #include "cstone/sfc/box.hpp"
 #include "cstone/primitives/primitives_gpu.h"
+#include "cstone/traversal/groups_gpu.h"
 #include "sph/sph_gpu.hpp"
 
 namespace sph
 {
 
+template<class Dataset>
+void computeSpatialGroups(cstone::LocalIndex startIndex, cstone::LocalIndex endIndex, Dataset& d,
+                          const cstone::Box<typename Dataset::RealType>& box, GroupData<cstone::GpuTag>& groups)
+{
+    float tolFactor = 2.0f;
+    cstone::computeGroupSplits(startIndex, endIndex, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z),
+                               rawPtr(d.devData.h), d.treeView.leaves, d.treeView.numLeafNodes, d.treeView.layout, box,
+                               nsGroupSize(), tolFactor, d.devData.traversalStack, groups.data);
+
+    groups.firstBody  = startIndex;
+    groups.lastBody   = endIndex;
+    groups.numGroups  = groups.data.size() - 1;
+    groups.groupStart = rawPtr(groups.data);
+    groups.groupEnd   = rawPtr(groups.data) + 1;
+}
+
 //! @brief Compute spatial (=SFC-consecutive) groups of particles with compact bounding boxes
 template<typename Tc, class Dataset>
 void computeGroups(size_t startIndex, size_t endIndex, Dataset& d, const cstone::Box<Tc>& box,
diff --git a/sph/include/sph/hydro_std/iad_gpu.cu b/sph/include/sph/hydro_std/iad_gpu.cu
index 3d4b97310..b440a932f 100644
--- a/sph/include/sph/hydro_std/iad_gpu.cu
+++ b/sph/include/sph/hydro_std/iad_gpu.cu
@@ -31,7 +31,6 @@
  */
 
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/findneighbors.hpp"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
diff --git a/sph/include/sph/hydro_std/momentum_energy_gpu.cu b/sph/include/sph/hydro_std/momentum_energy_gpu.cu
index 541283783..81d624857 100644
--- a/sph/include/sph/hydro_std/momentum_energy_gpu.cu
+++ b/sph/include/sph/hydro_std/momentum_energy_gpu.cu
@@ -29,15 +29,13 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
-#include <cub/cub.cuh>
-
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/findneighbors.hpp"
+#include "cstone/primitives/warpscan.cuh"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
 #include "sph/particles_data.hpp"
-#include "sph/util/device_math.cuh"
 #include "sph/hydro_std/momentum_energy_kern.hpp"
 
 namespace sph
@@ -99,7 +97,7 @@ __global__ void cudaGradP(Tc K, Tc Kcour, unsigned ngmax, cstone::Box<Tc> box, c
     T           blockMin = reduce.Reduce(dt_i, cub::Min());
     __syncthreads();
 
-    if (threadIdx.x == 0) { atomicMinFloat(&minDt_device, blockMin); }
+    if (threadIdx.x == 0) { cstone::atomicMinFloat(&minDt_device, blockMin); }
 }
 
 template<class Dataset>
@@ -109,7 +107,7 @@ void computeMomentumEnergyStdGpu(const GroupView& grp, Dataset& d, const cstone:
     cstone::resetTraversalCounters<<<1, 1>>>();
 
     float huge = 1e10;
-    checkGpuErrors(cudaMemcpyToSymbol(minDt_device, &huge, sizeof(huge)));
+    checkGpuErrors(cudaMemcpyToSymbol(GPU_SYMBOL(minDt_device), &huge, sizeof(huge)));
     cstone::resetTraversalCounters<<<1, 1>>>();
 
     cudaGradP<<<TravConfig::numBlocks(), TravConfig::numThreads>>>(
@@ -123,7 +121,7 @@ void computeMomentumEnergyStdGpu(const GroupView& grp, Dataset& d, const cstone:
     checkGpuErrors(cudaGetLastError());
 
     float minDt;
-    checkGpuErrors(cudaMemcpyFromSymbol(&minDt, minDt_device, sizeof(minDt)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&minDt, GPU_SYMBOL(minDt_device), sizeof(minDt)));
     d.minDtCourant = minDt;
 }
 
diff --git a/sph/include/sph/hydro_ve/av_switches_gpu.cu b/sph/include/sph/hydro_ve/av_switches_gpu.cu
index 929ebdb18..90376677e 100644
--- a/sph/include/sph/hydro_ve/av_switches_gpu.cu
+++ b/sph/include/sph/hydro_ve/av_switches_gpu.cu
@@ -30,7 +30,6 @@
  */
 
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/findneighbors.hpp"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
diff --git a/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu b/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu
index ad7384252..5f34ef5c1 100644
--- a/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu
+++ b/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu
@@ -30,7 +30,6 @@
  */
 
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/findneighbors.hpp"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
diff --git a/sph/include/sph/hydro_ve/momentum_energy_gpu.cu b/sph/include/sph/hydro_ve/momentum_energy_gpu.cu
index b86733368..2b81b0737 100644
--- a/sph/include/sph/hydro_ve/momentum_energy_gpu.cu
+++ b/sph/include/sph/hydro_ve/momentum_energy_gpu.cu
@@ -29,14 +29,13 @@
  * @author Sebastian Keller <sebastian.f.keller@gmail.com>
  */
 
-#include <cub/cub.cuh>
-
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/cuda_utils.cuh"
+#include "cstone/primitives/warpscan.cuh"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
 #include "sph/particles_data.hpp"
-#include "sph/util/device_math.cuh"
 #include "sph/hydro_ve/momentum_energy_kern.hpp"
 
 namespace sph
@@ -114,7 +113,7 @@ __global__ void momentumEnergyGpu(Tc K, Tc Kcour, T Atmin, T Atmax, T ramp, unsi
     T           blockMin = reduce.Reduce(dt_i, cub::Min());
     __syncthreads();
 
-    if (threadIdx.x == 0) { atomicMinFloat(&minDt_ve_device, blockMin); }
+    if (threadIdx.x == 0) { cstone::atomicMinFloat(&minDt_ve_device, blockMin); }
 }
 
 template<bool avClean, class Dataset>
@@ -124,7 +123,7 @@ void computeMomentumEnergy(const GroupView& grp, float* groupDt, Dataset& d,
     auto [traversalPool, nidxPool] = cstone::allocateNcStacks(d.devData.traversalStack, d.ngmax);
 
     float huge = 1e10;
-    checkGpuErrors(cudaMemcpyToSymbol(minDt_ve_device, &huge, sizeof(huge)));
+    checkGpuErrors(cudaMemcpyToSymbol(GPU_SYMBOL(minDt_ve_device), &huge, sizeof(huge)));
     cstone::resetTraversalCounters<<<1, 1>>>();
 
     momentumEnergyGpu<avClean><<<TravConfig::numBlocks(), TravConfig::numThreads>>>(
@@ -140,7 +139,7 @@ void computeMomentumEnergy(const GroupView& grp, float* groupDt, Dataset& d,
     checkGpuErrors(cudaGetLastError());
 
     float minDt;
-    checkGpuErrors(cudaMemcpyFromSymbol(&minDt, minDt_ve_device, sizeof(minDt)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&minDt, GPU_SYMBOL(minDt_ve_device), sizeof(minDt)));
     d.minDtCourant = minDt;
 }
 
diff --git a/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu b/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu
index 38916b97d..b9f622480 100644
--- a/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu
+++ b/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu
@@ -30,7 +30,6 @@
  */
 
 #include "cstone/cuda/cuda_utils.cuh"
-#include "cstone/findneighbors.hpp"
 #include "cstone/traversal/find_neighbors.cuh"
 
 #include "sph/sph_gpu.hpp"
diff --git a/sph/include/sph/hydro_ve/xmass_gpu.cu b/sph/include/sph/hydro_ve/xmass_gpu.cu
index cdbfa2115..c908fa2d6 100644
--- a/sph/include/sph/hydro_ve/xmass_gpu.cu
+++ b/sph/include/sph/hydro_ve/xmass_gpu.cu
@@ -1,8 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -47,6 +46,8 @@ using cstone::NcStats;
 using cstone::TravConfig;
 using cstone::TreeNodeIndex;
 
+unsigned nsGroupSize() { return TravConfig::targetSize; }
+
 namespace cuda
 {
 
@@ -113,10 +114,10 @@ void computeXMass(const GroupView& grp, Dataset& d, const cstone::Box<typename D
     checkGpuErrors(cudaDeviceSynchronize());
 
     NcStats::type stats[NcStats::numStats];
-    checkGpuErrors(cudaMemcpyFromSymbol(stats, cstone::ncStats, NcStats::numStats * sizeof(NcStats::type)));
+    checkGpuErrors(cudaMemcpyFromSymbol(stats, GPU_SYMBOL(cstone::ncStats), NcStats::numStats * sizeof(NcStats::type)));
 
     bool convergenceFailure;
-    checkGpuErrors(cudaMemcpyFromSymbol(&convergenceFailure, nc_h_convergenceFailure, sizeof(bool)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&convergenceFailure, GPU_SYMBOL(nc_h_convergenceFailure), sizeof(bool)));
 
     NcStats::type maxP2P   = stats[cstone::NcStats::maxP2P];
     NcStats::type maxStack = stats[cstone::NcStats::maxStack];
diff --git a/sph/include/sph/particles_data.hpp b/sph/include/sph/particles_data.hpp
index ea0612d4b..f10d71ce0 100644
--- a/sph/include/sph/particles_data.hpp
+++ b/sph/include/sph/particles_data.hpp
@@ -50,7 +50,6 @@
 #include "sph_kernel_tables.hpp"
 
 #if defined(USE_CUDA)
-#include "sph/util/pinned_allocator.cuh"
 #include "particles_data_gpu.cuh"
 #endif
 
diff --git a/sph/include/sph/particles_data_gpu.cuh b/sph/include/sph/particles_data_gpu.cuh
index 5b665e5fe..72ab8f5f8 100644
--- a/sph/include/sph/particles_data_gpu.cuh
+++ b/sph/include/sph/particles_data_gpu.cuh
@@ -30,7 +30,6 @@
 
 #pragma once
 
-#include <cuda_runtime.h>
 #include <variant>
 
 #include "cstone/cuda/cuda_utils.cuh"
diff --git a/sph/include/sph/sph_gpu.hpp b/sph/include/sph/sph_gpu.hpp
index c5f92f888..9298ba734 100644
--- a/sph/include/sph/sph_gpu.hpp
+++ b/sph/include/sph/sph_gpu.hpp
@@ -12,10 +12,6 @@ namespace sph
 using cstone::GroupData;
 using cstone::GroupView;
 
-template<class Dataset>
-extern void computeSpatialGroups(size_t, size_t, Dataset& d, const cstone::Box<typename Dataset::RealType>&,
-                                 GroupData<cstone::GpuTag>&);
-
 template<class Dataset>
 extern void computeIADGpu(const GroupView&, Dataset& d, const cstone::Box<typename Dataset::RealType>&);
 
@@ -76,4 +72,7 @@ extern void groupAccTimestepGpu(float etaAcc, const GroupView&, const T* ax, con
 
 void storeRungGpu(const GroupView& grp, uint8_t rung, uint8_t* particleRungs);
 
+//! @brief max number of particles per group used in neighbor search for SPH
+unsigned nsGroupSize();
+
 } // namespace sph
diff --git a/sph/include/sph/util/device_math.cuh b/sph/include/sph/util/device_math.cuh
deleted file mode 100644
index e7a085495..000000000
--- a/sph/include/sph/util/device_math.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2022 CSCS, ETH Zurich
- *               2022 University of Basel
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*! @file
- * @brief GPU specific math functions
- * @author Sebastian Keller <sebastian.f.keller@gmail.com>
- */
-
-#pragma once
-
-__device__ __forceinline__ float atomicMinFloat(float* addr, float value)
-{
-    float old;
-    old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
-                       : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
-
-    return old;
-}
\ No newline at end of file
diff --git a/sph/include/sph/util/pinned_allocator.cuh b/sph/include/sph/util/pinned_allocator.cuh
deleted file mode 100644
index 283c6ad94..000000000
--- a/sph/include/sph/util/pinned_allocator.cuh
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*! @file
- *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
- */
-
-#pragma once
-
-#include <stdexcept>
-#include <limits>
-#include <string>
-
-#include <cuda_runtime.h>
-
-/*! @brief pinned_allocator is a CUDA-specific host memory allocator
- *  that employs @c cudaMallocHost for allocation.
- *
- *  @see http://www.sgi.com/tech/stl/Allocators.html
- */
-template<typename T>
-class pinned_allocator;
-
-template<>
-class pinned_allocator<void>
-{
-public:
-    typedef void           value_type;
-    typedef void*          pointer;
-    typedef const void*    const_pointer;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-
-    // convert a pinned_allocator<void> to pinned_allocator<U>
-    template<typename U>
-    struct rebind
-    {
-        typedef pinned_allocator<U> other;
-    }; // end rebind
-};     // end pinned_allocator
-
-template<typename T>
-class pinned_allocator
-{
-public:
-    //! \{
-    typedef T              value_type;
-    typedef T*             pointer;
-    typedef const T*       const_pointer;
-    typedef T&             reference;
-    typedef const T&       const_reference;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    //! \}
-
-    // convert a pinned_allocator<T> to pinned_allocator<U>
-    template<typename U>
-    struct rebind
-    {
-        typedef pinned_allocator<U> other;
-    }; // end rebind
-
-    /*! \p pinned_allocator's null constructor does nothing.
-     */
-    inline pinned_allocator() {}
-
-    /*! \p pinned_allocator's null destructor does nothing.
-     */
-    inline ~pinned_allocator() {}
-
-    /*! \p pinned_allocator's copy constructor does nothing.
-     */
-    inline pinned_allocator(pinned_allocator const&) {}
-
-    /*! This version of \p pinned_allocator's copy constructor
-     *  is templated on the \c value_type of the \p pinned_allocator
-     *  to copy from.  It is provided merely for convenience; it
-     *  does nothing.
-     */
-    template<typename U>
-    inline pinned_allocator(pinned_allocator<U> const&)
-    {
-    }
-
-    /*! This method returns the address of a \c reference of
-     *  interest.
-     *
-     *  \p r The \c reference of interest.
-     *  \return \c r's address.
-     */
-    inline pointer address(reference r) { return &r; }
-
-    /*! This method returns the address of a \c const_reference
-     *  of interest.
-     *
-     *  \p r The \c const_reference of interest.
-     *  \return \c r's address.
-     */
-    inline const_pointer address(const_reference r) { return &r; }
-
-    /*! This method allocates storage for objects in pinned host
-     *  memory.
-     *
-     *  \p cnt The number of objects to allocate.
-     *  \return a \c pointer to the newly allocated objects.
-     *  \note This method does not invoke \p value_type's constructor.
-     *        It is the responsibility of the caller to initialize the
-     *        objects at the returned \c pointer.
-     */
-    inline pointer allocate(size_type cnt, const_pointer = 0)
-    {
-        if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if
-
-        pointer     result(0);
-        cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
-
-        if (error) { throw std::bad_alloc(); } // end if
-
-        return result;
-    } // end allocate()
-
-    /*! This method deallocates pinned host memory previously allocated
-     *  with this \c pinned_allocator.
-     *
-     *  \p p A \c pointer to the previously allocated memory.
-     *  \p cnt The number of objects previously allocated at
-     *         \p p.
-     *  \note This method does not invoke \p value_type's destructor.
-     *        It is the responsibility of the caller to destroy
-     *        the objects stored at \p p.
-     */
-    inline void deallocate(pointer p, size_type /*cnt*/)
-    {
-        cudaError_t error = cudaFreeHost(p);
-
-        if (error) { throw std::runtime_error("cudaFreeHost error\n"); } // end if
-    }                                                                    // end deallocate()
-
-    /*! This method returns the maximum size of the \c cnt parameter
-     *  accepted by the \p allocate() method.
-     *
-     *  \return The maximum number of objects that may be allocated
-     *          by a single call to \p allocate().
-     */
-    inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); } // end max_size()
-
-    /*! This method tests this \p pinned_allocator for equality to
-     *  another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c true.
-     */
-    inline bool operator==([[maybe_unused]] pinned_allocator const& x) const { return true; }
-
-    /*! This method tests this \p pinned_allocator for inequality
-     *  to another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c false.
-     */
-    inline bool operator!=(pinned_allocator const& x) const { return !operator==(x); }
-}; // end pinned_allocator
-
-/*! \}
- */