From 4be3f10a95f260d4e3fb901e98a8336ac6af2478 Mon Sep 17 00:00:00 2001 From: Sebastian Keller <44800242+sekelle@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:15:04 +0100 Subject: [PATCH] HIP and Spack (#468) * HIP compatibility, remove need for hipify-perl * split GPU N-body code into additional units handle gpu-direct with interface compile_definition removed duplicate cartesian multipole implementation --- .gitlab/Dockerfile_hip_2 | 10 +- CMakeLists.txt | 47 +- README.md | 9 +- README_insitu.md | 1 - domain/CMakeLists.txt | 43 +- domain/README.md | 2 +- domain/include/cstone/CMakeLists.txt | 4 + domain/include/cstone/cuda/cub.hpp | 14 + domain/include/cstone/cuda/cuda_runtime.hpp | 61 ++ domain/include/cstone/cuda/cuda_utils.cuh | 3 +- domain/include/cstone/cuda/device_vector.cu | 5 +- domain/include/cstone/cuda/errorcheck.cuh | 2 +- domain/include/cstone/cuda/gpu_config.cuh | 3 +- domain/include/cstone/focus/rebalance_gpu.cu | 9 +- .../include/cstone/focus/source_center_gpu.h | 1 + .../include/cstone/halos/gather_halos_gpu.cu | 1 + domain/include/cstone/primitives/mpi_cuda.cuh | 3 +- .../cstone/primitives/primitives_gpu.cu | 5 +- domain/include/cstone/primitives/warpscan.cuh | 14 +- .../include/cstone/traversal/CMakeLists.txt | 6 +- .../cstone/traversal/find_neighbors.cuh | 3 +- domain/include/cstone/traversal/groups_gpu.cu | 161 ++++++ .../traversal/{groups.cuh => groups_gpu.cuh} | 92 +-- domain/include/cstone/traversal/groups_gpu.h | 90 +++ domain/include/cstone/tree/csarray_gpu.cu | 1 + domain/test/integration_mpi/CMakeLists.txt | 3 - .../integration_mpi/exchange_halos_gpu.cpp | 2 +- domain/test/performance/CMakeLists.txt | 8 +- domain/test/performance/neighbor_driver.cu | 1 - domain/test/performance/timing.cuh | 19 +- domain/test/unit_cuda/traversal/groups.cu | 25 +- main/src/analytical_solutions/CMakeLists.txt | 7 +- main/src/propagator/CMakeLists.txt | 3 - main/src/sphexa/CMakeLists.txt | 8 - main/src/util/pm_reader.hpp | 63 +- ryoanji/CMakeLists.txt | 39 +- ryoanji/src/ryoanji/CMakeLists.txt | 1 + ryoanji/src/ryoanji/interface/CMakeLists.txt | 9 +- ryoanji/src/ryoanji/interface/ewald.cu | 5 +- .../src/ryoanji/interface/multipole_holder.cu | 52 +- ryoanji/src/ryoanji/interface/treebuilder.cu | 239 -------- ryoanji/src/ryoanji/interface/treebuilder.cuh | 99 +++- ryoanji/src/ryoanji/nbody/CMakeLists.txt | 8 + ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp | 21 - ryoanji/src/ryoanji/nbody/ewald.hpp | 8 - ryoanji/src/ryoanji/nbody/kernel.hpp | 546 +----------------- ryoanji/src/ryoanji/nbody/traversal.cuh | 77 +-- ryoanji/src/ryoanji/nbody/types.h | 41 +- ryoanji/src/ryoanji/nbody/upsweep_gpu.cu | 115 ++++ ryoanji/src/ryoanji/nbody/upsweep_gpu.h | 58 ++ ryoanji/src/ryoanji/nbody/upwardpass.cuh | 235 -------- ryoanji/test/CMakeLists.txt | 9 +- ryoanji/test/demo.cu | 143 ++++- ryoanji/test/demo_mpi.cpp | 5 +- ryoanji/test/interface/CMakeLists.txt | 4 - ryoanji/test/interface/global_upsweep_gpu.cpp | 2 +- ryoanji/test/nbody/ewald.cu | 2 +- ryoanji/test/nbody/ewald_cpu.cpp | 14 +- ryoanji/test/nbody/kernel.cpp | 62 -- ryoanji/test/nbody/traversal_cpu.cpp | 4 - scripts/CMakeLists.txt | 6 + scripts/plot_power.py | 117 ++++ scripts/spack/package.py | 54 -- sph/include/sph/CMakeLists.txt | 2 +- sph/include/sph/groups.cu | 52 -- sph/include/sph/groups.hpp | 41 ++ sph/include/sph/hydro_std/iad_gpu.cu | 1 - .../sph/hydro_std/momentum_energy_gpu.cu | 12 +- sph/include/sph/hydro_ve/av_switches_gpu.cu | 1 - .../sph/hydro_ve/iad_divv_curlv_gpu.cu | 1 - .../sph/hydro_ve/momentum_energy_gpu.cu | 11 +- sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu | 1 - sph/include/sph/hydro_ve/xmass_gpu.cu | 9 +- sph/include/sph/particles_data.hpp | 1 - sph/include/sph/particles_data_gpu.cuh | 1 - sph/include/sph/sph_gpu.hpp | 7 +- sph/include/sph/util/device_math.cuh | 40 -- sph/include/sph/util/pinned_allocator.cuh | 178 ------ 78 files changed, 1149 insertions(+), 1913 deletions(-) create mode 100644 domain/include/cstone/cuda/cub.hpp create mode 100644 domain/include/cstone/cuda/cuda_runtime.hpp create mode 100644 domain/include/cstone/traversal/groups_gpu.cu rename domain/include/cstone/traversal/{groups.cuh => groups_gpu.cuh} (71%) create mode 100644 domain/include/cstone/traversal/groups_gpu.h delete mode 100644 ryoanji/src/ryoanji/interface/treebuilder.cu create mode 100644 ryoanji/src/ryoanji/nbody/CMakeLists.txt create mode 100644 ryoanji/src/ryoanji/nbody/upsweep_gpu.cu create mode 100644 ryoanji/src/ryoanji/nbody/upsweep_gpu.h delete mode 100644 ryoanji/src/ryoanji/nbody/upwardpass.cuh create mode 100644 scripts/CMakeLists.txt create mode 100755 scripts/plot_power.py delete mode 100644 scripts/spack/package.py delete mode 100644 sph/include/sph/groups.cu delete mode 100644 sph/include/sph/util/device_math.cuh delete mode 100644 sph/include/sph/util/pinned_allocator.cuh diff --git a/.gitlab/Dockerfile_hip_2 b/.gitlab/Dockerfile_hip_2 index cd3125949..99e205b50 100644 --- a/.gitlab/Dockerfile_hip_2 +++ b/.gitlab/Dockerfile_hip_2 @@ -93,8 +93,6 @@ RUN echo \ && unset GCC_X86_64 \ && cd /usr/local/games/SPH-EXA.git \ && ls -la . \ - && parallel -j+0 hipify-perl -inplace ::: \ - `find . -name '*.h' -o -name '*.cuh' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cu'` \ && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./domain/cmake/setup_GTest.cmake \ && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./cmake/setup_GTest.cmake \ && sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./ryoanji/cmake/setup_GTest.cmake \ @@ -106,9 +104,11 @@ RUN echo \ MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \ cmake -S SPH-EXA.git -B build \ -DCMAKE_CXX_COMPILER=mpicxx -DCMAKE_C_COMPILER=mpicc \ - -DBUILD_TESTING=OFF -DBUILD_ANALYTICAL=OFF -DGPU_DIRECT=OFF \ - -DCMAKE_BUILD_TYPE=Debug -DCMAKE_HIP_ARCHITECTURES=gfx90a \ - && echo "## cmake --build + --install :" \ + -DCMAKE_HIP_COMPILER=mpicxx -DCMAKE_HIP_COMPILER_FORCED=ON \ + -DCMAKE_HIP_ARCHITECTURES=gfx90a \ + -DCSTONE_WITH_GPU_AWARE_MPI=ON \ + -DBUILD_TESTING=OFF \ + && echo "## cmake --build :" \ && MPICH_CXX=/opt/rocm-5.2.3/llvm/bin/clang++ \ MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \ cmake --build build -j `grep processor /proc/cpuinfo | wc -l` -t sphexa-hip diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bbefe5a7..689003cc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,8 @@ list(APPEND CMAKE_MODULE_PATH option(BUILD_TESTING "build unit and integration tests" ON) option(BUILD_ANALYTICAL "build analytical solution" ON) -option(GPU_DIRECT "Enable CUDA-aware MPI communication" OFF) +option(SPH_EXA_WITH_CUDA "Enable building for NVIDIA GPUs" ON) +option(SPH_EXA_WITH_HIP "Enable building for AMD GPUs" ON) set(CSTONE_DIR ${PROJECT_SOURCE_DIR}/domain/include) set(CSTONE_TEST_DIR ${PROJECT_SOURCE_DIR}/domain/test) @@ -46,18 +47,32 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() -check_language(CUDA) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit) - set(CMAKE_CUDA_STANDARD 17) +if(SPH_EXA_WITH_CUDA) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) + set(CMAKE_CUDA_STANDARD 17) + else() + message(STATUS "No CUDA support") + set(SPH_EXA_WITH_CUDA OFF) + endif() +endif() + +if(SPH_EXA_WITH_HIP) + check_language(HIP) + if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER) + enable_language(HIP) + find_package(hip) + set(CMAKE_HIP_STANDARD 17) + else() + message(STATUS "No HIP support") + set(SPH_EXA_WITH_HIP OFF) + endif() endif() -check_language(HIP) -if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER) - enable_language(HIP) - find_package(hip) - set(CMAKE_HIP_STANDARD 17) +if(SPH_EXA_WITH_HIP AND SPH_EXA_WITH_CUDA) + message(FATAL_ERROR "CUDA and HIP cannot both be turned on") endif() option(SPH_EXA_WITH_H5PART "Enable HDF5 IO using the H5Part library" ON) @@ -81,15 +96,6 @@ elseif(INSITU STREQUAL "Ascent") find_package(Ascent REQUIRED PATHS "$ENV{EBROOTASCENT}/lib/cmake/ascent") endif() -option(SPH_EXA_WITH_FFTW "Enable use of the FFTW library" ON) -if (SPH_EXA_WITH_FFTW) - find_package(FFTW) - if (NOT FFTW_FOUND) - message(STATUS "No FFTW support") - set(SPH_EXA_WITH_FFTW OFF) - endif () -endif () - option(SPH_EXA_WITH_GRACKLE "Enable radiative cooling with GRACKLE" OFF) if (SPH_EXA_WITH_GRACKLE) add_subdirectory(extern/grackle) @@ -97,6 +103,7 @@ endif() add_subdirectory(domain) add_subdirectory(ryoanji) +add_subdirectory(scripts) add_subdirectory(sph) add_subdirectory(physics) add_subdirectory(main) diff --git a/README.md b/README.md index 0973f02eb..e4c46986a 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,12 @@ cd build CC=cc CXX=CC cmake -DCMAKE_CUDA_ARCHITECTURES=60 -S ``` -Module and CMake configuration on LUMI +Module and CMake configuration on LUMI (ROCm 6.2.2) ```shell -module load CrayEnv buildtools/22.12 craype-accel-amd-gfx90a rocm cray-hdf5-parallel -cd ; hipify-perl -inplace `find -name *.cu -o -name *.cuh` && find -name *.prehip -delete -cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCMAKE_HIP_COMPILER_FORCED=ON -DGPU_DIRECT= -S +module swap PrgEnv-cray PrgEnv-gnu +module load CrayEnv buildtools craype-accel-amd-gfx90a rocm cray-hdf5-parallel +cd ; +cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCSTONE_WITH_GPU_AWARE_MPI=ON -S ``` Build everything: ```make -j``` diff --git a/README_insitu.md b/README_insitu.md index 256d791e5..1ee78b96a 100644 --- a/README_insitu.md +++ b/README_insitu.md @@ -92,7 +92,6 @@ module load ParaView/5.10.1-CrayGNU-21.09-EGL -DBUILD_ANALYTICAL:BOOL=OFF \ -DBUILD_TESTING:BOOL=OFF \ -DSPH_EXA_WITH_H5PART:BOOL=OFF \ - -DSPH_EXA_WITH_FFTW:BOOL=OFF \ -DCMAKE_CXX_COMPILER=CC \ -DINSITU=Ascent diff --git a/domain/CMakeLists.txt b/domain/CMakeLists.txt index d2e468131..8a6aa808e 100644 --- a/domain/CMakeLists.txt +++ b/domain/CMakeLists.txt @@ -26,18 +26,41 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() -check_language(CUDA) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit) - set(CMAKE_CUDA_STANDARD 17) +option(CSTONE_WITH_CUDA "Enable building for NVIDIA GPUs" ON) +option(CSTONE_WITH_HIP "Enable building for AMD GPUs" ON) +option(CSTONE_WITH_GPU_AWARE_MPI "Enable CUDA-aware MPI communication" OFF) + +if(GPU_DIRECT) + message(WARNING "Option GPU_DIRECT is deprecated and will be removed. Use -DCSTONE_WITH_GPU_AWARE_MPI=ON instead.") + set(CSTONE_WITH_GPU_AWARE_MPI ON) +endif() + +if(CSTONE_WITH_CUDA) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) + set(CMAKE_CUDA_STANDARD 17) + else() + message(STATUS "No CUDA support") + set(CSTONE_WITH_CUDA OFF) + endif() +endif() + +if(CSTONE_WITH_HIP) + check_language(HIP) + if(CMAKE_HIP_COMPILER) + enable_language(HIP) + find_package(hip) + set(CMAKE_HIP_STANDARD 17) + else() + message(STATUS "No HIP support") + set(CSTONE_WITH_HIP OFF) + endif() endif() -check_language(HIP) -if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER) - enable_language(HIP) - find_package(hip) - set(CMAKE_HIP_STANDARD 17) +if(CSTONE_WITH_HIP AND CSTONE_WITH_CUDA) + message(FATAL_ERROR "CUDA and HIP cannot both be turned on") endif() add_subdirectory(include) diff --git a/domain/README.md b/domain/README.md index 8b886081e..8a18f03e3 100644 --- a/domain/README.md +++ b/domain/README.md @@ -39,7 +39,7 @@ CUDA version: 11.6 or later, HIP version 5.2 or later. Example CMake invocation: ```shell -CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60,80,90 -DGPU_DIRECT= -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx +CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60;80;90 -DCSTONE_WITH_GPU_AWARE_MPI= -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx ``` GPU-direct (RDMA) MPI communication can be turned on or off by supplying `-D GPU_DIRECT=ON`. Default is `OFF`. diff --git a/domain/include/cstone/CMakeLists.txt b/domain/include/cstone/CMakeLists.txt index 24066e4b4..f5a58e76d 100644 --- a/domain/include/cstone/CMakeLists.txt +++ b/domain/include/cstone/CMakeLists.txt @@ -14,4 +14,8 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) $ $ $) + + if (CSTONE_WITH_GPU_AWARE_MPI) + target_compile_definitions(cstone_gpu INTERFACE CSTONE_HAVE_GPU_AWARE_MPI) + endif() endif () \ No newline at end of file diff --git a/domain/include/cstone/cuda/cub.hpp b/domain/include/cstone/cuda/cub.hpp new file mode 100644 index 000000000..870eeb023 --- /dev/null +++ b/domain/include/cstone/cuda/cub.hpp @@ -0,0 +1,14 @@ + +#pragma once + +#ifdef __HIPCC__ + +#include + +namespace cub = hipcub; + +#else + +#include + +#endif diff --git a/domain/include/cstone/cuda/cuda_runtime.hpp b/domain/include/cstone/cuda/cuda_runtime.hpp new file mode 100644 index 000000000..6a33bcde6 --- /dev/null +++ b/domain/include/cstone/cuda/cuda_runtime.hpp @@ -0,0 +1,61 @@ +/*! @file + * @brief CUDA/HIP runtime API compatiblity wrapper + * + * @author Sebastian Keller + */ + +#pragma once + +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + +#include + +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventDestroy hipEventDestroy +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost hipMallocHost +#define cudaMallocManaged hipMallocManaged +#define cudaMemAttachGlobal hipMemAttachGlobal +#define cudaMemcpy hipMemcpy +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyFromSymbol hipMemcpyFromSymbol +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyToSymbol hipMemcpyToSymbol +#define cudaMemoryTypeDevice hipMemoryTypeDevice +#define cudaMemoryTypeManaged hipMemoryTypeManaged +#define cudaMemset hipMemset +#define cudaPointerAttributes hipPointerAttribute_t +#define cudaPointerGetAttributes hipPointerGetAttributes +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess + +#define GPU_SYMBOL HIP_SYMBOL + +#else + +#include + +#define GPU_SYMBOL(x) x + +#endif diff --git a/domain/include/cstone/cuda/cuda_utils.cuh b/domain/include/cstone/cuda/cuda_utils.cuh index 9ff7809d9..c03042b03 100644 --- a/domain/include/cstone/cuda/cuda_utils.cuh +++ b/domain/include/cstone/cuda/cuda_utils.cuh @@ -7,7 +7,8 @@ #pragma once #include -#include +#include +#include "cuda_runtime.hpp" #include "device_vector.h" #include "cuda_stubs.h" diff --git a/domain/include/cstone/cuda/device_vector.cu b/domain/include/cstone/cuda/device_vector.cu index 38082d1a9..850d6ef9b 100644 --- a/domain/include/cstone/cuda/device_vector.cu +++ b/domain/include/cstone/cuda/device_vector.cu @@ -8,6 +8,8 @@ #include #include +#include "cstone/cuda/cuda_runtime.hpp" +#include "cstone/cuda/errorcheck.cuh" #include "cstone/util/noinit_thrust.cuh" #include "device_vector.h" @@ -83,7 +85,7 @@ DeviceVector::DeviceVector(const T* first, const T* last) { auto size = last - first; impl_->resize(size); - cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice); + checkGpuErrors(cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice)); } template @@ -174,6 +176,7 @@ template class DeviceVector>; template class DeviceVector>; template class DeviceVector>; template class DeviceVector>; +template class DeviceVector>; template class DeviceVector>; template class DeviceVector>; template class DeviceVector>; diff --git a/domain/include/cstone/cuda/errorcheck.cuh b/domain/include/cstone/cuda/errorcheck.cuh index cd652942f..9784668ee 100644 --- a/domain/include/cstone/cuda/errorcheck.cuh +++ b/domain/include/cstone/cuda/errorcheck.cuh @@ -25,7 +25,7 @@ #pragma once #include -#include +#include "cuda_runtime.hpp" inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName) { diff --git a/domain/include/cstone/cuda/gpu_config.cuh b/domain/include/cstone/cuda/gpu_config.cuh index e7fd19343..f2b0713d0 100644 --- a/domain/include/cstone/cuda/gpu_config.cuh +++ b/domain/include/cstone/cuda/gpu_config.cuh @@ -32,8 +32,7 @@ #pragma once #include -#include - +#include "cstone/cuda/cuda_runtime.hpp" #include "cstone/cuda/errorcheck.cuh" namespace cstone diff --git a/domain/include/cstone/focus/rebalance_gpu.cu b/domain/include/cstone/focus/rebalance_gpu.cu index 972d074e0..d1e6587a1 100644 --- a/domain/include/cstone/focus/rebalance_gpu.cu +++ b/domain/include/cstone/focus/rebalance_gpu.cu @@ -28,8 +28,7 @@ * @author Sebastian Keller */ -#include - +#include "cstone/cuda/cub.hpp" #include "cstone/cuda/errorcheck.cuh" #include "cstone/focus/rebalance.hpp" #include "cstone/focus/rebalance_gpu.h" @@ -159,7 +158,7 @@ bool protectAncestorsGpu(const KeyType* prefixes, protectAncestorsKernel<<>>(prefixes, parents, nodeOps, numNodes); int numNodesModify; - checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, nodeOpSum, sizeof(int))); + checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, GPU_SYMBOL(nodeOpSum), sizeof(int))); return numNodesModify == 0; } @@ -197,7 +196,7 @@ ResolutionStatus enforceKeysGpu(const KeyType* forcedKeys, } int status; - checkGpuErrors(cudaMemcpyFromSymbol(&status, enforceKeyStatus_device, sizeof(ResolutionStatus))); + checkGpuErrors(cudaMemcpyFromSymbol(&status, GPU_SYMBOL(enforceKeyStatus_device), sizeof(ResolutionStatus))); return static_cast(status); } @@ -215,4 +214,4 @@ template ResolutionStatus enforceKeysGpu(const uint64_t* forcedKeys, const TreeNodeIndex* parents, TreeNodeIndex* nodeOps); -} // namespace cstone \ No newline at end of file +} // namespace cstone diff --git a/domain/include/cstone/focus/source_center_gpu.h b/domain/include/cstone/focus/source_center_gpu.h index dc573f6ad..c69e4a4e9 100644 --- a/domain/include/cstone/focus/source_center_gpu.h +++ b/domain/include/cstone/focus/source_center_gpu.h @@ -30,6 +30,7 @@ #pragma once +#include "cstone/focus/source_center.hpp" #include "cstone/tree/definitions.h" namespace cstone diff --git a/domain/include/cstone/halos/gather_halos_gpu.cu b/domain/include/cstone/halos/gather_halos_gpu.cu index 1ffb74378..f646efda0 100644 --- a/domain/include/cstone/halos/gather_halos_gpu.cu +++ b/domain/include/cstone/halos/gather_halos_gpu.cu @@ -30,6 +30,7 @@ * @author Sebastian Keller */ +#include "cstone/cuda/cuda_runtime.hpp" #include "cstone/primitives/math.hpp" #include "cstone/primitives/stl.hpp" #include "cstone/util/array.hpp" diff --git a/domain/include/cstone/primitives/mpi_cuda.cuh b/domain/include/cstone/primitives/mpi_cuda.cuh index 8e44f9067..01a647caf 100644 --- a/domain/include/cstone/primitives/mpi_cuda.cuh +++ b/domain/include/cstone/primitives/mpi_cuda.cuh @@ -32,13 +32,12 @@ #pragma once #include -#include #include "cstone/primitives/mpi_wrappers.hpp" #include "cstone/util/noinit_alloc.hpp" #include "cstone/cuda/errorcheck.cuh" -#ifdef USE_GPU_DIRECT +#ifdef CSTONE_HAVE_GPU_AWARE_MPI constexpr inline bool useGpuDirect = true; #else constexpr inline bool useGpuDirect = false; diff --git a/domain/include/cstone/primitives/primitives_gpu.cu b/domain/include/cstone/primitives/primitives_gpu.cu index 5a13a6638..1d3181cfa 100644 --- a/domain/include/cstone/primitives/primitives_gpu.cu +++ b/domain/include/cstone/primitives/primitives_gpu.cu @@ -29,8 +29,6 @@ * @author Sebastian Keller */ -#include - #include #include #include @@ -39,6 +37,7 @@ #include #include +#include "cstone/cuda/cub.hpp" #include "cstone/cuda/errorcheck.cuh" #include "cstone/primitives/math.hpp" #include "cstone/util/array.hpp" @@ -297,7 +296,7 @@ void sortByKeyGpu(KeyType* first, KeyType* last, ValueType* values, KeyType* key // Determine temporary device storage requirements void* d_tempStorage = nullptr; size_t tempStorageBytes = 0; - cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements); + checkGpuErrors(cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements)); // Allocate temporary storage checkGpuErrors(cudaMalloc(&d_tempStorage, tempStorageBytes)); diff --git a/domain/include/cstone/primitives/warpscan.cuh b/domain/include/cstone/primitives/warpscan.cuh index bf269e70c..66be99028 100644 --- a/domain/include/cstone/primitives/warpscan.cuh +++ b/domain/include/cstone/primitives/warpscan.cuh @@ -205,8 +205,6 @@ __device__ __forceinline__ GpuConfig::ThreadMask lanemask_le() */ __device__ __forceinline__ int inclusiveSegscan(int value, int distance) { - // distance should be less-equal the lane index - assert(distance <= (threadIdx.x & (GpuConfig::warpSize - 1))); #pragma unroll for (int i = 1; i < GpuConfig::warpSize; i *= 2) { @@ -242,7 +240,6 @@ __device__ __forceinline__ int inclusiveSegscanInt(const int packedValue, const // distance = number of preceding lanes to include in scanned value // e.g. if distance = 0, then no preceding lane value will be added to scannedValue int distance = countLeadingZeros(flags & lanemask_le()) + laneIdx - (GpuConfig::warpSize - 1); - assert(distance >= 0); int scannedValue = inclusiveSegscan(value, imin(distance, laneIdx)); // the lowest lane index for which packedValue was negative, warpSize if all were positive @@ -299,4 +296,13 @@ __device__ __forceinline__ int spreadSeg8(int val) return shflSync(val, laneIdx >> 3) + (laneIdx & 7); } -} // namespace cstone \ No newline at end of file +__device__ __forceinline__ float atomicMinFloat(float* addr, float value) +{ + float old; + old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + + return old; +} + +} // namespace cstone diff --git a/domain/include/cstone/traversal/CMakeLists.txt b/domain/include/cstone/traversal/CMakeLists.txt index 55a184101..d9aa0a084 100644 --- a/domain/include/cstone/traversal/CMakeLists.txt +++ b/domain/include/cstone/traversal/CMakeLists.txt @@ -1,8 +1,8 @@ if(CMAKE_HIP_COMPILER) - set_source_files_properties(collisions_gpu.cu PROPERTIES LANGUAGE HIP) + set_source_files_properties(collisions_gpu.cu groups_gpu.cu PROPERTIES LANGUAGE HIP) endif() if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) - add_library(traversal_obj OBJECT collisions_gpu.cu) + add_library(traversal_obj OBJECT collisions_gpu.cu groups_gpu.cu) target_include_directories(traversal_obj PRIVATE ${PROJECT_SOURCE_DIR}/include) -endif() \ No newline at end of file +endif() diff --git a/domain/include/cstone/traversal/find_neighbors.cuh b/domain/include/cstone/traversal/find_neighbors.cuh index c7899197a..8c0ae7c3f 100644 --- a/domain/include/cstone/traversal/find_neighbors.cuh +++ b/domain/include/cstone/traversal/find_neighbors.cuh @@ -1,8 +1,7 @@ /* * MIT License * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/domain/include/cstone/traversal/groups_gpu.cu b/domain/include/cstone/traversal/groups_gpu.cu new file mode 100644 index 000000000..c0cf61148 --- /dev/null +++ b/domain/include/cstone/traversal/groups_gpu.cu @@ -0,0 +1,161 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*! @file + * @brief Particle target grouping + * + * @author Sebastian Keller + */ + +#include + +#include "cstone/cuda/device_vector.h" +#include "cstone/primitives/math.hpp" +#include "cstone/traversal/groups_gpu.cuh" + +namespace cstone +{ + +//! @brief simple-fixed width group targets +__global__ static void +fixedGroupsKernel(LocalIndex first, LocalIndex last, unsigned groupSize, LocalIndex* groups, unsigned numGroups) +{ + LocalIndex tid = blockIdx.x * blockDim.x + threadIdx.x; + + LocalIndex groupIdx = tid / groupSize; + if (groupIdx >= numGroups) { return; } + + if (tid == groupIdx * groupSize) + { + LocalIndex scan = first + groupIdx * groupSize; + groups[groupIdx] = scan; + if (groupIdx + 1 == numGroups) { groups[numGroups] = last; } + } +} + +void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData& groups) +{ + LocalIndex numBodies = last - first; + LocalIndex numGroups = iceil(numBodies, groupSize); + groups.data.resize(numGroups + 1); + fixedGroupsKernel<<>>(first, last, groupSize, rawPtr(groups.data), numGroups); + + groups.firstBody = first; + groups.lastBody = last; + groups.numGroups = numGroups; + groups.groupStart = rawPtr(groups.data); + groups.groupEnd = rawPtr(groups.data) + 1; +} + +//! @brief convenience wrapper for groupSplitsKernel +template +void computeGroupSplitsImpl( + LocalIndex first, + LocalIndex last, + const Tc* x, + const Tc* y, + const Tc* z, + const T* h, + const KeyType* leaves, + TreeNodeIndex numLeaves, + const LocalIndex* layout, + const Box box, + float tolFactor, + DeviceVector>& splitMasks, + DeviceVector& numSplitsPerGroup, + DeviceVector& groups) +{ + LocalIndex numParticles = last - first; + LocalIndex numFixedGroups = iceil(numParticles, groupSize); + unsigned numThreads = 256; + unsigned gridSize = numFixedGroups * GpuConfig::warpSize; + + splitMasks.resize(numFixedGroups); + + numSplitsPerGroup.reserve(numFixedGroups * 1.1); + numSplitsPerGroup.resize(numFixedGroups); + + groupSplitsKernel<<>>( + first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor, rawPtr(splitMasks), + rawPtr(numSplitsPerGroup), numFixedGroups); + + groups.reserve(numFixedGroups * 1.1); + groups.resize(numFixedGroups + 1); + exclusiveScanGpu(rawPtr(numSplitsPerGroup), rawPtr(numSplitsPerGroup) + numFixedGroups + 1, rawPtr(groups)); + LocalIndex newNumGroups; + memcpyD2H(rawPtr(groups) + groups.size() - 1, 1, &newNumGroups); + + auto& newGroupSizes = numSplitsPerGroup; + newGroupSizes.resize(newNumGroups); + + makeSplitsKernel<<>>(rawPtr(splitMasks), rawPtr(groups), numFixedGroups, + rawPtr(newGroupSizes)); + + groups.resize(newNumGroups + 1); + exclusiveScanGpu(rawPtr(newGroupSizes), rawPtr(newGroupSizes) + newNumGroups + 1, rawPtr(groups), first); + memcpyH2D(&last, 1, rawPtr(groups) + groups.size() - 1); +} + +template +void computeGroupSplits(LocalIndex first, + LocalIndex last, + const Tc* x, + const Tc* y, + const Tc* z, + const T* h, + const KeyType* leaves, + TreeNodeIndex numLeaves, + const LocalIndex* layout, + const Box box, + unsigned groupSize, + float tolFactor, + DeviceVector& numSplitsPerGroup, + DeviceVector& groups) +{ + if (groupSize == GpuConfig::warpSize) + { + DeviceVector> splitMasks; + computeGroupSplitsImpl(first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor, + splitMasks, numSplitsPerGroup, groups); + } + else if (groupSize == 2 * GpuConfig::warpSize) + { + DeviceVector> splitMasks; + computeGroupSplitsImpl<2 * GpuConfig::warpSize>(first, last, x, y, z, h, leaves, numLeaves, layout, box, + tolFactor, splitMasks, numSplitsPerGroup, groups); + } + else { throw std::runtime_error("Unsupported spatial group size\n"); } +} + +#define COMPUTE_GROUP_SPLITS(Tc, T, KeyType) \ + template void computeGroupSplits(LocalIndex first, LocalIndex last, const Tc* x, const Tc* y, const Tc* z, \ + const T* h, const KeyType* leaves, TreeNodeIndex numLeaves, \ + const LocalIndex* layout, const Box box, unsigned groupSize, float tolFactor, \ + DeviceVector& numSplitsPerGroup, DeviceVector& groups); + +COMPUTE_GROUP_SPLITS(double, double, uint64_t); +COMPUTE_GROUP_SPLITS(double, float, uint64_t); +COMPUTE_GROUP_SPLITS(float, float, uint64_t); + +} // namespace cstone diff --git a/domain/include/cstone/traversal/groups.cuh b/domain/include/cstone/traversal/groups_gpu.cuh similarity index 71% rename from domain/include/cstone/traversal/groups.cuh rename to domain/include/cstone/traversal/groups_gpu.cuh index 319af4710..4dc6c85b7 100644 --- a/domain/include/cstone/traversal/groups.cuh +++ b/domain/include/cstone/traversal/groups_gpu.cuh @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2023 CSCS, ETH Zurich, University of Basel, University of Zurich + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,56 +23,24 @@ */ /*! @file - * @brief Particle target grouping + * @brief Particle target grouping. * * @author Sebastian Keller + * + * Separate header to allow inclusion in library as well as in unit testing of device-side functions */ -#pragma once - #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/cuda/device_vector.h" #include "cstone/cuda/gpu_config.cuh" -#include "cstone/primitives/math.hpp" #include "cstone/primitives/primitives_gpu.h" #include "cstone/primitives/warpscan.cuh" #include "cstone/sfc/sfc.hpp" -#include "cstone/traversal/groups.hpp" +#include "cstone/traversal/groups_gpu.h" +#include "cstone/util/array.hpp" namespace cstone { -//! @brief simple-fixed width group targets -__global__ static void -fixedGroupsKernel(LocalIndex first, LocalIndex last, unsigned groupSize, LocalIndex* groups, unsigned numGroups) -{ - LocalIndex tid = blockIdx.x * blockDim.x + threadIdx.x; - - LocalIndex groupIdx = tid / groupSize; - if (groupIdx >= numGroups) { return; } - - if (tid == groupIdx * groupSize) - { - LocalIndex scan = first + groupIdx * groupSize; - groups[groupIdx] = scan; - if (groupIdx + 1 == numGroups) { groups[numGroups] = last; } - } -} - -static void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData& groups) -{ - LocalIndex numBodies = last - first; - LocalIndex numGroups = iceil(numBodies, groupSize); - groups.data.resize(numGroups + 1); - fixedGroupsKernel<<>>(first, last, groupSize, rawPtr(groups.data), numGroups); - - groups.firstBody = first; - groups.lastBody = last; - groups.numGroups = numGroups; - groups.groupStart = rawPtr(groups.data); - groups.groupEnd = rawPtr(groups.data) + 1; -} - /*! @brief Computes splitting patterns of target particle groups * * @tparam T float or double @@ -263,52 +231,4 @@ __global__ void groupSplitsKernel(LocalIndex first, } } -//! @brief convenience wrapper for groupSplitsKernel -template -void computeGroupSplits(LocalIndex first, - LocalIndex last, - const Tc* x, - const Tc* y, - const Tc* z, - const T* h, - const KeyType* leaves, - TreeNodeIndex numLeaves, - const LocalIndex* layout, - const Box box, - float tolFactor, - DeviceVector>& splitMasks, - DeviceVector& numSplitsPerGroup, - DeviceVector& groups) -{ - LocalIndex numParticles = last - first; - LocalIndex numFixedGroups = iceil(numParticles, groupSize); - unsigned numThreads = 256; - unsigned gridSize = numFixedGroups * GpuConfig::warpSize; - - splitMasks.resize(numFixedGroups); - - numSplitsPerGroup.reserve(numFixedGroups * 1.1); - numSplitsPerGroup.resize(numFixedGroups); - - groupSplitsKernel<<>>( - first, last, x, y, z, h, leaves, numLeaves, layout, box, tolFactor, rawPtr(splitMasks), - rawPtr(numSplitsPerGroup), numFixedGroups); - - groups.reserve(numFixedGroups * 1.1); - groups.resize(numFixedGroups + 1); - exclusiveScanGpu(rawPtr(numSplitsPerGroup), rawPtr(numSplitsPerGroup) + numFixedGroups + 1, rawPtr(groups)); - LocalIndex newNumGroups; - memcpyD2H(rawPtr(groups) + groups.size() - 1, 1, &newNumGroups); - - auto& newGroupSizes = numSplitsPerGroup; - newGroupSizes.resize(newNumGroups); - - makeSplitsKernel<<>>(rawPtr(splitMasks), rawPtr(groups), numFixedGroups, - rawPtr(newGroupSizes)); - - groups.resize(newNumGroups + 1); - exclusiveScanGpu(rawPtr(newGroupSizes), rawPtr(newGroupSizes) + newNumGroups + 1, rawPtr(groups), first); - memcpyH2D(&last, 1, rawPtr(groups) + groups.size() - 1); -} - } // namespace cstone diff --git a/domain/include/cstone/traversal/groups_gpu.h b/domain/include/cstone/traversal/groups_gpu.h new file mode 100644 index 000000000..be50bd61c --- /dev/null +++ b/domain/include/cstone/traversal/groups_gpu.h @@ -0,0 +1,90 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*! @file + * @brief Spatial target particle grouping + * + * @author Sebastian Keller + */ + +#pragma once + +#include "cstone/cuda/device_vector.h" +#include "cstone/sfc/box.hpp" +#include "cstone/traversal/groups.hpp" + +namespace cstone +{ + +/*! @brief set up fixed-size particle groups + * @param[in] first first local particle index + * @param[in] last last local particle index + * @param[in] groupSize number of particles per group + * @param[out] groups groups with fixed size @p groupSize + */ +void computeFixedGroups(LocalIndex first, LocalIndex last, unsigned groupSize, GroupData& groups); + +/*!* @brief Compute groups of particles with a maximum size and distance between consecutive particles limited + * + * @tparam groupSize integer + * @tparam Tc float or double + * @tparam T + * @tparam KeyType + * @param[in] first index of first particle in @p x,y,z,h buffers assigned to local domain + * @param[in] last index of ilast particle in @p x,y,z,h buffers assigned to local domain + * @param[in] x x coordinates + * @param[in] y y coordinates + * @param[in] z z coordinates + * @param[in] h smoothin lengths + * @param[in] leaves cornerstone leaf array, size is @numLeaves + 1 + * @param[in] numLeaves number of leaves in @p leaves + * @param[in] layout element i stores the particle index of the first particle of leaf cell i + * @param[in] box global coordinate bounding box + * @param[in] groupSize maximum number of particles per group + * @param[in] tolFactor tolerance factor for maximum distance between consecutive particles in group + * @param[-] splitMasks temporary scratch usage + * @param[-] numSplitsPerGroup temporary scratch usage + * @param[out] groups particle groups stored as indices into coordinate arrays + * + * This function first creates groups of fixed size @param groupSize. The resulting groups will be split into smaller + * groups until no distance between consecutive particles is bigger than @p tolFactor times edge length of the smallest + * leaf cell of any particle in the group. Edge length is computed as the cubic root of the cell volume. + */ +template +extern void computeGroupSplits(LocalIndex first, + LocalIndex last, + const Tc* x, + const Tc* y, + const Tc* z, + const T* h, + const KeyType* leaves, + TreeNodeIndex numLeaves, + const LocalIndex* layout, + const Box box, + unsigned groupSize, + float tolFactor, + DeviceVector& numSplitsPerGroup, + DeviceVector& groups); + +} // namespace cstone diff --git a/domain/include/cstone/tree/csarray_gpu.cu b/domain/include/cstone/tree/csarray_gpu.cu index a3cb6fdca..47c477ff6 100644 --- a/domain/include/cstone/tree/csarray_gpu.cu +++ b/domain/include/cstone/tree/csarray_gpu.cu @@ -36,6 +36,7 @@ #include #include +#include "cstone/cuda/cuda_runtime.hpp" #include "cstone/cuda/errorcheck.cuh" #include "cstone/primitives/math.hpp" #include "csarray.hpp" diff --git a/domain/test/integration_mpi/CMakeLists.txt b/domain/test/integration_mpi/CMakeLists.txt index 5aae73878..e7382b6c2 100644 --- a/domain/test/integration_mpi/CMakeLists.txt +++ b/domain/test/integration_mpi/CMakeLists.txt @@ -48,9 +48,6 @@ function(addCstoneGpuMpiTest source exename testname ranks) target_compile_definitions(${exename} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) set_target_properties(${exename} PROPERTIES LINKER_LANGUAGE CXX) endif () - if (GPU_DIRECT) - target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT) - endif () endfunction() if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) diff --git a/domain/test/integration_mpi/exchange_halos_gpu.cpp b/domain/test/integration_mpi/exchange_halos_gpu.cpp index f8066394a..0330fd553 100644 --- a/domain/test/integration_mpi/exchange_halos_gpu.cpp +++ b/domain/test/integration_mpi/exchange_halos_gpu.cpp @@ -66,7 +66,7 @@ void gpuDirect(int rank) MPI_Barrier(MPI_COMM_WORLD); } -#ifdef USE_GPU_DIRECT +#ifdef CSTONE_HAVE_GPU_AWARE_MPI TEST(HaloExchange, gpuDirect) #else TEST(HaloExchange, DISABLED_gpuDirect) diff --git a/domain/test/performance/CMakeLists.txt b/domain/test/performance/CMakeLists.txt index e2b4138b9..ed3f47747 100644 --- a/domain/test/performance/CMakeLists.txt +++ b/domain/test/performance/CMakeLists.txt @@ -7,15 +7,15 @@ cstone_add_performance_test(scan.cpp scan_perf) # only scan.cpp provides some coverage beyond the unit tests cstone_add_test(scan_perf EXECUTABLE scan_perf RANKS 1) -if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) +#if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) +if(CMAKE_CUDA_COMPILER) # disabled for HIP # uses the OpenMP implementation of thrust::sort_by_key cstone_add_performance_test(hilbert.cpp hilbert_perf) - target_compile_definitions(hilbert_perf PRIVATE THRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_OMP) if (CMAKE_CUDA_COMPILER) target_link_libraries(hilbert_perf PRIVATE CUDA::cudart) else () - target_compile_definitions(hilbert_perf PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - target_link_libraries(hilbert_perf PRIVATE hip::host) + #set_source_files_properties(hilbert.cpp PROPERTIES LANGUAGE HIP) + target_link_libraries(hilbert_perf PRIVATE OpenMP::OpenMP_CXX hip::host) endif () endif () diff --git a/domain/test/performance/neighbor_driver.cu b/domain/test/performance/neighbor_driver.cu index 3407bb7a9..0cd0da221 100644 --- a/domain/test/performance/neighbor_driver.cu +++ b/domain/test/performance/neighbor_driver.cu @@ -38,7 +38,6 @@ #include "cstone/cuda/cuda_utils.cuh" #include "cstone/cuda/thrust_util.cuh" #include "cstone/findneighbors.hpp" -#include "cstone/primitives/math.hpp" #include "cstone/traversal/find_neighbors.cuh" diff --git a/domain/test/performance/timing.cuh b/domain/test/performance/timing.cuh index 5f65f670d..bb93fd013 100644 --- a/domain/test/performance/timing.cuh +++ b/domain/test/performance/timing.cuh @@ -32,6 +32,7 @@ #pragma once #include +#include "cstone/cuda/errorcheck.cuh" #ifdef __CUDACC__ @@ -66,21 +67,21 @@ template float timeGpu(F&& f) { hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); + checkGpuErrors(hipEventCreate(&start)); + checkGpuErrors(hipEventCreate(&stop)); - hipEventRecord(start, hipStreamDefault); + checkGpuErrors(hipEventRecord(start, hipStreamDefault)); f(); - hipEventRecord(stop, hipStreamDefault); - hipEventSynchronize(stop); + checkGpuErrors(hipEventRecord(stop, hipStreamDefault)); + checkGpuErrors(hipEventSynchronize(stop)); float t0; - hipEventElapsedTime(&t0, start, stop); + checkGpuErrors(hipEventElapsedTime(&t0, start, stop)); - hipEventDestroy(start); - hipEventDestroy(stop); + checkGpuErrors(hipEventDestroy(start)); + checkGpuErrors(hipEventDestroy(stop)); return t0; } @@ -94,4 +95,4 @@ float timeCpu(F&& f) f(); auto t1 = std::chrono::high_resolution_clock::now(); return std::chrono::duration(t1 - t0).count(); -} \ No newline at end of file +} diff --git a/domain/test/unit_cuda/traversal/groups.cu b/domain/test/unit_cuda/traversal/groups.cu index ac7ba29d9..824d33a60 100644 --- a/domain/test/unit_cuda/traversal/groups.cu +++ b/domain/test/unit_cuda/traversal/groups.cu @@ -1,8 +1,7 @@ /* * MIT License * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,9 +37,10 @@ #include #include -#include "cstone/cuda/device_vector.h" #include "cstone/cuda/thrust_util.cuh" -#include "cstone/traversal/groups.cuh" +#include "cstone/primitives/math.hpp" +#include "cstone/traversal/groups_gpu.cuh" +#include "cstone/traversal/groups_gpu.h" #include "cstone/tree/cs_util.hpp" using namespace cstone; @@ -51,15 +51,13 @@ using SplitType = util::array; TEST(TargetGroups, t0) { - using T = double; - LocalIndex numGroups = 4, groupSize = 8, first = 4, last = 34; + LocalIndex groupSize = 8, first = 4, last = 34; - thrust::device_vector groups(numGroups + 1); - fixedGroupsKernel<<>>(first, last, groupSize, rawPtr(groups), - iceil(last - first, groupSize)); + GroupData groups; + computeFixedGroups(first, last, groupSize, groups); - thrust::host_vector hgroups = groups; - thrust::host_vector ref = std::vector{4, 12, 20, 28, 34}; + std::vector hgroups = toHost(groups.data); + std::vector ref{4, 12, 20, 28, 34}; EXPECT_EQ(hgroups, ref); } @@ -276,12 +274,11 @@ TEST(TargetGroups, groupVolumes) } { - DeviceVector S; DeviceVector temp, groups; float tolFactor = std::sqrt(3.0) / distCrit * 1.01; - computeGroupSplits(first, last, rawPtr(x), rawPtr(y), rawPtr(z), rawPtr(h), rawPtr(d_leaves), - nNodes(leaves), rawPtr(d_layout), box, tolFactor, S, temp, groups); + computeGroupSplits(first, last, rawPtr(x), rawPtr(y), rawPtr(z), rawPtr(h), rawPtr(d_leaves), nNodes(leaves), + rawPtr(d_layout), box, groupSize, tolFactor, temp, groups); std::vector h_groups = toHost(groups); EXPECT_EQ(h_groups.size(), 4); diff --git a/main/src/analytical_solutions/CMakeLists.txt b/main/src/analytical_solutions/CMakeLists.txt index e088a7d97..56370dea3 100644 --- a/main/src/analytical_solutions/CMakeLists.txt +++ b/main/src/analytical_solutions/CMakeLists.txt @@ -1,5 +1,8 @@ +# temporary fix for hip build +if(NOT DEFINED CMAKE_INSTALL_BINDIR) + set(CMAKE_INSTALL_BINDIR "bin") +endif(NOT DEFINED CMAKE_INSTALL_BINDIR) add_subdirectory(sedov_solution) -install(FILES compare_solutions.py TYPE BIN) -install(FILES compare_noh.py TYPE BIN) +install(PROGRAMS compare_noh.py compare_solutions.py DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/main/src/propagator/CMakeLists.txt b/main/src/propagator/CMakeLists.txt index 2a7a359d2..42be9843e 100644 --- a/main/src/propagator/CMakeLists.txt +++ b/main/src/propagator/CMakeLists.txt @@ -13,9 +13,6 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) ${SPH_DIR} ${RYOANJI_DIR} ${MPI_CXX_INCLUDE_PATH}) target_link_libraries(propagator_gpu PRIVATE ${MPI_CXX_LIBRARIES} cstone_gpu ryoanji sph_gpu util OpenMP::OpenMP_CXX) enableGrackle(propagator_gpu) - if (GPU_DIRECT) - target_compile_definitions(propagator_gpu PRIVATE USE_GPU_DIRECT) - endif () endif () if (CMAKE_CUDA_COMPILER) diff --git a/main/src/sphexa/CMakeLists.txt b/main/src/sphexa/CMakeLists.txt index 7617764ea..501243d92 100644 --- a/main/src/sphexa/CMakeLists.txt +++ b/main/src/sphexa/CMakeLists.txt @@ -16,12 +16,6 @@ function(enableInSituViz exename) endif() endfunction() -function(enableGpuDirect exename) - if(GPU_DIRECT) - target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT) - endif() -endfunction() - set(exename sphexa) add_executable(${exename} sphexa.cpp) target_include_directories(${exename} PRIVATE ${SPH_EXA_INCLUDE_DIRS}) @@ -40,7 +34,6 @@ if(CMAKE_CUDA_COMPILER) target_link_libraries(${exename}-cuda PRIVATE ryoanji sph_gpu io sim_init_gpu propagator_gpu observables_gpu OpenMP::OpenMP_CXX ${MPI_CXX_LIBRARIES} CUDA::cudart) enableInSituViz(${exename}-cuda) - enableGpuDirect(${exename}-cuda) enableGrackle(${exename}-cuda) install(TARGETS ${exename}-cuda RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) @@ -52,7 +45,6 @@ elseif(CMAKE_HIP_COMPILER) ${MPI_CXX_LIBRARIES} hip::host) set_target_properties(${exename}-hip PROPERTIES LINKER_LANGUAGE CXX) enableInSituViz(${exename}-hip) - enableGpuDirect(${exename}-hip) enableGrackle(${exename}-hip) install(TARGETS ${exename}-hip RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() diff --git a/main/src/util/pm_reader.hpp b/main/src/util/pm_reader.hpp index 81b1b1cbc..052982811 100644 --- a/main/src/util/pm_reader.hpp +++ b/main/src/util/pm_reader.hpp @@ -1,3 +1,33 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*! @file + * @brief Cray power measurement counter reading + * + * @author Sebastian Keller @@ -33,25 +63,26 @@ class PmReader { } - void addCounters(const std::string& pmRoot, int numRanksPerNode_) + void addCounters(const std::string& pmRoot, int numRanksPerNode) { - numRanksPerNode = numRanksPerNode_; - using CounterDesc = std::tuple>; - std::vector countersToAdd{ - {"node", pmRoot + "/energy", [](int r, int numRanksPerNode) { return r % numRanksPerNode == 0; }}, - {"acc", pmRoot + "/accel" + std::to_string(rank_ % numRanksPerNode) + "_energy", - [](int, int) { return true; }}}; - - for (auto& c : countersToAdd) + numRanksPerNode_ = numRanksPerNode; + // energy per compute node, only the first rank per node reads the node energy counter + { + std::string path = pmRoot + "/energy"; + bool enable = (rank_ % numRanksPerNode == 0) && std::filesystem::exists(path); + pmCounters.emplace_back("node", path, std::vector{}, std::vector{}, enable); + } + // energy per accelerator { - bool enable = std::filesystem::exists(get<1>(c)) && get<2>(c)(rank_, numRanksPerNode); - pmCounters.emplace_back(get<0>(c), get<1>(c), std::vector{}, std::vector{}, enable); + std::string path = pmRoot + "/accel" + std::to_string(rank_ % numRanksPerNode) + "_energy"; + bool enable = std::filesystem::exists(path); + pmCounters.emplace_back("acc", path, std::vector{}, std::vector{}, enable); } } void start() { - numStartCalled++; + numStartCalled_++; readPm(); } @@ -83,15 +114,15 @@ class PmReader ar->addStep(0, pmValues.size(), outFile + ar->suffix()); ar->stepAttribute("numRanks", &numRanks, 1); - ar->stepAttribute("numRanksPerNode", &numRanks, 1); - ar->stepAttribute("numIterations", &numStartCalled, 1); + ar->stepAttribute("numRanksPerNode", &numRanksPerNode_, 1); + ar->stepAttribute("numIterations", &numStartCalled_, 1); ar->writeField(pmName, pmValuesRebased.data(), pmValuesRebased.size()); ar->writeField(pmName + "_timeStamps", pmTimeStampsRebased.data(), pmTimeStampsRebased.size()); ar->closeStep(); pmValues.clear(); pmTimeStamps.clear(); } - numStartCalled = 0; + numStartCalled_ = 0; } private: @@ -107,7 +138,7 @@ class PmReader } } - int rank_, numRanksPerNode{0}, numStartCalled{0}; + int rank_, numRanksPerNode_{0}, numStartCalled_{0}; // name filepath counter reading time-stamp reading enabled std::vector, std::vector, bool>> pmCounters; diff --git a/ryoanji/CMakeLists.txt b/ryoanji/CMakeLists.txt index c16122971..4e17e1782 100644 --- a/ryoanji/CMakeLists.txt +++ b/ryoanji/CMakeLists.txt @@ -17,20 +17,35 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() -check_language(CUDA) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit) - set(CMAKE_CUDA_STANDARD 17) -else() - message(STATUS "No CUDA support") +option(RYOANJI_WITH_CUDA "Enable building for NVIDIA GPUs" ON) +option(RYOANJI_WITH_HIP "Enable building for AMD GPUs" ON) + +if(RYOANJI_WITH_CUDA) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) + set(CMAKE_CUDA_STANDARD 17) + else() + message(STATUS "No CUDA support") + set(RYOANJI_WITH_CUDA OFF) + endif() endif() -check_language(HIP) -if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER) - enable_language(HIP) - find_package(hip) - set(CMAKE_HIP_STANDARD 17) +if(RYOANJI_WITH_HIP) + check_language(HIP) + if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER) + enable_language(HIP) + find_package(hip) + set(CMAKE_HIP_STANDARD 17) + else() + message(STATUS "No HIP support") + set(RYOANJI_WITH_HIP OFF) + endif() +endif() + +if(RYOANJI_WITH_HIP AND RYOANJI_WITH_CUDA) + message(FATAL_ERROR "CUDA and HIP cannot both be turned on") endif() add_subdirectory(src) diff --git a/ryoanji/src/ryoanji/CMakeLists.txt b/ryoanji/src/ryoanji/CMakeLists.txt index ac8e8fa7e..2d51c3d19 100644 --- a/ryoanji/src/ryoanji/CMakeLists.txt +++ b/ryoanji/src/ryoanji/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(interface) +add_subdirectory(nbody) \ No newline at end of file diff --git a/ryoanji/src/ryoanji/interface/CMakeLists.txt b/ryoanji/src/ryoanji/interface/CMakeLists.txt index 9abbd1a8e..fb23d9276 100644 --- a/ryoanji/src/ryoanji/interface/CMakeLists.txt +++ b/ryoanji/src/ryoanji/interface/CMakeLists.txt @@ -1,13 +1,8 @@ if(CMAKE_HIP_COMPILER) - set_source_files_properties(ewald.cu treebuilder.cu multipole_holder.cu PROPERTIES LANGUAGE HIP) + set_source_files_properties(ewald.cu multipole_holder.cu PROPERTIES LANGUAGE HIP) endif() if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) - add_library(cstone_tree treebuilder.cu) - target_include_directories(cstone_tree PUBLIC ${PROJECT_SOURCE_DIR}/src) - target_include_directories(cstone_tree PUBLIC ${CSTONE_DIR}) - target_link_libraries(cstone_tree cstone_gpu) - # workaround for compiling MPI with nvcc on cray systems if (MPI_CXX_INCLUDE_PATH STREQUAL "" AND DEFINED ENV{CRAY_MPICH2_DIR} OR DEFINED ENV{CRAY_MPICH_DIR}) message(STATUS "Applying MPI include workaround on CRAY") @@ -17,5 +12,5 @@ if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) add_library(ryoanji ewald.cu multipole_holder.cu) target_include_directories(ryoanji PUBLIC ${PROJECT_SOURCE_DIR}/src ${CSTONE_DIR} ${MPI_CXX_INCLUDE_PATH}) - target_link_libraries(ryoanji PUBLIC cstone_gpu ${MPI_CXX_LIBRARIES}) + target_link_libraries(ryoanji PUBLIC cstone_gpu ryoanji_kernels ${MPI_CXX_LIBRARIES}) endif() diff --git a/ryoanji/src/ryoanji/interface/ewald.cu b/ryoanji/src/ryoanji/interface/ewald.cu index c384fb224..8f7a55a12 100644 --- a/ryoanji/src/ryoanji/interface/ewald.cu +++ b/ryoanji/src/ryoanji/interface/ewald.cu @@ -4,8 +4,7 @@ * @author Sebastian Keller */ -#include - +#include "cstone/cuda/cub.hpp" #include "cstone/traversal/groups.hpp" #include "ryoanji/nbody/ewald.hpp" @@ -84,7 +83,7 @@ void computeGravityEwaldGpu(const cstone::Vec3& rootCenter, const MType& Mro computeGravityEwaldKernel<<>>(grp, x, y, z, m, G, ugrav, ax, ay, az, ewaldParamsGpu); float totalPotential; - checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalEwaldPotentialGlob, sizeof(float))); + checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalEwaldPotentialGlob), sizeof(float))); checkGpuErrors(cudaFree(ewaldParamsGpu)); *ugravTot += 0.5 * G * totalPotential; diff --git a/ryoanji/src/ryoanji/interface/multipole_holder.cu b/ryoanji/src/ryoanji/interface/multipole_holder.cu index 4e6729258..f2d0ea3f9 100644 --- a/ryoanji/src/ryoanji/interface/multipole_holder.cu +++ b/ryoanji/src/ryoanji/interface/multipole_holder.cu @@ -1,8 +1,8 @@ /* * MIT License * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,10 +34,11 @@ #include "cstone/cuda/cuda_utils.cuh" #include "cstone/cuda/thrust_util.cuh" #include "cstone/primitives/math.hpp" +#include "cstone/traversal/groups_gpu.h" #include "cstone/util/reallocate.hpp" #include "ryoanji/nbody/cartesian_qpole.hpp" #include "ryoanji/nbody/direct.cuh" -#include "ryoanji/nbody/upwardpass.cuh" +#include "ryoanji/nbody/upsweep_gpu.h" #include "ryoanji/nbody/upsweep_cpu.hpp" #include "ryoanji/nbody/traversal.cuh" #include "multipole_holder.cuh" @@ -56,13 +57,11 @@ public: const Th* h, const cstone::FocusedOctree& focusTree, const cstone::LocalIndex* layout, const cstone::Box& box) { - cstone::DeviceVector> S; - auto d_leaves = focusTree.treeLeavesAcc(); float tolFactor = 2.0f; - cstone::computeGroupSplits(first, last, x, y, z, h, d_leaves.data(), - d_leaves.size() - 1, layout, box, tolFactor, S, - traversalStack_, groups_.data); + cstone::computeGroupSplits(first, last, x, y, z, h, d_leaves.data(), d_leaves.size() - 1, layout, box, + TravConfig::targetSize, tolFactor, traversalStack_, groups_.data); + groups_.firstBody = first; groups_.lastBody = last; groups_.numGroups = groups_.data.size() - 1; @@ -75,9 +74,7 @@ public: const cstone::FocusedOctree& focusTree, const cstone::LocalIndex* layout, MType* multipoles) { - constexpr int numThreads = UpsweepConfig::numThreads; - octree_ = focusTree.octreeViewAcc(); - + octree_ = focusTree.octreeViewAcc(); resize(octree_.numLeafNodes); auto globalCenters = focusTree.globalExpansionCenters(); @@ -85,9 +82,8 @@ public: layout_ = layout; centers_ = focusTree.expansionCentersAcc().data(); - computeLeafMultipoles<<<(octree_.numLeafNodes - 1) / numThreads + 1, numThreads>>>( - x, y, z, m, octree_.leafToInternal + octree_.numInternalNodes, octree_.numLeafNodes, layout_, centers_, - rawPtr(multipoles_)); + computeLeafMultipoles(x, y, z, m, octree_.leafToInternal + octree_.numInternalNodes, octree_.numLeafNodes, + layout_, centers_, rawPtr(multipoles_)); std::vector levelRange(cstone::maxTreeLevel{} + 2); memcpyD2H(octree_.levelRange, levelRange.size(), levelRange.data()); @@ -97,11 +93,10 @@ public: for (int level = numLevels - 1; level >= 0; level--) { int numCellsLevel = levelRange[level + 1] - levelRange[level]; - int numBlocks = (numCellsLevel - 1) / numThreads + 1; if (numCellsLevel) { - upsweepMultipoles<<>>(levelRange[level], levelRange[level + 1], - octree_.childOffsets, centers_, rawPtr(multipoles_)); + upsweepMultipoles(levelRange[level], levelRange[level + 1], octree_.childOffsets, centers_, + rawPtr(multipoles_)); } } @@ -122,19 +117,12 @@ public: for (int level = numLevels - 1; level >= 0; level--) { int numCellsLevel = levelRange[level + 1] - levelRange[level]; - int numBlocks = (numCellsLevel - 1) / numThreads + 1; if (numCellsLevel) { - upsweepMultipoles<<>>(levelRange[level], levelRange[level + 1], - octree_.childOffsets, centers_, rawPtr(multipoles_)); + upsweepMultipoles(levelRange[level], levelRange[level + 1], octree_.childOffsets, centers_, + rawPtr(multipoles_)); } } - - if (IsSpherical{}) - { - normalize<<>>(octree_.numNodes, - rawPtr(multipoles_)); - } } float compute(GroupView grp, const Tc* x, const Tc* y, const Tc* z, const Tm* m, const Th* h, Tc G, int numShells, @@ -153,14 +141,14 @@ public: G, numShells, Vec3{box.lx(), box.ly(), box.lz()}, (Ta*)nullptr, ax, ay, az, (int*)rawPtr(traversalStack_)); float totalPotential; - checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalPotentialGlob, sizeof(float))); + checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalPotentialGlob), sizeof(float))); return 0.5f * Tc(G) * totalPotential; } util::array readStats() const { typename BhStats::type stats[BhStats::numStats]; - checkGpuErrors(cudaMemcpyFromSymbol(stats, bhStats, BhStats::numStats * sizeof(BhStats::type))); + checkGpuErrors(cudaMemcpyFromSymbol(stats, GPU_SYMBOL(bhStats), BhStats::numStats * sizeof(BhStats::type))); auto sumP2P = stats[BhStats::sumP2P]; auto maxP2P = stats[BhStats::maxP2P]; @@ -247,14 +235,6 @@ const MType* MultipoleHolder::deviceMultipol return impl_->deviceMultipoles(); } -#define MHOLDER_SPH(Tc, Th, Tm, Ta, Tf, KeyType, MVal) \ - template class MultipoleHolder> - -MHOLDER_SPH(double, double, double, double, double, uint64_t, double); -MHOLDER_SPH(double, double, float, double, double, uint64_t, float); -MHOLDER_SPH(double, float, float, float, double, uint64_t, float); -MHOLDER_SPH(float, float, float, float, float, uint64_t, float); - #define MHOLDER_CART(Tc, Th, Tm, Ta, Tf, KeyType, MVal) \ template class MultipoleHolder> diff --git a/ryoanji/src/ryoanji/interface/treebuilder.cu b/ryoanji/src/ryoanji/interface/treebuilder.cu deleted file mode 100644 index dec98f9f5..000000000 --- a/ryoanji/src/ryoanji/interface/treebuilder.cu +++ /dev/null @@ -1,239 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/*! @file - * @brief Build a tree for Ryoanji with the cornerstone framework - * - * @author Sebastian Keller - */ - -#include - -#include -#include -#include -#include -#include -#include - -#include "cstone/cuda/cuda_utils.cuh" -#include "cstone/cuda/thrust_util.cuh" -#include "cstone/sfc/sfc_gpu.h" -#include "cstone/tree/octree_gpu.h" -#include "cstone/tree/update_gpu.cuh" - -#include "ryoanji/nbody/types.h" -#include "ryoanji/interface/treebuilder.cuh" - -namespace ryoanji -{ - -template -class TreeBuilder::Impl -{ -public: - Impl(); - - Impl(unsigned ncrit); - - template - cstone::TreeNodeIndex update(T* x, T* y, T* z, size_t numBodies, const cstone::Box& box); - - int extract(int2* h_levelRange); - - const LocalIndex* layout() const { return rawPtr(d_layout_); } - - const TreeNodeIndex* childOffsets() const { return rawPtr(octreeGpuData_.childOffsets); } - - const TreeNodeIndex* leafToInternal() const - { - return rawPtr(octreeGpuData_.leafToInternal) + octreeGpuData_.numInternalNodes; - } - - const TreeNodeIndex* internalToLeaf() const { return rawPtr(octreeGpuData_.internalToLeaf); } - - TreeNodeIndex numLeafNodes() const { return octreeGpuData_.numLeafNodes; } - -private: - unsigned bucketSize_; - - thrust::device_vector d_tree_; - thrust::device_vector d_counts_; - - thrust::device_vector tmpTree_; - thrust::device_vector workArray_; - - cstone::OctreeData octreeGpuData_; - thrust::device_vector d_layout_; -}; - -template -TreeBuilder::Impl::Impl() - : bucketSize_(64) -{ -} - -template -TreeBuilder::Impl::Impl(unsigned ncrit) - : bucketSize_(ncrit) -{ -} - -template -template -cstone::TreeNodeIndex TreeBuilder::Impl::update(T* x, T* y, T* z, size_t numBodies, - const cstone::Box& csBox) -{ - thrust::device_vector d_keys(numBodies); - thrust::device_vector d_ordering(numBodies); - thrust::device_vector tmp(numBodies); - - cstone::computeSfcKeysGpu(x, y, z, cstone::sfcKindPointer(rawPtr(d_keys)), numBodies, csBox); - - thrust::sequence(d_ordering.begin(), d_ordering.end(), 0); - thrust::sort_by_key(thrust::device, d_keys.begin(), d_keys.end(), d_ordering.begin()); - - thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), x, tmp.begin()); - thrust::copy(tmp.begin(), tmp.end(), x); - thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), y, tmp.begin()); - thrust::copy(tmp.begin(), tmp.end(), y); - thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), z, tmp.begin()); - thrust::copy(tmp.begin(), tmp.end(), z); - - if (d_tree_.size() == 0) - { - // initial guess on first call. use previous tree as guess on subsequent calls - d_tree_ = std::vector{0, cstone::nodeRange(0)}; - d_counts_ = std::vector{unsigned(numBodies)}; - } - - while (!cstone::updateOctreeGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), bucketSize_, d_tree_, d_counts_, - tmpTree_, workArray_)) - ; - - octreeGpuData_.resize(cstone::nNodes(d_tree_)); - cstone::buildOctreeGpu(rawPtr(d_tree_), octreeGpuData_.data()); - - return octreeGpuData_.numInternalNodes + octreeGpuData_.numLeafNodes; -} - -template -int TreeBuilder::Impl::extract(int2* h_levelRange) -{ - d_layout_.resize(d_counts_.size() + 1); - thrust::copy(d_counts_.begin(), d_counts_.end(), d_layout_.begin()); - thrust::exclusive_scan(d_layout_.data(), d_layout_.data() + d_layout_.size(), d_layout_.data()); - - std::vector cs_levelRange = toHost(octreeGpuData_.levelRange); - - int numLevels = 0; - for (int level = 0; level <= cstone::maxTreeLevel{}; ++level) - { - if (cs_levelRange[level] == cs_levelRange[level + 1]) - { - numLevels = level - 1; - break; - } - - h_levelRange[level].x = cs_levelRange[level]; - h_levelRange[level].y = cs_levelRange[level + 1]; - } - - return numLevels; -} - -template -TreeBuilder::TreeBuilder() - : impl_(new Impl()) -{ -} - -template -TreeBuilder::TreeBuilder(unsigned ncrit) - : impl_(new Impl(ncrit)) -{ -} - -template -TreeBuilder::~TreeBuilder() = default; - -template -template -int TreeBuilder::update(T* x, T* y, T* z, size_t numBodies, const cstone::Box& box) -{ - return impl_->update(x, y, z, numBodies, box); -} - -template -int TreeBuilder::extract(int2* h_levelRange) -{ - return impl_->extract(h_levelRange); -} - -template -const LocalIndex* TreeBuilder::layout() const -{ - return impl_->layout(); -} - -template -const TreeNodeIndex* TreeBuilder::childOffsets() const -{ - return impl_->childOffsets(); -} - -template -const TreeNodeIndex* TreeBuilder::leafToInternal() const -{ - return impl_->leafToInternal(); -} - -template -const TreeNodeIndex* TreeBuilder::internalToLeaf() const -{ - return impl_->internalToLeaf(); -} - -template -TreeNodeIndex TreeBuilder::numLeafNodes() const -{ - return impl_->numLeafNodes(); -} - -template -unsigned TreeBuilder::maxTreeLevel() const -{ - return cstone::maxTreeLevel{}; -} - -template class TreeBuilder; -template class TreeBuilder; - -template int TreeBuilder::update(float*, float*, float*, size_t, const cstone::Box&); -template int TreeBuilder::update(float*, float*, float*, size_t, const cstone::Box&); -template int TreeBuilder::update(double*, double*, double*, size_t, const cstone::Box&); -template int TreeBuilder::update(double*, double*, double*, size_t, const cstone::Box&); - -} // namespace ryoanji diff --git a/ryoanji/src/ryoanji/interface/treebuilder.cuh b/ryoanji/src/ryoanji/interface/treebuilder.cuh index fce91f02c..9042a0a2c 100644 --- a/ryoanji/src/ryoanji/interface/treebuilder.cuh +++ b/ryoanji/src/ryoanji/interface/treebuilder.cuh @@ -31,7 +31,17 @@ #pragma once -#include +#include +#include +#include +#include + +#include "cstone/cuda/cuda_utils.cuh" +#include "cstone/cuda/thrust_util.cuh" +#include "cstone/primitives/primitives_gpu.h" +#include "cstone/sfc/sfc_gpu.h" +#include "cstone/tree/octree_gpu.h" +#include "cstone/tree/update_gpu.cuh" #include "ryoanji/nbody/types.h" @@ -42,12 +52,8 @@ template class TreeBuilder { public: - TreeBuilder(); - - ~TreeBuilder(); - //! @brief initialize with the desired maximum particles per leaf cell - TreeBuilder(unsigned ncrit); + TreeBuilder(unsigned ncrit) : bucketSize_(ncrit) {} /*! @brief construct an octree from body coordinates * @@ -62,29 +68,74 @@ public: * Note: x,y,z arrays will be sorted in SFC order to match be consistent with the cell body offsets of the tree */ template - int update(T* x, T* y, T* z, size_t numBodies, const cstone::Box& box); + int update(T* x, T* y, T* z, size_t numBodies, const cstone::Box& box) + { + thrust::device_vector d_keys(numBodies), d_keys_tmp(numBodies); + thrust::device_vector d_ordering(numBodies), d_values_tmp(numBodies); + thrust::device_vector tmp(numBodies); - /*! @brief extract the octree level range from the previous update call - * - * @param[out] h_levelRange indices of the first node at each subdivison level - * @return the maximum subdivision level in the output tree - */ - int extract(int2* h_levelRange); + cstone::computeSfcKeysGpu(x, y, z, cstone::sfcKindPointer(rawPtr(d_keys)), numBodies, box); + + cstone::sequenceGpu(rawPtr(d_ordering), d_ordering.size(), 0); + cstone::sortByKeyGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), rawPtr(d_ordering), rawPtr(d_keys_tmp), + rawPtr(d_values_tmp)); + + thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), x, tmp.begin()); + thrust::copy(tmp.begin(), tmp.end(), x); + thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), y, tmp.begin()); + thrust::copy(tmp.begin(), tmp.end(), y); + thrust::gather(thrust::device, d_ordering.begin(), d_ordering.end(), z, tmp.begin()); + thrust::copy(tmp.begin(), tmp.end(), z); + + if (d_tree_.size() == 0) + { + // initial guess on first call. use previous tree as guess on subsequent calls + d_tree_ = std::vector{0, cstone::nodeRange(0)}; + d_counts_ = std::vector{unsigned(numBodies)}; + } + + while (!cstone::updateOctreeGpu(rawPtr(d_keys), rawPtr(d_keys) + d_keys.size(), bucketSize_, d_tree_, d_counts_, + tmpTree_, workArray_)) + ; - const LocalIndex* layout() const; - const TreeNodeIndex* childOffsets() const; - const TreeNodeIndex* leafToInternal() const; - const TreeNodeIndex* internalToLeaf() const; + octreeGpuData_.resize(cstone::nNodes(d_tree_)); + cstone::buildOctreeGpu(rawPtr(d_tree_), octreeGpuData_.data()); - TreeNodeIndex numLeafNodes() const; - unsigned maxTreeLevel() const; + d_layout_.resize(d_counts_.size() + 1); + cstone::fillGpu(rawPtr(d_layout_), rawPtr(d_layout_) + 1, LocalIndex(0)); + cstone::inclusiveScanGpu(rawPtr(d_counts_), rawPtr(d_counts_) + d_counts_.size(), rawPtr(d_layout_) + 1); + + levelRange_host_ = toHost(octreeGpuData_.levelRange); + return octreeGpuData_.numInternalNodes + octreeGpuData_.numLeafNodes; + } + + const LocalIndex* layout() const { return rawPtr(d_layout_); } + const KeyType* nodeKeys() const { return rawPtr(octreeGpuData_.prefixes); } + const TreeNodeIndex* childOffsets() const { return rawPtr(octreeGpuData_.childOffsets); } + const TreeNodeIndex* leafToInternal() const + { + return cstone::leafToInternal(octreeGpuData_).data(); + } + const TreeNodeIndex* internalToLeaf() const { return rawPtr(octreeGpuData_.internalToLeaf); } + //! @brief return host-resident octree level cell ranges + const TreeNodeIndex* levelRange() const { return levelRange_host_.data(); } + + TreeNodeIndex numLeafNodes() const { return octreeGpuData_.numLeafNodes; } + unsigned maxTreeLevel() const { return cstone::maxTreeLevel{}; } private: - class Impl; - std::unique_ptr impl_; -}; + unsigned bucketSize_; -extern template class TreeBuilder; -extern template class TreeBuilder; + thrust::device_vector d_tree_; + thrust::device_vector d_counts_; + + thrust::device_vector tmpTree_; + thrust::device_vector workArray_; + + cstone::OctreeData octreeGpuData_; + thrust::device_vector d_layout_; + + std::vector levelRange_host_; +}; } // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/CMakeLists.txt b/ryoanji/src/ryoanji/nbody/CMakeLists.txt new file mode 100644 index 000000000..eedc0c5ec --- /dev/null +++ b/ryoanji/src/ryoanji/nbody/CMakeLists.txt @@ -0,0 +1,8 @@ +if(CMAKE_HIP_COMPILER) + set_source_files_properties(upsweep_gpu.cu PROPERTIES LANGUAGE HIP) +endif() + +if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) + add_library(ryoanji_kernels upsweep_gpu.cu) + target_include_directories(ryoanji_kernels PUBLIC ${PROJECT_SOURCE_DIR}/src ${CSTONE_DIR}) +endif() diff --git a/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp b/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp index ecd9c135a..671e955fa 100644 --- a/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp +++ b/ryoanji/src/ryoanji/nbody/cartesian_qpole.hpp @@ -50,11 +50,6 @@ struct IsCartesian : public stl::integral_constant -struct ExpansionOrder{}.size()> : stl::integral_constant -{ -}; - //! @brief CartesianQuadrupole index names struct Cqi { @@ -256,20 +251,4 @@ HOST_DEVICE_FUN void M2M(int begin, int end, const Vec4& Xout, const Vec4* } } -template{}, int> = 0> -HOST_DEVICE_FUN MType normalize(const MType& multipole) -{ - return multipole; -} - -template -std::ostream& operator<<(std::ostream& os, const CartesianQuadrupole& M) -{ - os << "m: " << M[Cqi::mass] << " tr: " << M[Cqi::trace] << std::endl - << "xx: " << M[Cqi::qxx] << " xy: " << M[Cqi::qxy] << " xz: " << M[Cqi::qxz] << std::endl - << "yy: " << M[Cqi::qyy] << " yz: " << M[Cqi::qyz] << " zz: " << M[Cqi::qzz] << std::endl; - - return os; -} - } // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/ewald.hpp b/ryoanji/src/ryoanji/nbody/ewald.hpp index aa9183d3d..f61d633ef 100644 --- a/ryoanji/src/ryoanji/nbody/ewald.hpp +++ b/ryoanji/src/ryoanji/nbody/ewald.hpp @@ -213,14 +213,6 @@ EwaldParameters ewaldInitParameters(const CartesianQuadrupole& Mro return params; } -template -EwaldParameters ewaldInitParameters(const SphericalMultipole& /*Mroot*/, - const Vec3& /*Mroot_center*/, int /*numReplicaShells*/, double /*L*/, - double /*lCut*/, double /*hCut*/, double /*alpha_scale*/) -{ - throw std::runtime_error("Ewald periodic gravity correction not implemented for spherical multipoles\n"); -} - //! @brief real space Ewald contribution to the potential and acceleration of a single particle @p r template HOST_DEVICE_FUN Vec4 computeEwaldRealSpace(Vec3 r, const EwaldParameters& params) diff --git a/ryoanji/src/ryoanji/nbody/kernel.hpp b/ryoanji/src/ryoanji/nbody/kernel.hpp index d992c888c..eefeef3e7 100644 --- a/ryoanji/src/ryoanji/nbody/kernel.hpp +++ b/ryoanji/src/ryoanji/nbody/kernel.hpp @@ -26,7 +26,7 @@ #pragma once /*! @file - * @brief EXA-FMM multipole kernels + * @brief Particle-2-particle (P2P) kernel * * @author Rio Yokota */ @@ -54,453 +54,6 @@ HOST_DEVICE_FUN HOST_DEVICE_INLINE double inverseSquareRoot(double x) #endif } -template -struct Index -{ - static constexpr int I = Index::I + 1; - static constexpr uint64_t F = Index::F * nz; - template - static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3& dX) - { - return Index::power(dX) * dX[2]; - } -}; - -template -struct Index -{ - static constexpr int I = Index::I + 1; - static constexpr uint64_t F = Index::F * ny; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3& dX) - { - return Index::power(dX) * dX[1]; - } -}; - -template -struct Index -{ - static constexpr int I = Index<0, 0, nx - 1>::I + 1; - static constexpr uint64_t F = Index::F * nx; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3& dX) - { - return Index::power(dX) * dX[0]; - } -}; - -template<> -struct Index<0, 0, 0> -{ - static constexpr int I = 0; - static constexpr uint64_t F = 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T power(const Vec3&) - { - return T(1.0); - } -}; - -template -struct DerivativeTerm -{ - static constexpr int c = 1 - 2 * n; - - template - static HOST_DEVICE_FUN DEVICE_INLINE void invR(T* invRN, const T& invR2) - { - DerivativeTerm::invR(invRN, invR2); - invRN[n] = c * invRN[n - 1] * invR2; - } -}; - -template<> -struct DerivativeTerm<0> -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE void invR(T*, const T&) - { - } -}; - -template -struct DerivativeSum -{ - static constexpr int cx = nx * (nx - 1) / 2; - static constexpr int cy = ny * (ny - 1) / 2; - static constexpr int cz = nz * (nz - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cx * DerivativeSum 3) * 4 + 3>::loop(invRN, dX) + - cy * DerivativeSum 3) * 2 + 5>::loop(invRN, dX) + - cz * DerivativeSum 3) + 6>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cx = nx * (nx - 1) / 2; - static constexpr int cy = ny * (ny - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cx * DerivativeSum 3) * 4 + 2>::loop(invRN, dX) + - cy * DerivativeSum 3) * 2 + 4>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cx = nx * (nx - 1) / 2; - static constexpr int cz = nz * (nz - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cx * DerivativeSum 3) * 4 + 1>::loop(invRN, dX) + - cz * DerivativeSum 3) + 4>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cx = nx * (nx - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cx * DerivativeSum 3) * 4>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cy = ny * (ny - 1) / 2; - static constexpr int cz = nz * (nz - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cy * DerivativeSum 3) * 2 + 1>::loop(invRN, dX) + - cz * DerivativeSum 3) + 2>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cy = ny * (ny - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cy * DerivativeSum 3) * 2>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int cz = nz * (nz - 1) / 2; - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d + - cz * DerivativeSum 3)>::loop(invRN, dX); - } -}; - -template -struct DerivativeSum -{ - static constexpr int n = nx + ny + nz; - static constexpr int d = depth > 0 ? depth : 1; - - template - static HOST_DEVICE_FUN DEVICE_INLINE T loop(const T* invRN, const Vec3& dX) - { - return Index::power(dX) * invRN[n + depth] / d; - } -}; - -template -struct MultipoleSum -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3& dX, const MType& M) - { - return MultipoleSum::kernel(dX, M) + Index::power(dX) / - Index::F * - M[Index::I]; - } -}; - -template -struct MultipoleSum -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3& dX, const MType& M) - { - return MultipoleSum::kernel(dX, M) + - Index::power(dX) / Index::F * M[Index::I]; - } -}; - -template -struct MultipoleSum -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3& dX, const MType& M) - { - return MultipoleSum::kernel(dX, M) + - Index::power(dX) / Index::F * M[Index::I]; - } -}; - -template -struct MultipoleSum -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE T kernel(const Vec3& dX, const MType& M) - { - return Index::power(dX) / Index::F * M[Index<0, 0, 0>::I]; - } -}; - -template -struct Kernels -{ - static constexpr int n = nx + ny + nz; - static constexpr int x = nx > 0; - static constexpr int y = ny > 0; - static constexpr int z = nz > 0; - static constexpr int flag = (nx > 1) * 4 + (ny > 1) * 2 + (nz > 1); - - template - static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3& dX) - { - Kernels::P2M(M, dX); - M[Index::I] = Index::power(dX) / Index::F * M[0]; - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3& dX, const MType& MJ) - { - Kernels::M2M(MI, dX, MJ); - MI[Index::I] += MultipoleSum::kernel(dX, MJ); - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4& TRG, T* invRN, const Vec3& dX, const MType& M) - { - Kernels::M2P(TRG, invRN, dX, M); - T C = DerivativeSum<0, nx, ny, nz, flag>::loop(invRN, dX); - TRG[0] -= M[Index::I] * C; - TRG[1] += M[Index<(nx - 1) * x, ny, nz>::I] * C * x; - TRG[2] += M[Index::I] * C * y; - TRG[3] += M[Index::I] * C * z; - } -}; - -template -struct Kernels -{ - static constexpr int n = nx + ny; - static constexpr int x = nx > 0; - static constexpr int y = ny > 0; - static constexpr int flag = (nx > 1) * 4 + (ny > 1) * 2; - - template - static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3& dX) - { - Kernels::P2M(M, dX); - M[Index::I] = Index::power(dX) / Index::F * M[0]; - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3& dX, const MType& MJ) - { - Kernels::M2M(MI, dX, MJ); - MI[Index::I] += MultipoleSum::kernel(dX, MJ); - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4& TRG, T* invRN, const Vec3& dX, const MType& M) - { - Kernels::M2P(TRG, invRN, dX, M); - T C = DerivativeSum<0, nx, ny, 0, flag>::loop(invRN, dX); - TRG[0] -= M[Index::I] * C; - TRG[1] += M[Index<(nx - 1) * x, ny, 0>::I] * C * x; - TRG[2] += M[Index::I] * C * y; - } -}; - -template -struct Kernels -{ - static constexpr int n = nx; - static constexpr int flag = (nx > 1) * 4; - - template - static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3& dX) - { - Kernels<0, 0, nx - 1>::P2M(M, dX); - M[Index::I] = Index::power(dX) / Index::F * M[0]; - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3& dX, const MType& MJ) - { - Kernels<0, 0, nx - 1>::M2M(MI, dX, MJ); - MI[Index::I] += MultipoleSum::kernel(dX, MJ); - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4& TRG, T* invRN, const Vec3 dX, const MType& M) - { - Kernels<0, 0, nx - 1>::M2P(TRG, invRN, dX, M); - const float C = DerivativeSum<0, nx, 0, 0, flag>::loop(invRN, dX); - TRG[0] -= M[Index::I] * C; - TRG[1] += M[Index::I] * C; - } -}; - -template<> -struct Kernels<0, 0, 2> -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& M, const Vec3& dX) - { - Kernels<0, 1, 1>::P2M(M, dX); - M[Index<0, 0, 2>::I] = Index<0, 0, 2>::power(dX) / Index<0, 0, 2>::F * M[0]; - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3& dX, const MType& MJ) - { - Kernels<0, 1, 1>::M2M(MI, dX, MJ); - MI[Index<0, 0, 2>::I] += MultipoleSum<0, 0, 2>::kernel(dX, MJ); - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4& TRG, T* invRN, const Vec3& dX, const MType& M) - { - TRG[0] -= invRN[0] + invRN[1] * (M[4] + M[7] + M[9]) + - invRN[2] * (M[4] * dX[0] * dX[0] + M[5] * dX[0] * dX[1] + M[6] * dX[0] * dX[2] + - M[7] * dX[1] * dX[1] + M[8] * dX[1] * dX[2] + M[9] * dX[2] * dX[2]); - TRG[1] += dX[0] * invRN[1]; - TRG[2] += dX[1] * invRN[1]; - TRG[3] += dX[2] * invRN[1]; - } -}; - -template<> -struct Kernels<0, 0, 0> -{ - template - static HOST_DEVICE_FUN DEVICE_INLINE void P2M(MType& /*M*/, const Vec3& /*dX*/) - { - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2M(MType& MI, const Vec3& dX, const MType& MJ) - { - MI[Index<0, 0, 0>::I] += MultipoleSum<0, 0, 0>::kernel(dX, MJ); - } - - template - static HOST_DEVICE_FUN DEVICE_INLINE void M2P(Vec4& TRG, T* invRN, const Vec3&, const MType&) - { - TRG[0] -= invRN[0]; - } -}; - -/*! @brief calculate multipole from particles - * - * @param[in] begin first particle index of bodyPos - * @param[in] end last particles index of bodyPos - * @param[in] Xout expansion (com) center of the output multipole - * @param[in] bodyPos body position and mass - * @param[out] Mout output multipole to add contributions to - * - */ -template{}, int> = 0> -HOST_DEVICE_FUN DEVICE_INLINE void P2M(const T* x, const T* y, const T* z, const Tm* m, int begin, int end, - const Vec4& Xout, MType& Mout) -{ - constexpr int P = ExpansionOrder{}; - - Mout = 0; - for (int i = begin; i < end; i++) - { - Vec4 body = {x[i], y[i], z[i], T(m[i])}; - Vec3 dX = makeVec3(Xout - body); - MType M; - M[0] = body[3]; - Kernels<0, 0, P - 1>::P2M(M, dX); - Mout += M; - } -} - -/*! @brief Combine multipoles into a single multipole - * - * @tparam T float or double - * @tparam MType Spherical multipole, quadrupole or octopole - * @param[in] begin first index into @p sourceCenter and @p Multipole to aggregate - * @param[in] end last index - * @param[in] Xout the expansion (com) center of the output multipole - * @param[in] Xsrc input multipole expansion (com) centers - * @param[in] Msrc input multipoles - * @param[out] Mout the aggregated output multipole - */ -template{}, int> = 0> -HOST_DEVICE_FUN DEVICE_INLINE void M2M(int begin, int end, const Vec4& Xout, const Vec4* Xsrc, const MType* Msrc, - MType& Mout) -{ - constexpr int P = ExpansionOrder{}; - - Mout = 0; - for (int i = begin; i < end; i++) - { - const MType& Mi = Msrc[i]; - Vec4 Xi = Xsrc[i]; - Vec3 dX = makeVec3(Xout - Xi); - Kernels<0, 0, P - 1>::M2M(Mout, dX, Mi); - } -} - /*! @brief interaction between two particles * * @param acc acceleration to add to @@ -534,101 +87,4 @@ HOST_DEVICE_FUN DEVICE_INLINE Vec4 P2P(Vec4 acc, const Vec3& pos_i, return acc; } -/*! @brief apply a spherial multipole to a particle - * - * @param acc acceleration to add to - * @param pos_i target particle coordinate - * @param pos_j multipole source coordinate - * @param M the multipole - * @param EPS2 plummer softening parameter - * @return input acceleration plus contribution from this call - */ -template{}, int> = 0> -HOST_DEVICE_FUN DEVICE_INLINE Vec4 M2P(Vec4 acc, const Vec3& pos_i, const Vec3& pos_j, MType& M) -{ - constexpr int P = ExpansionOrder{}; - - Vec3 dX = pos_i - pos_j; - Tc R2 = norm2(dX); - Tc invR = inverseSquareRoot(R2); - Tc invR2 = invR * invR; - - Tc invRN[P]; - invRN[0] = M[0] * invR; - DerivativeTerm

::invR(invRN, invR2); - - auto M0 = M[0]; - M[0] = 1; - Kernels<0, 0, P - 1>::M2P(acc, invRN, dX, M); - M[0] = M0; - - return acc; -} - -//! @brief computes the center of mass for the bodies in the specified range -template -HOST_DEVICE_FUN DEVICE_INLINE Vec4 setCenter(const int begin, const int end, const T* x, const T* y, const T* z, - const T* m) -{ - assert(begin <= end); - - Vec4 center{0, 0, 0, 0}; - for (int i = begin; i < end; i++) - { - T weight = m[i]; - - center[0] += weight * x[i]; - center[1] += weight * y[i]; - center[2] += weight * z[i]; - center[3] += weight; - } - - T invM = (center[3] != 0.0f) ? 1.0f / center[3] : 0.0f; - center[0] *= invM; - center[1] *= invM; - center[2] *= invM; - - return center; -} - -//! @brief computes the center of mass for the bodies in the specified range -template -HOST_DEVICE_FUN DEVICE_INLINE Vec4 setCenter(const int begin, const int end, const Vec4* posGlob) -{ - assert(begin <= end); - - Vec4 center{0, 0, 0, 0}; - for (int i = begin; i < end; i++) - { - Vec4 pos = posGlob[i]; - T weight = pos[3]; - - center[0] += weight * pos[0]; - center[1] += weight * pos[1]; - center[2] += weight * pos[2]; - center[3] += weight; - } - - T invM = (center[3] != 0.0f) ? 1.0f / center[3] : 0.0f; - center[0] *= invM; - center[1] *= invM; - center[2] *= invM; - - return center; -} - -template{}, int> = 0> -HOST_DEVICE_FUN MType normalize(const MType& multipole) -{ - using T = typename MType::value_type; - MType M = multipole; - - T mass = M[0]; - T invM = (mass != T(0.0)) ? T(1.0) / mass : T(0.0); - M *= invM; - M[0] = mass; - - return M; -} - } // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/traversal.cuh b/ryoanji/src/ryoanji/nbody/traversal.cuh index cbdf9c1ce..a22c3cf66 100644 --- a/ryoanji/src/ryoanji/nbody/traversal.cuh +++ b/ryoanji/src/ryoanji/nbody/traversal.cuh @@ -35,7 +35,7 @@ #include #include "cstone/cuda/gpu_config.cuh" #include "cstone/primitives/warpscan.cuh" -#include "cstone/traversal/groups.cuh" +#include "cstone/traversal/groups.hpp" #include "cartesian_qpole.hpp" #include "kernel.hpp" @@ -563,79 +563,4 @@ __global__ __launch_bounds__(TravConfig::numThreads) void traverse( } } -/*! @brief Compute approximate body accelerations with Barnes-Hut - * - * @param[in] firstBody index of first body in @p bodyPos to compute acceleration for - * @param[in] lastBody index (exclusive) of last body in @p bodyPos to compute acceleration for - * @param[in] x,y,z,m,h bodies, in SFC order and as referenced by sourceCells - * @param[in] G gravitational constant - * @param[in] numShells number of periodic shells in each dimension to include - * @param[in] box coordinate bounding box - * @param[inout] p body potential to add to, on device - * @param[inout] ax,ay,az body acceleration to add to - * @param[in] childOffsets location (index in [0:numTreeNodes]) of first child of each cell, 0 indicates a leaf - * @param[in] internalToLeaf for each cell in [0:numTreeNodes], stores the leaf cell (cstone) index in [0:numLeaves] - * if the cell is not a leaf, the value is negative - * @param[in] layout for each leaf cell in [0:numLeaves], stores the index of the first body in the cell - * @param[in] sourceCenter x,y,z center and square MAC radius of each cell in [0:numTreeNodes] - * @param[in] Multipole cell multipoles, on device - * @return P2P and M2P interaction statistics - */ -template -auto computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z, const Tm* m, - const Th* h, Tc G, int numShells, const cstone::Box& box, Ta* p, Ta* ax, Tc* ay, Tc* az, - const TreeNodeIndex* childOffsets, const TreeNodeIndex* internalToLeaf, - const LocalIndex* layout, const Vec4* sourceCenter, const MType* Multipole) -{ - constexpr int numWarpsPerBlock = TravConfig::numThreads / GpuConfig::warpSize; - - cstone::GroupData groups; - cstone::computeFixedGroups(firstBody, lastBody, TravConfig::targetSize, groups); - - LocalIndex numBodies = lastBody - firstBody; - int numTargets = (numBodies - 1) / TravConfig::targetSize + 1; - int numBlocks = (numTargets - 1) / numWarpsPerBlock + 1; - numBlocks = std::min(numBlocks, TravConfig::maxNumActiveBlocks); - - printf("launching %d blocks\n", numBlocks); - - const int poolSize = TravConfig::memPerWarp * numWarpsPerBlock * numBlocks; - thrust::device_vector globalPool(poolSize); - - resetTraversalCounters<<<1, 1>>>(); - auto t0 = std::chrono::high_resolution_clock::now(); - traverse<<>>( - groups.view(), 1, x, y, z, m, h, childOffsets, internalToLeaf, layout, sourceCenter, Multipole, G, numShells, - {box.lx(), box.ly(), box.lz()}, p, ax, ay, az, thrust::raw_pointer_cast(globalPool.data())); - kernelSuccess("traverse"); - - auto t1 = std::chrono::high_resolution_clock::now(); - double dt = std::chrono::duration(t1 - t0).count(); - - typename BhStats::type stats[BhStats::numStats]; - checkGpuErrors(cudaMemcpyFromSymbol(stats, bhStats, BhStats::numStats * sizeof(BhStats::type))); - - auto sumP2P = stats[BhStats::sumP2P]; - auto maxP2P = stats[BhStats::maxP2P]; - auto sumM2P = stats[BhStats::sumM2P]; - auto maxM2P = stats[BhStats::maxM2P]; - - float totalPotential; - checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, totalPotentialGlob, sizeof(float))); - - util::array interactions; - interactions[0] = Tc(sumP2P) / Tc(numBodies); - interactions[1] = Tc(maxP2P); - interactions[2] = Tc(sumM2P) / Tc(numBodies); - interactions[3] = Tc(maxM2P); - interactions[4] = totalPotential; - - Tc flops = (interactions[0] * 20.0 + interactions[2] * 2.0 * powf(ExpansionOrder{}, 3)) * - Tc(numBodies) / dt / 1e12; - - fprintf(stdout, "Traverse : %.7f s (%.7f TFlops)\n", dt, flops); - - return interactions; -} - } // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/types.h b/ryoanji/src/ryoanji/nbody/types.h index aca3acfa8..89e152b45 100644 --- a/ryoanji/src/ryoanji/nbody/types.h +++ b/ryoanji/src/ryoanji/nbody/types.h @@ -24,7 +24,7 @@ */ /*! @file - * @brief Ryoanji multipole and related types + * @brief helper types * * @author Rio Yokota */ @@ -47,43 +47,4 @@ using Vec4 = cstone::Vec4; using TreeNodeIndex = cstone::TreeNodeIndex; using LocalIndex = cstone::LocalIndex; -template -struct TermSize : public stl::integral_constant -{ -}; - -template -using SphericalMultipole = util::array{}>; - -template -struct IsSpherical - : public stl::integral_constant{} || MType{}.size() == TermSize<4>{}> -{ -}; - -template -struct ExpansionOrder -{ -}; - -template<> -struct ExpansionOrder{}> : stl::integral_constant -{ -}; - -template<> -struct ExpansionOrder{}> : stl::integral_constant -{ -}; - -template<> -struct ExpansionOrder{}> : stl::integral_constant -{ -}; - -template<> -struct ExpansionOrder{}> : stl::integral_constant -{ -}; - } // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu b/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu new file mode 100644 index 000000000..b4c970a90 --- /dev/null +++ b/ryoanji/src/ryoanji/nbody/upsweep_gpu.cu @@ -0,0 +1,115 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*! @file + * @brief Upsweep for multipole and source center computation + * + * @author Sebastian Keller + */ + +#include "cstone/cuda/cuda_runtime.hpp" +#include "cstone/primitives/math.hpp" + +#include "ryoanji/nbody/cartesian_qpole.hpp" +#include "ryoanji/nbody/kernel.hpp" + +#include "upsweep_gpu.h" + +namespace ryoanji +{ + +struct UpsweepConfig +{ + static constexpr int numThreads = 256; +}; + +template +__global__ void computeLeafMultipolesKernel(const Tc* x, const Tc* y, const Tc* z, const Tm* m, + const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves, + const LocalIndex* layout, const Vec4* centers, MType* multipoles) +{ + unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (leafIdx < numLeaves) + { + TreeNodeIndex i = leafToInternal[leafIdx]; + P2M(x, y, z, m, layout[leafIdx], layout[leafIdx + 1], centers[i], multipoles[i]); + } +} + +template +void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m, const TreeNodeIndex* leafToInternal, + TreeNodeIndex numLeaves, const LocalIndex* layout, const Vec4* centers, + MType* multipoles) +{ + constexpr int numThreads = UpsweepConfig::numThreads; + if (numLeaves) + { + computeLeafMultipolesKernel<<>>( + x, y, z, m, leafToInternal, numLeaves, layout, centers, multipoles); + } +} + +#define COMPUTE_LEAF_MULTIPOLES(Tc, Tm, Tf, MType) \ + template void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m, \ + const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves, \ + const LocalIndex* layout, const Vec4* centers, MType* multipoles) + +COMPUTE_LEAF_MULTIPOLES(double, double, double, CartesianQuadrupole); +COMPUTE_LEAF_MULTIPOLES(double, float, double, CartesianQuadrupole); +COMPUTE_LEAF_MULTIPOLES(float, float, float, CartesianQuadrupole); + +template +__global__ void upsweepMultipolesKernel(TreeNodeIndex firstCell, TreeNodeIndex lastCell, + const TreeNodeIndex* childOffsets, const Vec4* centers, MType* multipoles) +{ + const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell; + if (cellIdx >= lastCell) return; + + TreeNodeIndex firstChild = childOffsets[cellIdx]; + + // firstChild is zero if the cell is a leaf + if (firstChild) { M2M(firstChild, firstChild + 8, centers[cellIdx], centers, multipoles, multipoles[cellIdx]); } +} + +template +void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets, + const Vec4* centers, MType* multipoles) +{ + constexpr int numThreads = UpsweepConfig::numThreads; + if (lastCell > firstCell) + { + upsweepMultipolesKernel<<>>( + firstCell, lastCell, childOffsets, centers, multipoles); + } +} + +#define UPSWEEP_MULTIPOLES(T, MType) \ + template void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, \ + const TreeNodeIndex* childOffsets, const Vec4* centers, MType* multipoles) + +UPSWEEP_MULTIPOLES(double, CartesianQuadrupole); +UPSWEEP_MULTIPOLES(double, CartesianQuadrupole); +UPSWEEP_MULTIPOLES(float, CartesianQuadrupole); + +} // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/upsweep_gpu.h b/ryoanji/src/ryoanji/nbody/upsweep_gpu.h new file mode 100644 index 000000000..6f379fb6d --- /dev/null +++ b/ryoanji/src/ryoanji/nbody/upsweep_gpu.h @@ -0,0 +1,58 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*! @file + * @brief Upsweep for multipole and source center computation + * + * @author Sebastian Keller + */ + +#pragma once + +#include "ryoanji/nbody/types.h" +#include "kernel.hpp" + +namespace ryoanji +{ + +template +extern void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m, + const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves, + const LocalIndex* layout, const Vec4* centers, MType* multipoles); + +/*! @brief perform multipole upward sweep for one tree level + * + * launch config: one thread per cell of the current level + * + * @param[in] firstCell first cell to process + * @param[in] lastCell last cell to process + * @param[in] childOffsets cell index of first child of each node + * @param[in] centers source expansion (mass) centers + * @param[out] multipoles output multipole of each cell + */ +template +extern void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets, + const Vec4* centers, MType* multipoles); + +} // namespace ryoanji diff --git a/ryoanji/src/ryoanji/nbody/upwardpass.cuh b/ryoanji/src/ryoanji/nbody/upwardpass.cuh deleted file mode 100644 index 3dd145ded..000000000 --- a/ryoanji/src/ryoanji/nbody/upwardpass.cuh +++ /dev/null @@ -1,235 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/*! @file - * @brief Upsweep for multipole and source center computation - * - * @author Rio Yokota - * @author Sebastian Keller - */ - -#pragma once - -#include - -#include "cstone/cuda/gpu_config.cuh" - -#include "kernel.hpp" - -namespace ryoanji -{ - -struct UpsweepConfig -{ - static constexpr int numThreads = 256; -}; - -template -__global__ void computeLeafMultipoles(const Tc* x, const Tc* y, const Tc* z, const Tm* m, - const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves, - const LocalIndex* layout, const Vec4* centers, MType* multipoles) -{ - unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (leafIdx < numLeaves) - { - TreeNodeIndex i = leafToInternal[leafIdx]; - P2M(x, y, z, m, layout[leafIdx], layout[leafIdx + 1], centers[i], multipoles[i]); - } -} - -/*! @brief perform multipole upward sweep for one tree level - * - * launch config: one thread per cell of the current level - * - * @param[in] firstCell first cell to process - * @param[in] lastCell last cell to process - * @param[in] childOffsets cell index of first child of each node - * @param[in] centers source expansion (mass) centers - * @param[out] multipoles output multipole of each cell - */ -template -__global__ void upsweepMultipoles(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets, - const Vec4* centers, MType* multipoles) -{ - const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell; - if (cellIdx >= lastCell) return; - - TreeNodeIndex firstChild = childOffsets[cellIdx]; - - // firstChild is zero if the cell is a leaf - if (firstChild) { M2M(firstChild, firstChild + 8, centers[cellIdx], centers, multipoles, multipoles[cellIdx]); } -} - -template -__global__ void computeLeafCenters(const Tc* x, const Tc* y, const Tc* z, const Tm* m, const Th* h, - const TreeNodeIndex* leafToInternal, TreeNodeIndex numLeaves, - const LocalIndex* layout, Vec4* centers, Vec4* cellXmin, Vec4* cellXmax) -{ - unsigned leafIdx = blockIdx.x * blockDim.x + threadIdx.x; - - const Tc huge = Tc(1e10); - Vec4 Xmin{+huge, +huge, +huge, +huge}; - Vec4 Xmax{-huge, -huge, -huge, -huge}; - Vec4 center; - - if (leafIdx < numLeaves) - { - TreeNodeIndex begin = layout[leafIdx]; - TreeNodeIndex end = layout[leafIdx + 1]; - center = setCenter(begin, end, x, y, z, m); - for (int i = begin; i < end; i++) - { - Vec4 pos = {x[i], y[i], z[i], h[i]}; - Xmin = min(Xmin, pos); - Xmax = max(Xmax, pos); - } - TreeNodeIndex cellIdx = leafToInternal[leafIdx]; - centers[cellIdx] = center; - cellXmin[cellIdx] = Xmin; - cellXmax[cellIdx] = Xmax; - } -} - -/*! @brief perform source expansion center upward sweep for one tree level - * - * launch config: one thread per cell of the current level - * - * @param[in] firstCell first cell to process - * @param[in] lastCell last cell to process - * @param[in] childOffsets cell index of first child of each node - * @param[out] centers source expansion (mass) centers - * @param[out] cellXmin minimum coordinate of any body in the cell - * @param[out] cellXmax maximum coordinate of any body in the cell - */ -template -__global__ void upsweepCenters(TreeNodeIndex firstCell, TreeNodeIndex lastCell, const TreeNodeIndex* childOffsets, - Vec4* centers, Vec4* cellXmin, Vec4* cellXmax) -{ - const int cellIdx = blockIdx.x * blockDim.x + threadIdx.x + firstCell; - if (cellIdx >= lastCell) return; - - const T huge = T(1e10); - Vec4 Xmin{+huge, +huge, +huge, +huge}; - Vec4 Xmax{-huge, -huge, -huge, -huge}; - Vec4 center; - - TreeNodeIndex firstChild = childOffsets[cellIdx]; - - // firstChild is zero if the cell is a leaf - if (firstChild) - { - TreeNodeIndex begin = firstChild; - TreeNodeIndex end = firstChild + 8; - center = setCenter(begin, end, centers); - for (int i = begin; i < end; i++) - { - Xmin = min(Xmin, cellXmin[i]); - Xmax = max(Xmax, cellXmax[i]); - } - centers[cellIdx] = center; - cellXmin[cellIdx] = Xmin; - cellXmax[cellIdx] = Xmax; - } -} - -//! @brief calculate the squared MAC radius of each cell, store as the 4-th member sourceCenter -template -__global__ void setMAC(int numCells, T invTheta, Vec4* sourceCenter, const Vec4* cellXmin, - const Vec4* cellXmax) -{ - int cellIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (cellIdx >= numCells) { return; } - - Vec3 Xmin = makeVec3(cellXmin[cellIdx]); - Vec3 Xmax = makeVec3(cellXmax[cellIdx]); - T Hmax = cellXmax[cellIdx][3]; - Vec3 Xi = makeVec3(sourceCenter[cellIdx]); - Vec3 X = (Xmax + Xmin) * T(0.5); - Vec3 R = (Xmax - Xmin) * T(0.5); - Vec3 dX = X - Xi; - T s = sqrt(norm2(dX)); - T l = T(2) * max(R); - T MAC = max(l * invTheta + s, 2 * Hmax + s + l * 0.5); - T MAC2 = MAC * MAC; - - sourceCenter[cellIdx][3] = MAC2; -} - -template -__global__ void normalize(int numCells, MType* multipoles) -{ - using T = typename MType::value_type; - int cellIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (cellIdx >= numCells) { return; } - - multipoles[cellIdx] = normalize(multipoles[cellIdx]); -} - -template -void upsweep(int numSources, int numLeaves, int numLevels, T theta, const int2* levelRange, const T* x, const T* y, - const T* z, const T* m, const T* h, const LocalIndex* layout, const TreeNodeIndex* childOffsets, - const TreeNodeIndex* leafToInternal, Vec4* centers, MType* Multipole) -{ - constexpr int numThreads = UpsweepConfig::numThreads; - - thrust::device_vector> d_cellXminmax(2 * numSources); - Vec4* cellXmin = rawPtr(d_cellXminmax); - Vec4* cellXmax = cellXmin + numSources; - - auto t0 = std::chrono::high_resolution_clock::now(); - - computeLeafCenters<<<(numLeaves - 1) / numThreads + 1, numThreads>>>(x, y, z, m, h, leafToInternal, numLeaves, - layout, centers, cellXmin, cellXmax); - for (int level = numLevels - 1; level >= 1; level--) - { - int numCellsLevel = levelRange[level].y - levelRange[level].x; - int numBlocks = (numCellsLevel - 1) / numThreads + 1; - upsweepCenters<<>>(levelRange[level].x, levelRange[level].y, childOffsets, centers, - cellXmin, cellXmax); - } - - computeLeafMultipoles<<<(numLeaves - 1) / numThreads + 1, numThreads>>>(x, y, z, m, leafToInternal, numLeaves, - layout, centers, Multipole); - for (int level = numLevels - 1; level >= 1; level--) - { - int numCellsLevel = levelRange[level].y - levelRange[level].x; - int numBlocks = (numCellsLevel - 1) / numThreads + 1; - upsweepMultipoles<<>>(levelRange[level].x, levelRange[level].y, childOffsets, centers, - Multipole); - } - - kernelSuccess("upwardPass"); - - int numBlocks = (numSources - 1) / numThreads + 1; - setMAC<<>>(numSources, T(1.0) / theta, centers, cellXmin, cellXmax); - normalize<<>>(numSources, Multipole); - - auto t1 = std::chrono::high_resolution_clock::now(); - double dt = std::chrono::duration(t1 - t0).count(); - - fprintf(stdout, "Upward pass : %.7f s\n", dt); -} - -} // namespace ryoanji diff --git a/ryoanji/test/CMakeLists.txt b/ryoanji/test/CMakeLists.txt index 4c1514d93..b7e6b3684 100644 --- a/ryoanji/test/CMakeLists.txt +++ b/ryoanji/test/CMakeLists.txt @@ -10,8 +10,9 @@ endif() if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) add_executable(ryoanji_demo demo.cu) - target_link_libraries(ryoanji_demo PUBLIC cstone_tree) + target_link_libraries(ryoanji_demo PUBLIC ryoanji_kernels cstone_gpu) set_target_properties(ryoanji_demo PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_include_directories(ryoanji_demo PRIVATE ${RYOANJI_TEST_INCLUDE_DIRS}) # set_source_files_properties(demo.cu PROPERTIES COMPILE_FLAGS "-Xptxas -v") install(TARGETS ryoanji_demo RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}/ryoanji/ryoanji_demo) endif () @@ -24,10 +25,6 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) target_link_libraries(${testname} PRIVATE cstone_gpu ryoanji) - if (GPU_DIRECT) - target_compile_definitions(${testname} PRIVATE USE_GPU_DIRECT) - endif () - if (CMAKE_HIP_COMPILER) target_link_libraries(${testname} PRIVATE hip::host) target_compile_definitions(${testname} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) @@ -70,7 +67,7 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER) nbody/direct.cu test_main.cpp) target_include_directories(${testname} PRIVATE ${PROJECT_SOURCE_DIR}/src) - target_link_libraries(${testname} PUBLIC OpenMP::OpenMP_CXX cstone_tree ryoanji GTest::gtest_main) + target_link_libraries(${testname} PUBLIC OpenMP::OpenMP_CXX ryoanji GTest::gtest_main) if (CMAKE_HIP_COMPILER) set_target_properties(${testname} PROPERTIES LINKER_LANGUAGE CXX) endif () diff --git a/ryoanji/test/demo.cu b/ryoanji/test/demo.cu index fbf620943..c9a5ad422 100644 --- a/ryoanji/test/demo.cu +++ b/ryoanji/test/demo.cu @@ -1,8 +1,7 @@ /* * MIT License * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,12 +22,14 @@ * SOFTWARE. */ -#include -#include +/*! @file + * @brief Single-GPU demonstrator app for the Ryoanji N-body library + * + * @author Sebastian Keller + * @author Rio Yokota + */ -#ifdef __HIPCC__ -#include -#endif +#include #include #include @@ -36,21 +37,36 @@ #include "cstone/cuda/cuda_utils.cuh" #include "cstone/cuda/gpu_config.cuh" #include "cstone/cuda/thrust_util.cuh" +#include "cstone/focus/source_center_gpu.h" +#include "cstone/traversal/groups_gpu.h" +#include "cstone/util/array.hpp" #include "nbody/dataset.hpp" #include "ryoanji/interface/treebuilder.cuh" #include "ryoanji/nbody/types.h" #include "ryoanji/nbody/traversal.cuh" #include "ryoanji/nbody/direct.cuh" -#include "ryoanji/nbody/upwardpass.cuh" +#include "ryoanji/nbody/upsweep_gpu.h" using namespace ryoanji; +template +util::array computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z, + const Tm* m, const Th* h, Tc G, int numShells, const cstone::Box& box, Ta* p, + Ta* ax, Tc* ay, Tc* az, const TreeNodeIndex* childOffsets, + const TreeNodeIndex* internalToLeaf, const LocalIndex* layout, + const Vec4* sourceCenter, const MType* Multipole); + +template +void upsweep(int numSources, int numLeaves, int numLevels, float theta, const TreeNodeIndex* levelRange, const T* x, + const T* y, const T* z, const T* m, const cstone::Box& box, const LocalIndex* layout, + const KeyType* prefixes, const TreeNodeIndex* childOffsets, const TreeNodeIndex* leafToInternal, + Vec4* centers, MType* Multipole); + int main(int argc, char** argv) { - constexpr int P = 4; using T = float; - using MultipoleType = SphericalMultipole; + using MultipoleType = CartesianQuadrupole; int power = argc > 1 ? std::stoi(argv[1]) : 17; int directRef = argc > 2 ? std::stoi(argv[2]) : 1; @@ -65,7 +81,6 @@ int main(int argc, char** argv) fprintf(stdout, "--- BH Parameters ---------------\n"); fprintf(stdout, "numBodies : %lu\n", numBodies); - fprintf(stdout, "P : %d\n", P); fprintf(stdout, "theta : %f\n", theta); fprintf(stdout, "ncrit : %d\n", ncrit); @@ -77,17 +92,17 @@ int main(int argc, char** argv) cstone::Box box(-boxSize, boxSize); - TreeBuilder treeBuilder; + TreeBuilder treeBuilder(ncrit); int numSources = treeBuilder.update(rawPtr(d_x), rawPtr(d_y), rawPtr(d_z), numBodies, box); - std::vector levelRange(treeBuilder.maxTreeLevel() + 1); - int highestLevel = treeBuilder.extract(levelRange.data()); + const TreeNodeIndex* levelRange = treeBuilder.levelRange(); + int highestLevel = treeBuilder.maxTreeLevel(); thrust::device_vector> sourceCenter(numSources); thrust::device_vector Multipole(numSources); - upsweep(numSources, treeBuilder.numLeafNodes(), highestLevel, theta, levelRange.data(), rawPtr(d_x), rawPtr(d_y), - rawPtr(d_z), rawPtr(d_m), rawPtr(d_h), treeBuilder.layout(), treeBuilder.childOffsets(), + upsweep(numSources, treeBuilder.numLeafNodes(), highestLevel, theta, levelRange, rawPtr(d_x), rawPtr(d_y), + rawPtr(d_z), rawPtr(d_m), box, treeBuilder.layout(), treeBuilder.nodeKeys(), treeBuilder.childOffsets(), treeBuilder.leafToInternal(), rawPtr(sourceCenter), rawPtr(Multipole)); thrust::device_vector d_p(numBodies, 0), d_ax(numBodies, 0), d_ay(numBodies, 0), d_az(numBodies, 0); @@ -103,7 +118,7 @@ int main(int argc, char** argv) auto t1 = std::chrono::high_resolution_clock::now(); double dt = std::chrono::duration(t1 - t0).count(); - double flops = (interactions[0] * 20 + interactions[2] * 2 * pow(P, 3)) * numBodies / dt / 1e12; + double flops = (interactions[0] * 23 + interactions[2] * 65) * numBodies / dt / 1e12; fprintf(stdout, "--- Total runtime ----------------\n"); fprintf(stdout, "Total BH : %.7f s (%.7f TFlops)\n", dt, flops); @@ -119,7 +134,7 @@ int main(int argc, char** argv) t1 = std::chrono::high_resolution_clock::now(); dt = std::chrono::duration(t1 - t0).count(); - flops = std::pow((2 * numShells + 1), 3) * 24. * numBodies * numBodies / dt / 1e12; + flops = std::pow((2 * numShells + 1), 3) * 23. * numBodies * numBodies / dt / 1e12; fprintf(stdout, "Total Direct : %.7f s (%.7f TFlops)\n", dt, flops); thrust::host_vector h_p = d_p; @@ -165,3 +180,95 @@ int main(int argc, char** argv) return 0; } + +/*! @brief Compute approximate body accelerations with Barnes-Hut + * + * @param[in] firstBody index of first body in @p bodyPos to compute acceleration for + * @param[in] lastBody index (exclusive) of last body in @p bodyPos to compute acceleration for + * @param[in] x,y,z,m,h bodies, in SFC order and as referenced by sourceCells + * @param[in] G gravitational constant + * @param[in] numShells number of periodic shells in each dimension to include + * @param[in] box coordinate bounding box + * @param[inout] p body potential to add to, on device + * @param[inout] ax,ay,az body acceleration to add to + * @param[in] childOffsets location (index in [0:numTreeNodes]) of first child of each cell, 0 indicates a leaf + * @param[in] internalToLeaf for each cell in [0:numTreeNodes], stores the leaf cell (cstone) index in [0:numLeaves] + * if the cell is not a leaf, the value is negative + * @param[in] layout for each leaf cell in [0:numLeaves], stores the index of the first body in the cell + * @param[in] sourceCenter x,y,z center and square MAC radius of each cell in [0:numTreeNodes] + * @param[in] Multipole cell multipoles, on device + * @return P2P and M2P interaction statistics + */ +template +util::array computeAcceleration(size_t firstBody, size_t lastBody, const Tc* x, const Tc* y, const Tc* z, + const Tm* m, const Th* h, Tc G, int numShells, const cstone::Box& box, Ta* p, + Ta* ax, Tc* ay, Tc* az, const TreeNodeIndex* childOffsets, + const TreeNodeIndex* internalToLeaf, const LocalIndex* layout, + const Vec4* sourceCenter, const MType* Multipole) +{ + constexpr int numWarpsPerBlock = TravConfig::numThreads / GpuConfig::warpSize; + + cstone::GroupData groups; + cstone::computeFixedGroups(firstBody, lastBody, TravConfig::targetSize, groups); + + LocalIndex numBodies = lastBody - firstBody; + int numTargets = (numBodies - 1) / TravConfig::targetSize + 1; + int numBlocks = (numTargets - 1) / numWarpsPerBlock + 1; + numBlocks = std::min(numBlocks, TravConfig::maxNumActiveBlocks); + + printf("launching %d blocks\n", numBlocks); + + const int poolSize = TravConfig::memPerWarp * numWarpsPerBlock * numBlocks; + thrust::device_vector globalPool(poolSize); + + resetTraversalCounters<<<1, 1>>>(); + traverse<<>>( + groups.view(), 1, x, y, z, m, h, childOffsets, internalToLeaf, layout, sourceCenter, Multipole, G, numShells, + {box.lx(), box.ly(), box.lz()}, p, ax, ay, az, thrust::raw_pointer_cast(globalPool.data())); + kernelSuccess("traverse"); + + typename BhStats::type stats[BhStats::numStats]; + checkGpuErrors(cudaMemcpyFromSymbol(stats, GPU_SYMBOL(bhStats), BhStats::numStats * sizeof(BhStats::type))); + + auto sumP2P = stats[BhStats::sumP2P]; + auto maxP2P = stats[BhStats::maxP2P]; + auto sumM2P = stats[BhStats::sumM2P]; + auto maxM2P = stats[BhStats::maxM2P]; + + float totalPotential; + checkGpuErrors(cudaMemcpyFromSymbol(&totalPotential, GPU_SYMBOL(totalPotentialGlob), sizeof(float))); + + util::array interactions; + interactions[0] = Tc(sumP2P) / Tc(numBodies); + interactions[1] = Tc(maxP2P); + interactions[2] = Tc(sumM2P) / Tc(numBodies); + interactions[3] = Tc(maxM2P); + interactions[4] = totalPotential; + + return interactions; +} + +template +void upsweep(int numSources, int numLeaves, int numLevels, float theta, const TreeNodeIndex* levelRange, const T* x, + const T* y, const T* z, const T* m, const cstone::Box& box, const LocalIndex* layout, + const KeyType* prefixes, const TreeNodeIndex* childOffsets, const TreeNodeIndex* leafToInternal, + Vec4* centers, MType* Multipole) +{ + auto t0 = std::chrono::high_resolution_clock::now(); + + cstone::computeLeafSourceCenterGpu(x, y, z, m, leafToInternal, numLeaves, layout, centers); + cstone::upsweepCentersGpu(cstone::maxTreeLevel{}, levelRange, childOffsets, centers); + + computeLeafMultipoles(x, y, z, m, leafToInternal, numLeaves, layout, centers, Multipole); + for (int level = numLevels - 1; level >= 1; level--) + { + upsweepMultipoles(levelRange[level], levelRange[level + 1], childOffsets, centers, Multipole); + } + + cstone::setMacGpu(prefixes, numSources, centers, 1.f / theta, box); + + auto t1 = std::chrono::high_resolution_clock::now(); + double dt = std::chrono::duration(t1 - t0).count(); + + fprintf(stdout, "Upward pass : %.7f s\n", dt); +} diff --git a/ryoanji/test/demo_mpi.cpp b/ryoanji/test/demo_mpi.cpp index 1211f9f1f..fcf972e6d 100644 --- a/ryoanji/test/demo_mpi.cpp +++ b/ryoanji/test/demo_mpi.cpp @@ -47,8 +47,7 @@ using namespace ryoanji; template void ryoanjiTest(int thisRank, int numRanks, size_t numParticlesGlobal) { - constexpr int P = 4; - using MultipoleType = SphericalMultipole; + using MultipoleType = CartesianQuadrupole; size_t numParticles = numParticlesGlobal / numRanks; unsigned bucketSizeFocus = 64; unsigned numGlobalNodesPerRank = 100; @@ -114,7 +113,7 @@ void ryoanjiTest(int thisRank, int numRanks, size_t numParticlesGlobal) mpiAllreduce(&totalPotential, &totalPotentialGlobal, 1, MPI_SUM); auto [numP2P, maxP2P, numM2P, maxM2P, maxStack] = multipoleHolder.readStats(); - double flops = (numP2P * 23 + numM2P * 2 * pow(P, 3)) / dt / 1e12; + double flops = (numP2P * 23 + numM2P * 65) / dt / 1e12; for (int rank = 0; rank < numRanks; ++rank) { diff --git a/ryoanji/test/interface/CMakeLists.txt b/ryoanji/test/interface/CMakeLists.txt index baf0a304c..ca2fac30b 100644 --- a/ryoanji/test/interface/CMakeLists.txt +++ b/ryoanji/test/interface/CMakeLists.txt @@ -17,10 +17,6 @@ function(addRyoanjiGpuMpiTest source exename testname ranks) target_link_libraries(${exename} PRIVATE cstone_gpu ryoanji) endif() - if (GPU_DIRECT) - target_compile_definitions(${exename} PRIVATE USE_GPU_DIRECT) - endif () - if (CMAKE_HIP_COMPILER) target_link_libraries(${exename} PRIVATE hip::host) target_compile_definitions(${exename} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) diff --git a/ryoanji/test/interface/global_upsweep_gpu.cpp b/ryoanji/test/interface/global_upsweep_gpu.cpp index 4ec180057..3103b37b9 100644 --- a/ryoanji/test/interface/global_upsweep_gpu.cpp +++ b/ryoanji/test/interface/global_upsweep_gpu.cpp @@ -46,7 +46,7 @@ using namespace ryoanji; template static int multipoleHolderTest(int thisRank, int numRanks) { - using MultipoleType = SphericalMultipole; + using MultipoleType = CartesianQuadrupole; const LocalIndex numParticles = 1000 * numRanks; unsigned bucketSize = 64; unsigned bucketSizeLocal = 16; diff --git a/ryoanji/test/nbody/ewald.cu b/ryoanji/test/nbody/ewald.cu index 962d57c48..8f5afe514 100644 --- a/ryoanji/test/nbody/ewald.cu +++ b/ryoanji/test/nbody/ewald.cu @@ -12,7 +12,7 @@ #include "cstone/cuda/cuda_utils.cuh" #include "cstone/cuda/thrust_util.cuh" #include "cstone/focus/source_center.hpp" -#include "cstone/traversal/groups.cuh" +#include "cstone/traversal/groups_gpu.cu" #include "dataset.hpp" #include "ryoanji/nbody/cartesian_qpole.hpp" diff --git a/ryoanji/test/nbody/ewald_cpu.cpp b/ryoanji/test/nbody/ewald_cpu.cpp index 71ed1480e..3ed6dba46 100644 --- a/ryoanji/test/nbody/ewald_cpu.cpp +++ b/ryoanji/test/nbody/ewald_cpu.cpp @@ -47,6 +47,16 @@ const int TEST_RNG_SEED = 42; const int verbose = 0; #define V(level) if ((level) == verbose) +template +std::ostream& operator<<(std::ostream& os, const CartesianQuadrupole& M) +{ + os << "m: " << M[Cqi::mass] << " tr: " << M[Cqi::trace] << std::endl + << "xx: " << M[Cqi::qxx] << " xy: " << M[Cqi::qxy] << " xz: " << M[Cqi::qxz] << std::endl + << "yy: " << M[Cqi::qyy] << " yz: " << M[Cqi::qyz] << " zz: " << M[Cqi::qzz] << std::endl; + + return os; +} + template class GridCoordinates { @@ -184,10 +194,6 @@ makeTestTree(Coords& coordinates, cstone::Box box, float mass_scale, float th std::vector multipoles(octree.numNodes); computeLeafMultipoles(x, y, z, masses.data(), toInternal, layout.data(), centers.data(), multipoles.data()); upsweepMultipoles(octree.levelRange, octree.childOffsets.data(), centers.data(), multipoles.data()); - for (size_t i = 0; i < multipoles.size(); ++i) - { - multipoles[i] = ryoanji::normalize(multipoles[i]); - } T totalMass = std::accumulate(masses.begin(), masses.end(), 0.0); EXPECT_NEAR(totalMass, multipoles[0][ryoanji::Cqi::mass], 1e-6); diff --git a/ryoanji/test/nbody/kernel.cpp b/ryoanji/test/nbody/kernel.cpp index 29d830033..841b2c582 100644 --- a/ryoanji/test/nbody/kernel.cpp +++ b/ryoanji/test/nbody/kernel.cpp @@ -92,65 +92,3 @@ TEST(Gravity, P2PselfInteraction) auto self = P2P(acc, pos_i, pos_i, 1.0, h, h); EXPECT_EQ(self, acc); } - -TEST(Multipole, P2M) -{ - int numBodies = 1023; - - std::vector x(numBodies); - std::vector y(numBodies); - std::vector z(numBodies); - std::vector m(numBodies); - std::vector h(numBodies); - - ryoanji::makeCubeBodies(x.data(), y.data(), z.data(), m.data(), h.data(), numBodies); - - CartesianQuadrupole cartesianQuadrupole; - cstone::SourceCenterType csCenter = - cstone::massCenter(x.data(), y.data(), z.data(), m.data(), 0, numBodies); - P2M(x.data(), y.data(), z.data(), m.data(), 0, numBodies, csCenter, cartesianQuadrupole); - - Vec4 centerMass = ryoanji::setCenter(0, numBodies, x.data(), y.data(), z.data(), m.data()); - - ryoanji::SphericalMultipole multipole; - std::fill(multipole.begin(), multipole.end(), 0.0); - - ryoanji::P2M(x.data(), y.data(), z.data(), m.data(), 0, numBodies, centerMass, multipole); - - EXPECT_NEAR(multipole[0], cartesianQuadrupole[Cqi::mass], 1e-6); - - EXPECT_NEAR(centerMass[0], csCenter[0], 1e-6); - EXPECT_NEAR(centerMass[1], csCenter[1], 1e-6); - EXPECT_NEAR(centerMass[2], csCenter[2], 1e-6); - EXPECT_NEAR(centerMass[3], cartesianQuadrupole[Cqi::mass], 1e-6); - - // compare M2P results on a test target - { - Vec3 testTarget{-8, -8, -8}; - - Vec4 accM2P{0, 0, 0, 0}; - accM2P = ryoanji::M2P(accM2P, testTarget, util::makeVec3(centerMass), multipole); - // printf("test acceleration: %f %f %f %f\n", acc[0], acc[1], acc[2], acc[3]); - - // cstone is less precise - // float ax = 0; - // float ay = 0; - // float az = 0; - // cstone::multipole2particle( - // testTarget[0], testTarget[1], testTarget[2], cstoneMultipole, eps2, &ax, &ay, &az); - // printf("cstone test acceleration: %f %f %f\n", ax, ay, az); - - Vec4 accP2P{0, 0, 0, 0}; - for (int i = 0; i < numBodies; ++i) - { - accP2P = P2P(accP2P, testTarget, Vec3{x[i], y[i], z[i]}, m[i], 0.0, 0.0); - } - // printf("direct acceleration: %f %f %f\n", axd, ayd, azd); - - // compare ryoanji against the direct sum reference - EXPECT_NEAR(accM2P[0], accP2P[0], 3e-5); - EXPECT_NEAR(accM2P[1], accP2P[1], 1e-5); - EXPECT_NEAR(accM2P[2], accP2P[2], 1e-5); - EXPECT_NEAR(accM2P[3], accP2P[3], 1e-5); - } -} diff --git a/ryoanji/test/nbody/traversal_cpu.cpp b/ryoanji/test/nbody/traversal_cpu.cpp index aea8b12aa..2c40e69de 100644 --- a/ryoanji/test/nbody/traversal_cpu.cpp +++ b/ryoanji/test/nbody/traversal_cpu.cpp @@ -88,10 +88,6 @@ TEST(Gravity, TreeWalk) std::vector multipoles(octree.numNodes); computeLeafMultipoles(x, y, z, masses.data(), toInternal, layout.data(), centers.data(), multipoles.data()); upsweepMultipoles(octree.levelRange, octree.childOffsets.data(), centers.data(), multipoles.data()); - for (size_t i = 0; i < multipoles.size(); ++i) - { - multipoles[i] = ryoanji::normalize(multipoles[i]); - } T totalMass = std::accumulate(masses.begin(), masses.end(), 0.0); EXPECT_TRUE(std::abs(totalMass - multipoles[0][ryoanji::Cqi::mass]) < 1e-6); diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt new file mode 100644 index 000000000..21ce10e94 --- /dev/null +++ b/scripts/CMakeLists.txt @@ -0,0 +1,6 @@ +# temporary fix for hip build +if(NOT DEFINED CMAKE_INSTALL_BINDIR) + set(CMAKE_INSTALL_BINDIR "bin") +endif(NOT DEFINED CMAKE_INSTALL_BINDIR) + +install(PROGRAMS plot_power.py set_parms.py DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/scripts/plot_power.py b/scripts/plot_power.py new file mode 100755 index 000000000..8d5b9216b --- /dev/null +++ b/scripts/plot_power.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +""" +SPH-EXA +Copyright (c) 2024 CSCS, ETH Zurich + +Usage examples: + Run a simulation on a system that provides power counters (Cray) with profiling turned on + $ sphexa --init evrard --glass 50c.h5 -n 200 -s 10 --profile + $ plot_power.py profile.h5 +""" + +__author__ = "Sebastian Keller (sebastian.f.keller@gmail.com)" + +import numpy as np +import h5py +import sys + +import matplotlib +import matplotlib.pyplot as plt + +font = {'size': 16} +matplotlib.rc('font', **font) + + +def convertToIntervals(data): + """ Convert numRanks x numInterations x numMeasurmentsPerIterations points to intervals """ + return data[:, :, 1:] - data[:, :, :-1] + + +def convertBarToLine(data): + """ Convert a series bar plots data points to equivalent line plot data """ + return np.append(np.array([0]), np.repeat(data, 2))[:-1] + + +def removeZeros(data): + """ Remove all elements in axis 0 where the sum across axes 1 and 2 is zero """ + return data[np.sum(data, axis=(1, 2)) != 0.0] + + +def loadField(fname, what, stepnr): + """ Load /Step#stepnr/what and /Step#stepnr/what_timeStams from file and reshape into + numRanks x numIterations x numMeasurementsPerIteration + """ + h5file = h5py.File(fname, "r") + h5step = h5file["/Step#" + str(stepnr)] + numRanks = h5step.attrs["numRanks"][0] + numIterations = h5step.attrs["numIterations"][0] + raw = np.array(h5step[what]) + numPoints = len(raw) + ret = np.reshape(raw, (numRanks, numIterations, int(numPoints / numRanks / numIterations))) + return removeZeros(ret) + + +def loadEnergy(fname, what, stepnr): + """ Load what and what_timeStamps from file """ + + what_ts = what + "_timeStamps" + + dt = 1e-6 * loadField(fname, what_ts, stepnr) + # average across ranks + dt_avg = np.average(convertToIntervals(dt), axis=0) + # scan to get total time and repeat data points to get flat lines within intervals + timeline = convertBarToLine(np.cumsum(dt_avg.flatten())) + + energy = loadField(fname, what, stepnr) + # average across ranks + energy_avg = np.average(convertToIntervals(energy), axis=0) + power = (energy_avg / dt_avg) + # repeat to get flat lines within intervals + powerline = np.repeat(power.flatten(), 2) + + print("Loaded energy data \"{0}\" with dimensions".format(what), energy.shape, + "= (numCounters x numIterations x numMeasurementsPerIteration)") + + fields = {} + fields["x"] = timeline + fields["y"] = powerline + fields["numCounters"] = energy.shape[0] + + return fields + + +def plotModulePower(fname): + modulePower = loadEnergy(fname, "acc", 2) + numAccelerators = modulePower["numCounters"] + + fig = plt.figure(figsize=(15, 11)) + plt.title("SPH-EXA, power consumption, averaged over %d accelerators" % numAccelerators) + plt.xlabel("runtime [s]", fontsize=16) + plt.ylabel("average module power [W]", fontsize=16) + plt.plot(modulePower["x"], modulePower["y"], linewidth=2, color="g", label="module power") + plt.savefig("sphexa-accel-power.png", bbox_inches="tight") + + +def plotNodePower(fname): + nodePower = loadEnergy(fname, "node", 1) + numNodes = nodePower["numCounters"] + + fig = plt.figure(figsize=(15, 11)) + plt.title("SPH-EXA, power consumption, averaged over %d nodes" % numNodes) + plt.xlabel("runtime [s]", fontsize=16) + plt.ylabel("average node power [W]", fontsize=16) + + plt.axhline(y=np.max(nodePower["y"]), color="b", linestyle="--", linewidth=1, + label="%dW max measurement" % np.max(nodePower["y"])) + + simlabel = "simulation, (average power per node for a %d-node simulation)" % numNodes + plt.plot(nodePower["x"], nodePower["y"], linewidth=2, color="g", label=simlabel) + + plt.legend(loc="lower left") + plt.savefig("sphexa-node-power.png", bbox_inches="tight") + + +if __name__ == "__main__": + plotModulePower(sys.argv[1]) + plotNodePower(sys.argv[1]) diff --git a/scripts/spack/package.py b/scripts/spack/package.py deleted file mode 100644 index 231f6ec45..000000000 --- a/scripts/spack/package.py +++ /dev/null @@ -1,54 +0,0 @@ -# MIT License -# -# Copyright (c) 2021 CSCS, ETH Zurich -# 2021 University of Basel -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# ---------------------------------------------------------------------------- - -from spack import * - - -class SphExaMiniApp(CMakePackage, CudaPackage): - """ - The SPH-EXA mini-app is a scientific research code; its goal is to scale - the Smoothed Particle Hydrodynamics (SPH) method to enable Tier-0 and - Exascale simulations: https://hpc.dmi.unibas.ch/en/research/sph-exa/ - Example: spack install sph-exa-mini-app+cuda %gcc@9.3.0.21.05 - """ - homepage = "https://github.com/unibas-dmi-hpc/SPH-EXA_mini-app" - git = "https://github.com/unibas-dmi-hpc/SPH-EXA_mini-app.git" - maintainers = ['jgphpc', 'sebkelle1'] - version('develop', branch='develop', submodules=True, preferred=True) - depends_on('cmake@3.20.2:', type='build') - depends_on('mpi') - depends_on('cuda@10:', when='+cuda') - variant('mpi', default=True, description='Enable MPI support') - variant('cuda', default=False, description='Enable CUDA support') - sanity_check_is_file = [join_path('bin', 'sedov')] - - def cmake_args(self): - # Add arguments other than CMAKE_INSTALL_PREFIX and CMAKE_BUILD_TYPE - args = ['-DCMAKE_VERBOSE_MAKEFILE=ON'] - args.append(f'-DCMAKE_CXX_COMPILER={self.compiler.cxx}') - # -DUSE_CUDA set in src/sedov/CMakeLists.txt target_compile_definitions - if '+cuda' in self.spec: - sanity_check_is_file = [join_path('bin', 'sedov-cuda')] - - return args diff --git a/sph/include/sph/CMakeLists.txt b/sph/include/sph/CMakeLists.txt index 76d68beca..64ec7e99c 100644 --- a/sph/include/sph/CMakeLists.txt +++ b/sph/include/sph/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(hydro_turb) add_subdirectory(hydro_ve) set(TIMESTEP_GPU_KERNELS - groups.cu positions_gpu.cu ts_groups.cu update_h_gpu.cu + positions_gpu.cu ts_groups.cu update_h_gpu.cu ) if (CMAKE_HIP_COMPILER) diff --git a/sph/include/sph/groups.cu b/sph/include/sph/groups.cu deleted file mode 100644 index 06e231e22..000000000 --- a/sph/include/sph/groups.cu +++ /dev/null @@ -1,52 +0,0 @@ -/*! @file - * @brief Compute groups of target particles that a) fit in a warp and b) have compact bounding boxes - * - * @author Sebastian Keller - */ - -#include "thrust/device_vector.h" - -#include "cstone/traversal/find_neighbors.cuh" -#include "cstone/traversal/groups.cuh" - -#include "sph/sph_gpu.hpp" -#include "sph/particles_data.hpp" - -namespace sph -{ - -using cstone::GpuConfig; -using cstone::TravConfig; - -/*! @brief Compute groups of SFC-consecutive particles with compact bounding volumes - * - * @tparam Dataset - * @param[in] startIndex first particle index to include in groups - * @param[in] endIndex last particle index in include in groups - * @param[in] d Dataset with particle x,y,z,h arrays - * @param[in] box global coordinate bounding box - * @param[out] groups output particle groups - */ -template -void computeSpatialGroups(size_t startIndex, size_t endIndex, Dataset& d, - const cstone::Box& box, GroupData& groups) -{ - cstone::DeviceVector> S; - - float tolFactor = 2.0f; - cstone::computeGroupSplits(startIndex, endIndex, rawPtr(d.devData.x), rawPtr(d.devData.y), - rawPtr(d.devData.z), rawPtr(d.devData.h), d.treeView.leaves, - d.treeView.numLeafNodes, d.treeView.layout, box, tolFactor, S, - d.devData.traversalStack, groups.data); - - groups.firstBody = startIndex; - groups.lastBody = endIndex; - groups.numGroups = groups.data.size() - 1; - groups.groupStart = rawPtr(groups.data); - groups.groupEnd = rawPtr(groups.data) + 1; -} - -template void computeSpatialGroups(size_t, size_t, sphexa::ParticlesData&, - const cstone::Box&, GroupData&); - -} // namespace sph diff --git a/sph/include/sph/groups.hpp b/sph/include/sph/groups.hpp index e55eef642..c6e4b12b9 100644 --- a/sph/include/sph/groups.hpp +++ b/sph/include/sph/groups.hpp @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /*! @file * @brief Target particle group configuration * @@ -8,11 +32,28 @@ #include "cstone/sfc/box.hpp" #include "cstone/primitives/primitives_gpu.h" +#include "cstone/traversal/groups_gpu.h" #include "sph/sph_gpu.hpp" namespace sph { +template +void computeSpatialGroups(cstone::LocalIndex startIndex, cstone::LocalIndex endIndex, Dataset& d, + const cstone::Box& box, GroupData& groups) +{ + float tolFactor = 2.0f; + cstone::computeGroupSplits(startIndex, endIndex, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), + rawPtr(d.devData.h), d.treeView.leaves, d.treeView.numLeafNodes, d.treeView.layout, box, + nsGroupSize(), tolFactor, d.devData.traversalStack, groups.data); + + groups.firstBody = startIndex; + groups.lastBody = endIndex; + groups.numGroups = groups.data.size() - 1; + groups.groupStart = rawPtr(groups.data); + groups.groupEnd = rawPtr(groups.data) + 1; +} + //! @brief Compute spatial (=SFC-consecutive) groups of particles with compact bounding boxes template void computeGroups(size_t startIndex, size_t endIndex, Dataset& d, const cstone::Box& box, diff --git a/sph/include/sph/hydro_std/iad_gpu.cu b/sph/include/sph/hydro_std/iad_gpu.cu index 3d4b97310..b440a932f 100644 --- a/sph/include/sph/hydro_std/iad_gpu.cu +++ b/sph/include/sph/hydro_std/iad_gpu.cu @@ -31,7 +31,6 @@ */ #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/findneighbors.hpp" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" diff --git a/sph/include/sph/hydro_std/momentum_energy_gpu.cu b/sph/include/sph/hydro_std/momentum_energy_gpu.cu index 541283783..81d624857 100644 --- a/sph/include/sph/hydro_std/momentum_energy_gpu.cu +++ b/sph/include/sph/hydro_std/momentum_energy_gpu.cu @@ -29,15 +29,13 @@ * @author Sebastian Keller */ -#include - +#include "cstone/cuda/cub.hpp" #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/findneighbors.hpp" +#include "cstone/primitives/warpscan.cuh" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" #include "sph/particles_data.hpp" -#include "sph/util/device_math.cuh" #include "sph/hydro_std/momentum_energy_kern.hpp" namespace sph @@ -99,7 +97,7 @@ __global__ void cudaGradP(Tc K, Tc Kcour, unsigned ngmax, cstone::Box box, c T blockMin = reduce.Reduce(dt_i, cub::Min()); __syncthreads(); - if (threadIdx.x == 0) { atomicMinFloat(&minDt_device, blockMin); } + if (threadIdx.x == 0) { cstone::atomicMinFloat(&minDt_device, blockMin); } } template @@ -109,7 +107,7 @@ void computeMomentumEnergyStdGpu(const GroupView& grp, Dataset& d, const cstone: cstone::resetTraversalCounters<<<1, 1>>>(); float huge = 1e10; - checkGpuErrors(cudaMemcpyToSymbol(minDt_device, &huge, sizeof(huge))); + checkGpuErrors(cudaMemcpyToSymbol(GPU_SYMBOL(minDt_device), &huge, sizeof(huge))); cstone::resetTraversalCounters<<<1, 1>>>(); cudaGradP<<>>( @@ -123,7 +121,7 @@ void computeMomentumEnergyStdGpu(const GroupView& grp, Dataset& d, const cstone: checkGpuErrors(cudaGetLastError()); float minDt; - checkGpuErrors(cudaMemcpyFromSymbol(&minDt, minDt_device, sizeof(minDt))); + checkGpuErrors(cudaMemcpyFromSymbol(&minDt, GPU_SYMBOL(minDt_device), sizeof(minDt))); d.minDtCourant = minDt; } diff --git a/sph/include/sph/hydro_ve/av_switches_gpu.cu b/sph/include/sph/hydro_ve/av_switches_gpu.cu index 929ebdb18..90376677e 100644 --- a/sph/include/sph/hydro_ve/av_switches_gpu.cu +++ b/sph/include/sph/hydro_ve/av_switches_gpu.cu @@ -30,7 +30,6 @@ */ #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/findneighbors.hpp" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" diff --git a/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu b/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu index ad7384252..5f34ef5c1 100644 --- a/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu +++ b/sph/include/sph/hydro_ve/iad_divv_curlv_gpu.cu @@ -30,7 +30,6 @@ */ #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/findneighbors.hpp" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" diff --git a/sph/include/sph/hydro_ve/momentum_energy_gpu.cu b/sph/include/sph/hydro_ve/momentum_energy_gpu.cu index b86733368..2b81b0737 100644 --- a/sph/include/sph/hydro_ve/momentum_energy_gpu.cu +++ b/sph/include/sph/hydro_ve/momentum_energy_gpu.cu @@ -29,14 +29,13 @@ * @author Sebastian Keller */ -#include - +#include "cstone/cuda/cub.hpp" #include "cstone/cuda/cuda_utils.cuh" +#include "cstone/primitives/warpscan.cuh" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" #include "sph/particles_data.hpp" -#include "sph/util/device_math.cuh" #include "sph/hydro_ve/momentum_energy_kern.hpp" namespace sph @@ -114,7 +113,7 @@ __global__ void momentumEnergyGpu(Tc K, Tc Kcour, T Atmin, T Atmax, T ramp, unsi T blockMin = reduce.Reduce(dt_i, cub::Min()); __syncthreads(); - if (threadIdx.x == 0) { atomicMinFloat(&minDt_ve_device, blockMin); } + if (threadIdx.x == 0) { cstone::atomicMinFloat(&minDt_ve_device, blockMin); } } template @@ -124,7 +123,7 @@ void computeMomentumEnergy(const GroupView& grp, float* groupDt, Dataset& d, auto [traversalPool, nidxPool] = cstone::allocateNcStacks(d.devData.traversalStack, d.ngmax); float huge = 1e10; - checkGpuErrors(cudaMemcpyToSymbol(minDt_ve_device, &huge, sizeof(huge))); + checkGpuErrors(cudaMemcpyToSymbol(GPU_SYMBOL(minDt_ve_device), &huge, sizeof(huge))); cstone::resetTraversalCounters<<<1, 1>>>(); momentumEnergyGpu<<>>( @@ -140,7 +139,7 @@ void computeMomentumEnergy(const GroupView& grp, float* groupDt, Dataset& d, checkGpuErrors(cudaGetLastError()); float minDt; - checkGpuErrors(cudaMemcpyFromSymbol(&minDt, minDt_ve_device, sizeof(minDt))); + checkGpuErrors(cudaMemcpyFromSymbol(&minDt, GPU_SYMBOL(minDt_ve_device), sizeof(minDt))); d.minDtCourant = minDt; } diff --git a/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu b/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu index 38916b97d..b9f622480 100644 --- a/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu +++ b/sph/include/sph/hydro_ve/ve_def_gradh_gpu.cu @@ -30,7 +30,6 @@ */ #include "cstone/cuda/cuda_utils.cuh" -#include "cstone/findneighbors.hpp" #include "cstone/traversal/find_neighbors.cuh" #include "sph/sph_gpu.hpp" diff --git a/sph/include/sph/hydro_ve/xmass_gpu.cu b/sph/include/sph/hydro_ve/xmass_gpu.cu index cdbfa2115..c908fa2d6 100644 --- a/sph/include/sph/hydro_ve/xmass_gpu.cu +++ b/sph/include/sph/hydro_ve/xmass_gpu.cu @@ -1,8 +1,7 @@ /* * MIT License * - * Copyright (c) 2021 CSCS, ETH Zurich - * 2021 University of Basel + * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,6 +46,8 @@ using cstone::NcStats; using cstone::TravConfig; using cstone::TreeNodeIndex; +unsigned nsGroupSize() { return TravConfig::targetSize; } + namespace cuda { @@ -113,10 +114,10 @@ void computeXMass(const GroupView& grp, Dataset& d, const cstone::Box #include #include "cstone/cuda/cuda_utils.cuh" diff --git a/sph/include/sph/sph_gpu.hpp b/sph/include/sph/sph_gpu.hpp index c5f92f888..9298ba734 100644 --- a/sph/include/sph/sph_gpu.hpp +++ b/sph/include/sph/sph_gpu.hpp @@ -12,10 +12,6 @@ namespace sph using cstone::GroupData; using cstone::GroupView; -template -extern void computeSpatialGroups(size_t, size_t, Dataset& d, const cstone::Box&, - GroupData&); - template extern void computeIADGpu(const GroupView&, Dataset& d, const cstone::Box&); @@ -76,4 +72,7 @@ extern void groupAccTimestepGpu(float etaAcc, const GroupView&, const T* ax, con void storeRungGpu(const GroupView& grp, uint8_t rung, uint8_t* particleRungs); +//! @brief max number of particles per group used in neighbor search for SPH +unsigned nsGroupSize(); + } // namespace sph diff --git a/sph/include/sph/util/device_math.cuh b/sph/include/sph/util/device_math.cuh deleted file mode 100644 index e7a085495..000000000 --- a/sph/include/sph/util/device_math.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2022 CSCS, ETH Zurich - * 2022 University of Basel - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/*! @file - * @brief GPU specific math functions - * @author Sebastian Keller - */ - -#pragma once - -__device__ __forceinline__ float atomicMinFloat(float* addr, float value) -{ - float old; - old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) - : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); - - return old; -} \ No newline at end of file diff --git a/sph/include/sph/util/pinned_allocator.cuh b/sph/include/sph/util/pinned_allocator.cuh deleted file mode 100644 index 283c6ad94..000000000 --- a/sph/include/sph/util/pinned_allocator.cuh +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2008-2013 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! @file - * \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost - */ - -#pragma once - -#include -#include -#include - -#include - -/*! @brief pinned_allocator is a CUDA-specific host memory allocator - * that employs @c cudaMallocHost for allocation. - * - * @see http://www.sgi.com/tech/stl/Allocators.html - */ -template -class pinned_allocator; - -template<> -class pinned_allocator -{ -public: - typedef void value_type; - typedef void* pointer; - typedef const void* const_pointer; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - // convert a pinned_allocator to pinned_allocator - template - struct rebind - { - typedef pinned_allocator other; - }; // end rebind -}; // end pinned_allocator - -template -class pinned_allocator -{ -public: - //! \{ - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - //! \} - - // convert a pinned_allocator to pinned_allocator - template - struct rebind - { - typedef pinned_allocator other; - }; // end rebind - - /*! \p pinned_allocator's null constructor does nothing. - */ - inline pinned_allocator() {} - - /*! \p pinned_allocator's null destructor does nothing. - */ - inline ~pinned_allocator() {} - - /*! \p pinned_allocator's copy constructor does nothing. - */ - inline pinned_allocator(pinned_allocator const&) {} - - /*! This version of \p pinned_allocator's copy constructor - * is templated on the \c value_type of the \p pinned_allocator - * to copy from. It is provided merely for convenience; it - * does nothing. - */ - template - inline pinned_allocator(pinned_allocator const&) - { - } - - /*! This method returns the address of a \c reference of - * interest. - * - * \p r The \c reference of interest. - * \return \c r's address. - */ - inline pointer address(reference r) { return &r; } - - /*! This method returns the address of a \c const_reference - * of interest. - * - * \p r The \c const_reference of interest. - * \return \c r's address. - */ - inline const_pointer address(const_reference r) { return &r; } - - /*! This method allocates storage for objects in pinned host - * memory. - * - * \p cnt The number of objects to allocate. - * \return a \c pointer to the newly allocated objects. - * \note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. - */ - inline pointer allocate(size_type cnt, const_pointer = 0) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - - pointer result(0); - cudaError_t error = cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type)); - - if (error) { throw std::bad_alloc(); } // end if - - return result; - } // end allocate() - - /*! This method deallocates pinned host memory previously allocated - * with this \c pinned_allocator. - * - * \p p A \c pointer to the previously allocated memory. - * \p cnt The number of objects previously allocated at - * \p p. - * \note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - inline void deallocate(pointer p, size_type /*cnt*/) - { - cudaError_t error = cudaFreeHost(p); - - if (error) { throw std::runtime_error("cudaFreeHost error\n"); } // end if - } // end deallocate() - - /*! This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * \return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } // end max_size() - - /*! This method tests this \p pinned_allocator for equality to - * another. - * - * \param x The other \p pinned_allocator of interest. - * \return This method always returns \c true. - */ - inline bool operator==([[maybe_unused]] pinned_allocator const& x) const { return true; } - - /*! This method tests this \p pinned_allocator for inequality - * to another. - * - * \param x The other \p pinned_allocator of interest. - * \return This method always returns \c false. - */ - inline bool operator!=(pinned_allocator const& x) const { return !operator==(x); } -}; // end pinned_allocator - -/*! \} - */