Add support for cuFFTMp for running on arbitrary global grid dimensions.
semi-h committed Jan 27, 2025
1 parent 41a7ac1 commit d007dcc
Showing 3 changed files with 73 additions and 29 deletions.
33 changes: 26 additions & 7 deletions src/case/base_case.f90
@@ -129,19 +129,36 @@ subroutine print_enstrophy(self, u, v, w)
class(field_t), intent(in) :: u, v, w

class(field_t), pointer :: du, dv, dw
class(field_t), pointer :: f_out
real(dp) :: enstrophy
integer :: ierr, dims(3)

du => self%solver%backend%allocator%get_block(DIR_X, VERT)
dv => self%solver%backend%allocator%get_block(DIR_X, VERT)
dw => self%solver%backend%allocator%get_block(DIR_X, VERT)

call self%solver%curl(du, dv, dw, u, v, w)
enstrophy = 0.5_dp*(self%solver%backend%scalar_product(du, du) &
+ self%solver%backend%scalar_product(dv, dv) &
+ self%solver%backend%scalar_product(dw, dw)) &
/self%solver%ngrid

dims = self%solver%mesh%get_dims(VERT)

f_out => self%solver%host_allocator%get_block(DIR_C)

call self%solver%backend%get_field_data(f_out%data, du)
enstrophy = norm2(f_out%data(1:dims(1), 1:dims(2), 1:dims(3)))**2
call self%solver%backend%get_field_data(f_out%data, dv)
enstrophy = enstrophy &
+ norm2(f_out%data(1:dims(1), 1:dims(2), 1:dims(3)))**2
call self%solver%backend%get_field_data(f_out%data, dw)
enstrophy = enstrophy &
+ norm2(f_out%data(1:dims(1), 1:dims(2), 1:dims(3)))**2

enstrophy = 0.5_dp*enstrophy/self%solver%ngrid
call MPI_Allreduce(MPI_IN_PLACE, enstrophy, 1, MPI_DOUBLE_PRECISION, &
MPI_SUM, MPI_COMM_WORLD, ierr)
if (self%solver%mesh%par%is_root()) print *, 'enstrophy:', enstrophy

call self%solver%host_allocator%release_block(f_out)

call self%solver%backend%allocator%release_block(du)
call self%solver%backend%allocator%release_block(dv)
call self%solver%backend%allocator%release_block(dw)
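
For reference, the quantity assembled above is the volume-averaged enstrophy: each norm2(...)**2 term contributes the local sum of squares of one curl component, and the MPI_SUM reduction completes the global sum, assuming self%solver%ngrid is the global number of vertex points:

    \mathcal{E} = \frac{1}{2\, n_{\mathrm{grid}}} \sum_{\mathrm{ranks}} \sum_{i,j,k} \left( \omega_x^2 + \omega_y^2 + \omega_z^2 \right)_{ijk}, \qquad \boldsymbol{\omega} = \nabla \times \mathbf{u},

with the components of \boldsymbol{\omega} held in du, dv and dw.
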
@@ -158,7 +175,7 @@ subroutine print_div_max_mean(self, u, v, w)
class(field_t), pointer :: div_u
class(field_t), pointer :: u_out
real(dp) :: div_u_max, div_u_mean
integer :: ierr
integer :: ierr, dims(3)

div_u => self%solver%backend%allocator%get_block(DIR_Z)

@@ -169,8 +186,10 @@

call self%solver%backend%allocator%release_block(div_u)

div_u_max = maxval(abs(u_out%data))
div_u_mean = sum(abs(u_out%data))/self%solver%ngrid
dims = self%solver%mesh%get_dims(div_u%data_loc)
div_u_max = maxval(abs(u_out%data(1:dims(1), 1:dims(2), 1:dims(3))))
div_u_mean = sum(abs(u_out%data(1:dims(1), 1:dims(2), 1:dims(3)))) &
/self%solver%ngrid

call self%solver%host_allocator%release_block(u_out)
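
With the slices restricted to dims, any padded region of the host block stays out of the reductions, so the two diagnostics amount to

    \mathrm{div}_{\max} = \max_{i,j,k} \bigl| (\nabla \cdot \mathbf{u})_{ijk} \bigr|, \qquad \mathrm{div}_{\mathrm{mean}} = \frac{1}{n_{\mathrm{grid}}} \sum_{i,j,k} \bigl| (\nabla \cdot \mathbf{u})_{ijk} \bigr|,

taken here over the interior points local to each rank.
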

20 changes: 20 additions & 0 deletions src/cuda/kernels/spectral_processing.f90
@@ -7,6 +7,26 @@ module m_cuda_spectral

contains

attributes(global) subroutine memcpy3D(dst, src, nx, ny, nz)
!! Copy data between x3d2 padded arrays and cuFFTMp descriptors
implicit none

real(dp), device, intent(inout), dimension(:, :, :) :: dst
real(dp), device, intent(in), dimension(:, :, :) :: src
integer, value, intent(in) :: nx, ny, nz

integer :: i, j, k

j = threadIdx%x + (blockIdx%x - 1)*blockDim%x !ny
k = blockIdx%y !nz

if (j <= ny) then
do i = 1, nx
dst(i, j, k) = src(i, j, k)
end do
end if
end subroutine memcpy3D
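
A host-side launch sketch for this kernel; demo_memcpy3D_launch is a hypothetical helper, not part of the commit, and mirrors the configuration fft_forward_cuda uses below: 16 threads per block along j and one block row per k plane.

  subroutine demo_memcpy3D_launch(dst_d, src_d, nx, ny, nz)
    !! Hypothetical helper, for illustration only.
    use cudafor
    use m_common, only: dp
    use m_cuda_spectral, only: memcpy3D
    implicit none

    real(dp), device, intent(inout) :: dst_d(:, :, :)
    real(dp), device, intent(in) :: src_d(:, :, :)
    integer, intent(in) :: nx, ny, nz

    type(dim3) :: blocks, threads

    ! ceil(ny/16) blocks along j, one block per k plane, 16 threads each
    blocks = dim3((ny - 1)/16 + 1, nz, 1)
    threads = dim3(16, 1, 1)
    call memcpy3D<<<blocks, threads>>>(dst_d, src_d, nx, ny, nz)
  end subroutine demo_memcpy3D_launch
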

attributes(global) subroutine process_spectral_div_u( &
div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, &
ax, bx, ay, by, az, bz &
49 changes: 27 additions & 22 deletions src/cuda/poisson_fft.f90
@@ -7,13 +7,13 @@ module m_cuda_poisson_fft
use mpi

use m_allocator, only: field_t
use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, CELL
use m_common, only: dp, DIR_C, CELL
use m_mesh, only: mesh_t
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: dirps_t

use m_cuda_allocator, only: cuda_field_t
use m_cuda_spectral, only: process_spectral_div_u
use m_cuda_spectral, only: process_spectral_div_u, memcpy3D

implicit none

@@ -130,26 +130,30 @@ subroutine fft_forward_cuda(self, f)
class(cuda_poisson_fft_t) :: self
class(field_t), intent(in) :: f

real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)
real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :)

type(cudaXtDesc), pointer :: descriptor

integer :: ierr
integer :: tsize, ierr
type(dim3) :: blocks, threads

select type (f)
type is (cuda_field_t)
flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
padded_dev => f%data_d
end select

call c_f_pointer(self%xtdesc%descriptor, descriptor)
call c_f_pointer(descriptor%data(1), d_dev, &
[self%nx_loc + 2, self%ny_loc*self%nz_loc])
ierr = cudaMemcpy2D(d_dev, self%nx_loc + 2, flat_dev, self%nx_loc, &
self%nx_loc, self%ny_loc*self%nz_loc)
if (ierr /= 0) then
print *, 'cudaMemcpy2D error code: ', ierr
error stop 'cudaMemcpy2D failed'
end if
[self%nx_loc + 2, self%ny_loc, self%nz_loc])

! tsize is different from SZ because here we work on a 3D Cartesian
! data structure and are free to specify any suitable thread/block size.
tsize = 16
blocks = dim3((self%ny_loc - 1)/tsize + 1, self%nz_loc, 1)
threads = dim3(tsize, 1, 1)

call memcpy3D<<<blocks, threads>>>(d_dev, padded_dev, & !&
self%nx_loc, self%ny_loc, self%nz_loc)

ierr = cufftXtExecDescriptor(self%plan3D_fw, self%xtdesc, self%xtdesc, &
CUFFT_FORWARD)
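
The descriptor's leading extent of nx_loc + 2 is consistent with the usual in-place real-to-complex layout: a transform of length nx yields nx/2 + 1 complex coefficients, which occupy nx + 2 reals per line for even nx, so memcpy3D copies only the first nx_loc reals of each line and leaves the two padding entries to cuFFTMp.

    2 \left( \frac{n_x}{2} + 1 \right) = n_x + 2 \quad (\text{even } n_x), \qquad n_x = 64 \;\Rightarrow\; 33 \text{ complex coefficients} = 66 \text{ reals per line}.
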
@@ -167,11 +171,12 @@ subroutine fft_backward_cuda(self, f)
class(cuda_poisson_fft_t) :: self
class(field_t), intent(inout) :: f

real(dp), device, pointer :: flat_dev(:, :), d_dev(:, :, :)
real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :)

type(cudaXtDesc), pointer :: descriptor

integer :: ierr
integer :: tsize, ierr
type(dim3) :: blocks, threads

ierr = cufftXtExecDescriptor(self%plan3D_bw, self%xtdesc, self%xtdesc, &
CUFFT_INVERSE)
@@ -182,18 +187,18 @@

select type (f)
type is (cuda_field_t)
flat_dev(1:self%nx_loc, 1:self%ny_loc*self%nz_loc) => f%data_d
padded_dev => f%data_d
end select

call c_f_pointer(self%xtdesc%descriptor, descriptor)
call c_f_pointer(descriptor%data(1), d_dev, &
[self%nx_loc + 2, self%ny_loc*self%nz_loc])
ierr = cudaMemcpy2D(flat_dev, self%nx_loc, d_dev, self%nx_loc + 2, &
self%nx_loc, self%ny_loc*self%nz_loc)
if (ierr /= 0) then
print *, 'cudaMemcpy2D error code: ', ierr
error stop 'cudaMemcpy2D failed'
end if
[self%nx_loc + 2, self%ny_loc, self%nz_loc])

tsize = 16
blocks = dim3((self%ny_loc - 1)/tsize + 1, self%nz_loc, 1)
threads = dim3(tsize, 1, 1)
call memcpy3D<<<blocks, threads>>>(padded_dev, d_dev, & !&
self%nx_loc, self%ny_loc, self%nz_loc)

end subroutine fft_backward_cuda
