Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/fake-pr-to-test-gitlab-ci' into …
Browse files Browse the repository at this point in the history
…fake-pr-to-test-gitlab-ci
  • Loading branch information
Jacques Xing committed Jan 13, 2025
2 parents a25cba9 + f9adb64 commit d4e8666
Show file tree
Hide file tree
Showing 26 changed files with 615 additions and 319 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ stages:
- apt-get update
- apt-get install -y ccache
- apt-get install -y cmake
- apt-get install -y git
- apt-get install -y gcc
- apt-get install -y infiniband-diags ibverbs-utils
- apt-get install -y libibverbs-dev libfabric1 libfabric-dev libpsm2-dev
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set(CUDASRC
cuda/exec_dist.f90
cuda/exec_thom.f90
cuda/kernels/distributed.f90
cuda/kernels/fieldops.f90
cuda/kernels/reorder.f90
cuda/kernels/spectral_processing.f90
cuda/kernels/thomas.f90
Expand Down
4 changes: 2 additions & 2 deletions src/allocator.f90
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module m_allocator
use iso_fortran_env, only: stderr => error_unit

use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C, none, VERT
use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C, NULL_LOC, VERT
use m_mesh, only: mesh_t
use m_field, only: field_t

Expand Down Expand Up @@ -141,7 +141,7 @@ function get_block(self, dir, data_loc) result(handle)
if (present(data_loc)) then
handle%data_loc = data_loc
else
handle%data_loc = none
handle%data_loc = NULL_LOC
end if

! Set dims based on direction
Expand Down
23 changes: 20 additions & 3 deletions src/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ module m_base_backend
procedure(sum_intox), deferred :: sum_zintox
procedure(vecadd), deferred :: vecadd
procedure(scalar_product), deferred :: scalar_product
procedure(field_ops), deferred :: field_scale
procedure(field_ops), deferred :: field_shift
procedure(copy_data_to_f), deferred :: copy_data_to_f
procedure(copy_f_to_data), deferred :: copy_f_to_data
procedure(alloc_tdsops), deferred :: alloc_tdsops
Expand Down Expand Up @@ -145,6 +147,20 @@ real(dp) function scalar_product(self, x, y) result(s)
end function scalar_product
end interface

abstract interface
subroutine field_ops(self, f, a)
!! Scales or shifts a field by a
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(in) :: f
real(dp), intent(in) :: a
end subroutine field_ops
end interface

abstract interface
subroutine copy_data_to_f(self, f, data)
!! Copy the specialist data structure from device or host back
Expand Down Expand Up @@ -174,8 +190,8 @@ end subroutine copy_f_to_data
end interface

abstract interface
subroutine alloc_tdsops(self, tdsops, dir, operation, scheme, n_halo, &
from_to, bc_start, bc_end, sym, c_nu, nu0_nu)
subroutine alloc_tdsops(self, tdsops, dir, operation, scheme, bc_start, &
bc_end, n_halo, from_to, sym, c_nu, nu0_nu)
import :: base_backend_t
import :: dp
import :: tdsops_t
Expand All @@ -185,8 +201,9 @@ subroutine alloc_tdsops(self, tdsops, dir, operation, scheme, n_halo, &
class(tdsops_t), allocatable, intent(inout) :: tdsops
integer, intent(in) :: dir
character(*), intent(in) :: operation, scheme
integer, intent(in) :: bc_start, bc_end
integer, optional, intent(in) :: n_halo
character(*), optional, intent(in) :: from_to, bc_start, bc_end
character(*), optional, intent(in) :: from_to
logical, optional, intent(in) :: sym
real(dp), optional, intent(in) :: c_nu, nu0_nu
end subroutine alloc_tdsops
Expand Down
11 changes: 9 additions & 2 deletions src/common.f90
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ module m_common
X_EDGE = 1000, & ! Data on edges along X
Y_EDGE = 0100, & ! Data on edges along Y
Z_EDGE = 0010, & ! Data on edges along Z
none = -0001 ! The location of data isn't specified
integer, parameter :: BC_PERIODIC = 0, BC_NEUMANN = 1, BC_DIRICHLET = 2
NULL_LOC = -0001 ! The location of data isn't specified
integer, parameter :: BC_PERIODIC = 0, BC_NEUMANN = 1, BC_DIRICHLET = 2, &
BC_HALO = -1
integer, protected :: &
rdr_map(4, 4) = reshape([0, RDR_Y2X, RDR_Z2X, RDR_C2X, &
RDR_X2Y, 0, RDR_Z2Y, RDR_C2Y, &
Expand All @@ -47,4 +48,10 @@ pure integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir)
rdr_dir = rdr_map(dir_from, dir_to)
end function get_rdr_from_dirs

integer function move_data_loc(in_data_loc, dir, move) result(out_data_loc)
integer, intent(in) :: in_data_loc, dir, move

out_data_loc = in_data_loc + move*(10**dir)
end function move_data_loc

end module m_common
22 changes: 22 additions & 0 deletions src/cuda/allocator.f90
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ module m_cuda_allocator
real(dp), device, pointer, private :: p_data_d(:)
real(dp), device, pointer, contiguous :: data_d(:, :, :)
contains
procedure :: fill => fill_cuda
procedure :: get_shape => get_shape_cuda
procedure :: set_shape => set_shape_cuda
end type cuda_field_t

Expand All @@ -39,6 +41,26 @@ function cuda_field_init(ngrid, next, id) result(f)
f%id = id
end function cuda_field_init

subroutine fill_cuda(self, c)
implicit none

class(cuda_field_t) :: self
real(dp), intent(in) :: c

self%p_data_d(:) = c

end subroutine fill_cuda

function get_shape_cuda(self) result(dims)
implicit none

class(cuda_field_t) :: self
integer :: dims(3)

dims = shape(self%data_d)

end function get_shape_cuda

subroutine set_shape_cuda(self, dims)
implicit none

Expand Down
106 changes: 79 additions & 27 deletions src/cuda/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ module m_cuda_backend

use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, &
use m_common, only: dp, move_data_loc, &
RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, &
DIR_X, DIR_Y, DIR_Z, DIR_C, VERT
DIR_X, DIR_Y, DIR_Z, DIR_C, VERT, NULL_LOC
use m_mesh, only: mesh_t
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: dirps_t, tdsops_t, get_tds_n
Expand All @@ -20,10 +20,12 @@ module m_cuda_backend
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
use m_cuda_tdsops, only: cuda_tdsops_t
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_reorder, only: &
reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
reorder_z2y, reorder_c2x, reorder_x2c, &
sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
use m_cuda_kernels_fieldops, only: axpby, buffer_copy, field_scale, &
field_shift, scalar_product
use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
reorder_y2z, reorder_z2x, reorder_z2y, &
reorder_c2x, reorder_x2c, &
sum_yintox, sum_zintox

implicit none

Expand Down Expand Up @@ -51,6 +53,8 @@ module m_cuda_backend
procedure :: sum_zintox => sum_zintox_cuda
procedure :: vecadd => vecadd_cuda
procedure :: scalar_product => scalar_product_cuda
procedure :: field_scale => field_scale_cuda
procedure :: field_shift => field_shift_cuda
procedure :: copy_data_to_f => copy_data_to_f_cuda
procedure :: copy_f_to_data => copy_f_to_data_cuda
procedure :: init_poisson_fft => init_cuda_poisson_fft
Expand Down Expand Up @@ -126,17 +130,18 @@ function init(mesh, allocator) result(backend)
end function init

subroutine alloc_cuda_tdsops( &
self, tdsops, dir, operation, scheme, &
n_halo, from_to, bc_start, bc_end, sym, c_nu, nu0_nu &
self, tdsops, dir, operation, scheme, bc_start, bc_end, &
n_halo, from_to, sym, c_nu, nu0_nu &
)
implicit none

class(cuda_backend_t) :: self
class(tdsops_t), allocatable, intent(inout) :: tdsops
integer, intent(in) :: dir
character(*), intent(in) :: operation, scheme
integer, intent(in) :: bc_start, bc_end
integer, optional, intent(in) :: n_halo
character(*), optional, intent(in) :: from_to, bc_start, bc_end
character(*), optional, intent(in) :: from_to
logical, optional, intent(in) :: sym
real(dp), optional, intent(in) :: c_nu, nu0_nu
integer :: tds_n
Expand All @@ -148,9 +153,8 @@ subroutine alloc_cuda_tdsops( &
type is (cuda_tdsops_t)
tds_n = get_tds_n(self%mesh, dir, from_to)
delta = self%mesh%geo%d(dir)
tdsops = cuda_tdsops_t(tds_n, delta, operation, &
scheme, n_halo, from_to, &
bc_start, bc_end, sym, c_nu, nu0_nu)
tdsops = cuda_tdsops_t(tds_n, delta, operation, scheme, bc_start, &
bc_end, n_halo, from_to, sym, c_nu, nu0_nu)
end select

end subroutine alloc_cuda_tdsops
Expand Down Expand Up @@ -250,6 +254,10 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, &
der1st_sym, der1st, der2nd_sym, dirps%dir, &
blocks, threads)

call du%set_data_loc(u%data_loc)
call dv%set_data_loc(v%data_loc)
call dw%set_data_loc(w%data_loc)

end subroutine transeq_cuda_dist

subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dir)
Expand Down Expand Up @@ -284,16 +292,17 @@ subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dir)

end subroutine transeq_halo_exchange

subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, &
subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, &
u_recv_s_dev, u_recv_e_dev, &
conv_recv_s_dev, conv_recv_e_dev, &
tdsops_du, tdsops_dud, tdsops_d2u, &
dir, blocks, threads)
!! Computes RHS_x^u following:
!!
!! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2
!! Computes RHS_x^u following:
!!
!! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2
class(cuda_backend_t) :: self
real(dp), device, dimension(:, :, :), intent(inout) :: rhs_dev
!> The result field, it is also used as temporary storage
real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev
real(dp), device, dimension(:, :, :), intent(in) :: u_dev, conv_dev
real(dp), device, dimension(:, :, :), intent(in) :: &
u_recv_s_dev, u_recv_e_dev, &
Expand All @@ -302,25 +311,22 @@ subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, &
integer, intent(in) :: dir
type(dim3), intent(in) :: blocks, threads

class(field_t), pointer :: du, dud, d2u
class(field_t), pointer :: dud, d2u

real(dp), device, pointer, dimension(:, :, :) :: &
du_dev, dud_dev, d2u_dev
real(dp), device, pointer, dimension(:, :, :) :: dud_dev, d2u_dev

! Get some fields for storing the intermediate results
du => self%allocator%get_block(dir, VERT)
dud => self%allocator%get_block(dir, VERT)
d2u => self%allocator%get_block(dir, VERT)
dud => self%allocator%get_block(dir)
d2u => self%allocator%get_block(dir)

call resolve_field_t(du_dev, du)
call resolve_field_t(dud_dev, dud)
call resolve_field_t(d2u_dev, d2u)

call exec_dist_transeq_3fused( &
rhs_dev, &
rhs_du_dev, &
u_dev, u_recv_s_dev, u_recv_e_dev, &
conv_dev, conv_recv_s_dev, conv_recv_e_dev, &
du_dev, dud_dev, d2u_dev, &
dud_dev, d2u_dev, &
self%du_send_s_dev, self%du_send_e_dev, &
self%du_recv_s_dev, self%du_recv_e_dev, &
self%dud_send_s_dev, self%dud_send_e_dev, &
Expand All @@ -333,7 +339,6 @@ subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, &
)

! Release temporary blocks
call self%allocator%release_block(du)
call self%allocator%release_block(dud)
call self%allocator%release_block(d2u)

Expand Down Expand Up @@ -369,6 +374,10 @@ subroutine tds_solve_cuda(self, du, u, tdsops)

blocks = dim3(self%mesh%get_n_groups(u), 1, 1); threads = dim3(SZ, 1, 1)

if (u%data_loc /= NULL_LOC) then
call du%set_data_loc(move_data_loc(u%data_loc, u%dir, tdsops%move))
end if

call tds_solve_dist(self, du, u, tdsops, blocks, threads)

end subroutine tds_solve_cuda
Expand Down Expand Up @@ -538,6 +547,9 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
error stop 'Reorder direction is undefined.'
end select

! reorder keeps the data_loc the same
call u_o%set_data_loc(u_i%data_loc)

end subroutine reorder_cuda

subroutine sum_yintox_cuda(self, u, u_y)
Expand Down Expand Up @@ -654,6 +666,46 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)

end subroutine copy_into_buffers

subroutine field_scale_cuda(self, f, a)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(in) :: f
real(dp), intent(in) :: a

real(dp), device, pointer, dimension(:, :, :) :: f_d
type(dim3) :: blocks, threads
integer :: n

call resolve_field_t(f_d, f)

n = size(f_d, dim=2)
blocks = dim3(size(f_d, dim=3), 1, 1)
threads = dim3(SZ, 1, 1)
call field_scale<<<blocks, threads>>>(f_d, a, n) !&

end subroutine field_scale_cuda

subroutine field_shift_cuda(self, f, a)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(in) :: f
real(dp), intent(in) :: a

real(dp), device, pointer, dimension(:, :, :) :: f_d
type(dim3) :: blocks, threads
integer :: n

call resolve_field_t(f_d, f)

n = size(f_d, dim=2)
blocks = dim3(size(f_d, dim=3), 1, 1)
threads = dim3(SZ, 1, 1)
call field_shift<<<blocks, threads>>>(f_d, a, n) !&

end subroutine field_shift_cuda

subroutine copy_data_to_f_cuda(self, f, data)
class(cuda_backend_t), intent(inout) :: self
class(field_t), intent(inout) :: f
Expand Down
Loading

0 comments on commit d4e8666

Please sign in to comment.