Skip to content

Commit

Permalink
From array operations to loops.
Browse files Browse the repository at this point in the history
  • Loading branch information
p-costa committed Feb 16, 2025
1 parent f6ee877 commit 6c14885
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 190 deletions.
20 changes: 12 additions & 8 deletions src/main.f90
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,19 @@ program cans
dzci(k) = dzc(k)**(-1)
dzfi(k) = dzf(k)**(-1)
end do
!$acc kernels default(present) async
dzci_g(:) = dzc_g(:)**(-1)
dzfi_g(:) = dzf_g(:)**(-1)
!$acc end kernels
!$acc data copy(ng) async
!$acc parallel loop default(present) async
do k=0,ng(3)+1
dzci_g(k) = dzc_g(k)**(-1)
dzfi_g(k) = dzf_g(k)**(-1)
end do
!$acc end data
!$acc enter data create(grid_vol_ratio_c,grid_vol_ratio_f) async
!$acc kernels default(present) async
grid_vol_ratio_c(:) = dl(1)*dl(2)*dzc(:)/(l(1)*l(2)*l(3))
grid_vol_ratio_f(:) = dl(1)*dl(2)*dzf(:)/(l(1)*l(2)*l(3))
!$acc end kernels
!$acc parallel loop default(present) async
do k=0,n(3)+1
grid_vol_ratio_c(k) = dl(1)*dl(2)*dzc(k)/(l(1)*l(2)*l(3))
grid_vol_ratio_f(k) = dl(1)*dl(2)*dzf(k)/(l(1)*l(2)*l(3))
end do
!$acc update self(zc,zf,dzc,dzf,dzci,dzfi) async
!$acc exit data copyout(zc_g,zf_g,dzc_g,dzf_g,dzci_g,dzfi_g) async ! not needed on the device
!$acc wait
Expand Down
37 changes: 28 additions & 9 deletions src/mom.f90
Original file line number Diff line number Diff line change
Expand Up @@ -607,23 +607,42 @@ subroutine bulk_forcing(n,is_forced,f,u,v,w)
real(rp), intent(in ), dimension(3) :: f
real(rp), intent(inout), dimension(0:,0:,0:) :: u,v,w
real(rp) :: ff
integer :: i,j,k
if(is_forced(1)) then
ff = f(1)
!$acc kernels default(present) async(1)
u(1:n(1),1:n(2),1:n(3)) = u(1:n(1),1:n(2),1:n(3)) + ff
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
u(i,j,k) = u(i,j,k) + ff
end do
end do
end do
end if
if(is_forced(2)) then
ff = f(2)
!$acc kernels default(present) async(1)
v(1:n(1),1:n(2),1:n(3)) = v(1:n(1),1:n(2),1:n(3)) + ff
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
v(i,j,k) = v(i,j,k) + ff
end do
end do
end do
end if
if(is_forced(3)) then
ff = f(3)
!$acc kernels default(present) async(1)
w(1:n(1),1:n(2),1:n(3)) = w(1:n(1),1:n(2),1:n(3)) + ff
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
w(i,j,k) = w(i,j,k) + ff
end do
end do
end do
end if
end subroutine bulk_forcing
!
Expand Down
7 changes: 4 additions & 3 deletions src/output.f90
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ subroutine out1d(fname,ng,lo,hi,idir,l,dl,z_g,dz,p)
!
allocate(p1d(ng(idir)))
!$acc enter data create(p1d)
!$acc kernels default(present)
p1d(:) = 0._rp
!$acc end kernels
!$acc parallel loop default(present)
do k=1,size(p1d)
p1d(k) = 0._rp
end do
select case(idir)
case(3)
grid_area_ratio = dl(1)*dl(2)/(l(1)*l(2))
Expand Down
94 changes: 62 additions & 32 deletions src/rk.f90
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,17 @@ subroutine rk(rkpar,n,dli,dzci,dzfi,grid_vol_ratio_c,grid_vol_ratio_f,visc,dt,p,
allocate(dudtrko_t(n(1),n(2),n(3)),dvdtrko_t(n(1),n(2),n(3)),dwdtrko_t(n(1),n(2),n(3)))
!$acc enter data create(dudtrk_t ,dvdtrk_t ,dwdtrk_t ) async(1)
!$acc enter data create(dudtrko_t,dvdtrko_t,dwdtrko_t) async(1)
!$acc kernels default(present) async(1) ! not really necessary
dudtrko_t(:,:,:) = 0._rp
dvdtrko_t(:,:,:) = 0._rp
dwdtrko_t(:,:,:) = 0._rp
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1) ! not really necessary
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
dudtrko_t(i,j,k) = 0._rp
dvdtrko_t(i,j,k) = 0._rp
dwdtrko_t(i,j,k) = 0._rp
end do
end do
end do
if(is_impdiff) then
allocate(dudtrkd(n(1),n(2),n(3)),dvdtrkd(n(1),n(2),n(3)),dwdtrkd(n(1),n(2),n(3)))
!$acc enter data create(dudtrkd,dvdtrkd,dwdtrkd) async(1)
Expand All @@ -85,25 +91,33 @@ subroutine rk(rkpar,n,dli,dzci,dzfi,grid_vol_ratio_c,grid_vol_ratio_f,visc,dt,p,
if(is_fast_mom_kernels) then
call mom_xyz_ad(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,u,v,w,dudtrk,dvdtrk,dwdtrk,dudtrkd,dvdtrkd,dwdtrkd)
else
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
dudtrk(:,:,:) = 0._rp
dvdtrk(:,:,:) = 0._rp
dwdtrk(:,:,:) = 0._rp
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
dudtrk(i,j,k) = 0._rp
dvdtrk(i,j,k) = 0._rp
dwdtrk(i,j,k) = 0._rp
end do
end do
end do
if(.not.is_impdiff) then
call momx_d(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,u,dudtrk)
call momy_d(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,v,dvdtrk)
call momz_d(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,w,dwdtrk)
else
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
dudtrkd(:,:,:) = 0._rp
dvdtrkd(:,:,:) = 0._rp
dwdtrkd(:,:,:) = 0._rp
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
dudtrkd(i,j,k) = 0._rp
dvdtrkd(i,j,k) = 0._rp
dwdtrkd(i,j,k) = 0._rp
end do
end do
end do
if(.not.is_impdiff_1d) then
call momx_d(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,u,dudtrkd)
call momy_d(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,visc,v,dvdtrkd)
Expand Down Expand Up @@ -236,13 +250,17 @@ subroutine rk(rkpar,n,dli,dzci,dzfi,grid_vol_ratio_c,grid_vol_ratio_f,visc,dt,p,
call swap(dvdtrk,dvdtrko)
call swap(dwdtrk,dwdtrko)
!#if 0 /*pressure gradient term treated explicitly above */
! !$acc kernels
! !$OMP PARALLEL WORKSHARE
! dudtrk(:,:,:) = 0._rp
! dvdtrk(:,:,:) = 0._rp
! dwdtrk(:,:,:) = 0._rp
! !$OMP END PARALLEL WORKSHARE
! !$acc end kernels
! !$acc parallel loop collapse(3) default(present) async(1)
! !$OMP parallel do collapse(3) DEFAULT(shared)
! do k=1,n(3)
! do j=1,n(2)
! do i=1,n(1)
! dudtrk(i,j,k) = 0._rp
! dvdtrk(i,j,k) = 0._rp
! dwdtrk(i,j,k) = 0._rp
! end do
! end do
! end do
! call momx_p(n(1),n(2),n(3),dli(1),bforce(1),p,dudtrk)
! call momy_p(n(1),n(2),n(3),dli(2),bforce(2),p,dvdtrk)
! call momz_p(n(1),n(2),n(3),dzci ,bforce(3),p,dwdtrk)
Expand Down Expand Up @@ -336,9 +354,15 @@ subroutine rk_scal(rkpar,iscal,nscal,n,dli,l,dzci,dzfi,grid_vol_ratio_f,alpha,dt
end if
allocate(dsdtrk_t(iscal)%s(n(1),n(2),n(3)),dsdtrko_t(iscal)%s(n(1),n(2),n(3)))
!$acc enter data create(dsdtrk_t(iscal)%s,dsdtrko_t(iscal)%s) async(1)
!$acc kernels default(present) async(1) ! not really necessary
dsdtrko_t(iscal)%s(:,:,:) = 0._rp
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
dsdtrko_t(iscal)%s(i,j,k) = 0._rp
end do
end do
end do
dsdtrk(iscal)%s => dsdtrk_t(iscal)%s
dsdtrko(iscal)%s => dsdtrko_t(iscal)%s
!$acc enter data attach(dsdtrk(iscal)%s,dsdtrko(iscal)%s) async(1)
Expand All @@ -352,9 +376,15 @@ subroutine rk_scal(rkpar,iscal,nscal,n,dli,l,dzci,dzfi,grid_vol_ratio_f,alpha,dt
allocate(dsdtrkd(iscal)%s(0,0,0))
end if
!$acc enter data create(dsdtrkd(iscal)%s) async(1)
!$acc kernels default(present) async(1)
dsdtrkd(iscal)%s(:,:,:) = 0._rp
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,size(dsdtrkd(iscal)%s,3)
do j=1,size(dsdtrkd(iscal)%s,2)
do i=1,size(dsdtrkd(iscal)%s,1)
dsdtrkd(iscal)%s(i,j,k) = 0._rp
end do
end do
end do
end if
!
call scal(n(1),n(2),n(3),dli(1),dli(2),dzci,dzfi,alpha,u,v,w,s,dsdtrk(iscal)%s, &
Expand Down
78 changes: 58 additions & 20 deletions src/sanity.f90
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
real(rp), dimension(3) :: dl
real(rp) :: dt,dti,alpha
real(rp) :: divtot,divmax,resmax
integer :: i,j,k
logical :: passed,passed_loc
passed = .true.
!$acc wait
Expand All @@ -242,6 +243,7 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
rhsbx(n(2),n(3),0:1), &
rhsby(n(1),n(3),0:1), &
rhsbz(n(1),n(2),0:1))
!$acc enter data copyin(n,n_z)
!
! initialize velocity below with some random noise
!
Expand Down Expand Up @@ -281,11 +283,17 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
if(is_impdiff .and. .not.is_impdiff_1d) then
allocate(bb(n_z(3)))
alpha = acos(-1.) ! irrelevant
!$acc kernels default(present)
u(:,:,:) = 0.
v(:,:,:) = 0.
w(:,:,:) = 0.
!$acc end kernels
!$acc parallel loop collapse(3) default(present)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=0,n(3)+1
do j=0,n(2)+1
do i=0,n(1)+1
u(i,j,k) = 0.
v(i,j,k) = 0.
w(i,j,k) = 0.
end do
end do
end do
!$acc update self(u,v,w)
call add_noise(ng,lo,123,.5_rp,u(1:n(1),1:n(2),1:n(3)))
call add_noise(ng,lo,456,.5_rp,v(1:n(1),1:n(2),1:n(3)))
Expand All @@ -299,11 +307,21 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
call set_cufft_wspace(pack(arrplan,.true.),acc_get_cuda_stream(1))
#endif
call bounduvw(cbcvel,n,bcvel,nb,is_bound,.false.,dl,dzc,dzf,u,v,w)
!$acc kernels default(present)
u(:,:,:) = u(:,:,:)*alpha
p(:,:,:) = u(:,:,:)
bb(:) = b(:) + alpha
!$acc end kernels
!$acc parallel loop collapse(3) default(present)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=0,n(3)+1
do j=0,n(2)+1
do i=0,n(1)+1
u(i,j,k) = u(i,j,k)*alpha
p(i,j,k) = u(i,j,k)
end do
end do
end do
!$acc parallel loop default(present)
!$OMP parallel do DEFAULT(shared)
do k=1,n_z(3)
bb(k) = b(k) + alpha
end do
call updt_rhs_b(['f','c','c'],cbcvel(:,:,1),n,is_bound,rhsbx,rhsby,rhsbz,u)
call solver(n,ng,arrplan,normfft,lambdaxy,a,bb,c,cbcvel(:,:,1),['f','c','c'],u)
call fftend(arrplan)
Expand All @@ -321,11 +339,21 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
call set_cufft_wspace(pack(arrplan,.true.),acc_get_cuda_stream(1))
#endif
call bounduvw(cbcvel,n,bcvel,nb,is_bound,.false.,dl,dzc,dzf,u,v,w)
!$acc kernels default(present)
v(:,:,:) = v(:,:,:)*alpha
p(:,:,:) = v(:,:,:)
bb(:) = b(:) + alpha
!$acc end kernels
!$acc parallel loop collapse(3) default(present)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=0,n(3)+1
do j=0,n(2)+1
do i=0,n(1)+1
v(i,j,k) = v(i,j,k)*alpha
p(i,j,k) = v(i,j,k)
end do
end do
end do
!$acc parallel loop default(present)
!$OMP parallel do DEFAULT(shared)
do k=1,n_z(3)
bb(k) = b(k) + alpha
end do
call updt_rhs_b(['c','f','c'],cbcvel(:,:,2),n,is_bound,rhsbx,rhsby,rhsbz,v)
call solver(n,ng,arrplan,normfft,lambdaxy,a,bb,c,cbcvel(:,:,2),['c','f','c'],v)
call fftend(arrplan)
Expand All @@ -343,11 +371,21 @@ subroutine test_sanity_solver(ng,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,dli,dzc,d
call set_cufft_wspace(pack(arrplan,.true.),acc_get_cuda_stream(1))
#endif
call bounduvw(cbcvel,n,bcvel,nb,is_bound,.false.,dl,dzc,dzf,u,v,w)
!$acc kernels default(present)
w(:,:,:) = w(:,:,:)*alpha
p(:,:,:) = w(:,:,:)
bb(:) = b(:) + alpha
!$acc end kernels
!$acc parallel loop collapse(3) default(present)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=0,n(3)+1
do j=0,n(2)+1
do i=0,n(1)+1
w(i,j,k) = w(i,j,k)*alpha
p(i,j,k) = w(i,j,k)
end do
end do
end do
!$acc parallel loop default(present)
!$OMP parallel do DEFAULT(shared)
do k=1,n_z(3)
bb(k) = b(k) + alpha
end do
call updt_rhs_b(['c','c','f'],cbcvel(:,:,3),n,is_bound,rhsbx,rhsby,rhsbz,w)
call solver(n,ng,arrplan,normfft,lambdaxy,a,bb,c,cbcvel(:,:,3),['c','c','f'],w)
call fftend(arrplan)
Expand Down
15 changes: 10 additions & 5 deletions src/scal.f90
Original file line number Diff line number Diff line change
Expand Up @@ -302,12 +302,17 @@ subroutine bulk_forcing_s(n,is_forced,ff,p)
logical , intent(in ) :: is_forced
real(rp), intent(in ) :: ff
real(rp), intent(inout), dimension(0:,0:,0:) :: p
integer :: i,j,k
if(is_forced) then
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = p(1:n(1),1:n(2),1:n(3)) + ff
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP parallel do collapse(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
p(i,j,k) = p(i,j,k) + ff
end do
end do
end do
end if
end subroutine bulk_forcing_s
!
Expand Down
13 changes: 8 additions & 5 deletions src/solve_helmholtz.f90
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ subroutine solve_helmholtz(n,ng,hi,arrplan,normfft,alpha,lambdaxy,a,b,c,rhsbx,rh
real(rp), intent(inout), dimension(:,:,:) :: p
real(rp), allocatable, dimension(:), save :: bb
real(rp) :: alphai
integer :: k
!
logical, save :: is_first = .true.
!
Expand All @@ -58,12 +59,14 @@ subroutine solve_helmholtz(n,ng,hi,arrplan,normfft,alpha,lambdaxy,a,b,c,rhsbx,rh
end if
!
call updt_rhs_b(c_or_f,cbc,n,is_bound,rhsbx,rhsby,rhsbz,p)
!
alphai = alpha**(-1)
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
bb(:) = b(:) + alphai
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc parallel loop default(present) async(1)
!$OMP PARALLEL DO DEFAULT(shared)
do k=1,size(b)
bb(k) = b(k) + alphai
end do
!
if(.not.is_impdiff_1d) then
call solver(n,ng,arrplan,normfft*alphai,lambdaxy,a,bb,c,cbc,c_or_f,p)
else
Expand Down
Loading

0 comments on commit 6c14885

Please sign in to comment.