diff --git a/Makefile b/Makefile index b66fbc7..3ae4dc2 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ else ifeq ($(CMP),nvhpc) FC = mpif90 FFLAGS += -cpp -O3 -march=native FFLAGS += -Minfo=accel -stdpar -acc -target=multicore -# FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -fast -O3 -Minstrument + FFLAGS = -cpp -Mfree -Kieee -Minfo=accel,ftn,inline,loop,vect,opt,stdpar -stdpar=gpu -gpu=cc70,managed,lineinfo,deepcopy -acc -target=gpu -traceback -O3 -Minstrument -g LFLAGS += -acc -lnvhpcwrapnvtx endif @@ -70,9 +70,9 @@ SRC = $(SRCDIR)/x3d_precision.f90 $(SRCDIR)/module_param.f90 $(SRCDIR)/time_inte ifeq ($(FFT),fftw3) #FFTW3_PATH=/usr #FFTW3_PATH=/usr/lib64 - FFTW3_PATH=/usr/local/Cellar/fftw/3.3.7_1 + FFTW3_PATH=/opt/software/builder/developers/libraries/fftw/3.3.8/1/gcc-10.2.0-openmpi-4.0.5 INC=-I$(FFTW3_PATH)/include - LIBFFT=-L$(FFTW3_PATH) -lfftw3 -lfftw3f + LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f else ifeq ($(FFT),fftw3_f03) FFTW3_PATH=/usr #ubuntu # apt install libfftw3-dev #FFTW3_PATH=/usr/lib64 #fedora # dnf install fftw fftw-devel diff --git a/src/thomas.f90 b/src/thomas.f90 index a0a2fb6..48ab70b 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -27,6 +27,11 @@ module thomas module procedure zthomas_12 end interface zthomas + interface thomas1d + module procedure thomas1d_0 + module procedure thomas1d_12 + end interface thomas1d + contains @@ -181,7 +186,9 @@ end subroutine zthomas_12 ! fw, in, used during the backward step ! nn, in, size of the vector ! - pure subroutine thomas1d(tt, ff, fs, fw, nn) + pure subroutine thomas1d_12(tt, ff, fs, fw, nn) + + !$acc routine seq implicit none @@ -199,6 +206,28 @@ pure subroutine thomas1d(tt, ff, fs, fw, nn) tt(k) = (tt(k)-ff(k)*tt(k+1)) * fw(k) enddo - end subroutine thomas1d + end subroutine thomas1d_12 + + pure subroutine thomas1d_0(tt, ff, fs, fw, perio, alfa, nn) + + !$acc routine seq + + implicit none + + integer, intent(in) :: nn + real(mytype), intent(inout), dimension(nn) :: tt + real(mytype), intent(in), dimension(nn) :: ff, fs, fw, perio + real(mytype), intent(in) :: alfa + + integer :: k + real(mytype) :: ss + + call thomas1d_12(tt, ff, fs, fw, nn) + ss = (tt(1)-alfa*tt(nn)) / (one + perio(1) - alfa*perio(nn)) + do k = 1, nn + tt(k) = tt(k) - ss*perio(k) + enddo + + end subroutine thomas1d_0 end module thomas diff --git a/src/transeq.f90 b/src/transeq.f90 index b1078c4..215833f 100644 --- a/src/transeq.f90 +++ b/src/transeq.f90 @@ -75,11 +75,17 @@ subroutine momentum_rhs_eq(dux1,duy1,duz1,ux1,uy1,uz1) enddo call derx (td1,ta1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "td1", minval(td1), maxval(td1) call derx (te1,tb1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "te1", minval(te1), maxval(te1) call derx (tf1,tc1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "tf1", minval(tf1), maxval(tf1) call derx (ta1,ux1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "ta1", minval(ta1), maxval(ta1) call derx (tb1,uy1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "tb1", minval(tb1), maxval(tb1) call derx (tc1,uz1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "tc1", minval(tc1), maxval(tc1) ! Convective terms of x-pencil are stored in tg1,th1,ti1 do concurrent (k=1:xsize(3), j=1:xsize(2), i=1:xsize(1)) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 808ae55..9c93987 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -183,6 +183,8 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -191,26 +193,61 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - - do concurrent (k=1:nz, j=1:ny) - ! Compute r.h.s. - tx(1,j,k) = afix*(ux(2,j,k)-ux(nx,j,k)) & - + bfix*(ux(3,j,k)-ux(nx-1,j,k)) - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & - + bfix*(ux(4,j,k)-ux(nx,j,k)) - do concurrent (i=3:nx-2) - tx(i,j,k) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & - + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) - enddo - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & - + bfix*(ux(1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & - + bfix*(ux(2,j,k)-ux(nx-2,j,k)) - +#if 1 + real(mytype), dimension(nx) :: buffer + + ! This is working (with deepcopy) + !$acc parallel loop gang vector collapse(2) private(buffer) + do k = 1, nz + do j = 1, ny + ! Compute r.h.s. + buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + + bfix*(ux(3,j,k)-ux(nx-1,j,k)) + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + + bfix*(ux(4,j,k)-ux(nx,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) + enddo + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + + bfix*(ux(1,j,k)-ux(nx-3,j,k)) + buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + + bfix*(ux(2,j,k)-ux(nx-2,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo enddo - - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + enddo + !$acc end parallel loop +#else + real(mytype), dimension(nx) :: buffer + + ! This is broken (with deepcopy) + do concurrent (k=1:nz, j=1:ny) local(buffer) + ! Compute r.h.s. + buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + + bfix*(ux(3,j,k)-ux(nx-1,j,k)) + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + + bfix*(ux(4,j,k)-ux(nx,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) + enddo + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + + bfix*(ux(1,j,k)-ux(nx-3,j,k)) + buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + + bfix*(ux(2,j,k)-ux(nx-2,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo +#endif end subroutine derx_00 @@ -222,6 +259,8 @@ subroutine derx_ij(tx,ux,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -230,48 +269,52 @@ subroutine derx_ij(tx,ux,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - tx(1,j,k) = zero - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & + buffer(1) = zero + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + bfix*(ux(4,j,k)-ux(2,j,k)) else - tx(1,j,k) = afix*(ux(2,j,k)+ux(2,j,k)) & + buffer(1) = afix*(ux(2,j,k)+ux(2,j,k)) & + bfix*(ux(3,j,k)+ux(3,j,k)) - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + bfix*(ux(4,j,k)+ux(2,j,k)) endif else - tx(1,j,k) = af1x*ux(1,j,k) + bf1x*ux(2,j,k) + cf1x*ux(3,j,k) - tx(2,j,k) = af2x*(ux(3,j,k)-ux(1,j,k)) + buffer(1) = af1x*ux(1,j,k) + bf1x*ux(2,j,k) + cf1x*ux(3,j,k) + buffer(2) = af2x*(ux(3,j,k)-ux(1,j,k)) endif do concurrent (i=3:nx-2) - tx(i,j,k) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) enddo ! nx-1 <= i <= nx if (ncln==1) then if (npaire==1) then - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + bfix*(ux(nx-1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = zero + buffer(nx) = zero else - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + bfix*((-ux(nx-1,j,k))-ux(nx-3,j,k)) - tx(nx,j,k) = afix*((-ux(nx-1,j,k))-ux(nx-1,j,k)) & + buffer(nx) = afix*((-ux(nx-1,j,k))-ux(nx-1,j,k)) & + bfix*((-ux(nx-2,j,k))-ux(nx-2,j,k)) endif else - tx(nx-1,j,k) = afmx*(ux(nx,j,k)-ux(nx-2,j,k)) - tx(nx,j,k) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k) + buffer(nx-1) = afmx*(ux(nx,j,k)-ux(nx-2,j,k)) + buffer(nx) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k) endif - enddo - ! Solve tri-diagonal system - call xthomas(tx, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derx_ij @@ -345,6 +388,8 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -354,40 +399,44 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp - ! Compute r.h.s. - do concurrent (k=1:nz) - do concurrent (i=1:nx) - ty(i,1,k) = afjy*(uy(i,2,k)-uy(i,ny,k)) & - + bfjy*(uy(i,3,k)-uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)-uy(i,ny,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + pp(j) = x3dop%periodic(j) + end do + + do concurrent (k=1:nz, i=1:nx) local(buffer) + ! Compute r.h.s. + buffer(1) = afjy*(uy(i,2,k)-uy(i,ny,k)) & + + bfjy*(uy(i,3,k)-uy(i,ny-1,k)) + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)-uy(i,ny,k)) + do concurrent (j=3:ny-2) + buffer(j) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + bfjy*(uy(i,j+2,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*(uy(i,1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = afjy*(uy(i,1,k)-uy(i,ny-1,k)) & - + bfjy*(uy(i,2,k)-uy(i,ny-2,k)) - enddo - enddo - - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*(uy(i,1,k)-uy(i,ny-3,k)) + buffer(ny) = afjy*(uy(i,1,k)-uy(i,ny-1,k)) & + + bfjy*(uy(i,2,k)-uy(i,ny-2,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + + ! Apply stretching if needed + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif - ! Apply stretching if needed - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo - endif + enddo end subroutine dery_00 @@ -399,6 +448,8 @@ subroutine dery_ij(ty,uy,ff,fs,fw,ppy,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -407,79 +458,60 @@ subroutine dery_ij(ty,uy,ff,fs,fw,ppy,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)-uy(i,2,k)) - enddo + buffer(1) = zero + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)-uy(i,2,k)) else - do concurrent (i=1:nx) - ty(i,1,k) = afjy*(uy(i,2,k)+uy(i,2,k)) & - + bfjy*(uy(i,3,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)+uy(i,2,k)) - enddo + buffer(1) = afjy*(uy(i,2,k)+uy(i,2,k)) & + + bfjy*(uy(i,3,k)+uy(i,3,k)) + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)+uy(i,2,k)) endif else - do concurrent (i=1:nx) - ty(i,1,k) = af1y*uy(i,1,k)+bf1y*uy(i,2,k)+cf1y*uy(i,3,k) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = af2y*(uy(i,3,k)-uy(i,1,k)) - enddo + buffer(1) = af1y*uy(i,1,k)+bf1y*uy(i,2,k)+cf1y*uy(i,3,k) + buffer(2) = af2y*(uy(i,3,k)-uy(i,1,k)) endif - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + do concurrent (j=3:ny-2) + buffer(j) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + bfjy*(uy(i,j+2,k)-uy(i,j-2,k)) enddo if (ncln==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*(uy(i,ny-1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = zero - enddo + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*(uy(i,ny-1,k)-uy(i,ny-3,k)) + buffer(ny) = zero else - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*((-uy(i,ny-1,k))-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = afjy*((-uy(i,ny-1,k))-uy(i,ny-1,k)) & - + bfjy*((-uy(i,ny-2,k))-uy(i,ny-2,k)) - enddo + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*((-uy(i,ny-1,k))-uy(i,ny-3,k)) + buffer(ny) = afjy*((-uy(i,ny-1,k))-uy(i,ny-1,k)) & + + bfjy*((-uy(i,ny-2,k))-uy(i,ny-2,k)) endif else - do concurrent (i=1:nx) - ty(i,ny-1,k) = afmy*(uy(i,ny,k)-uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = -afny*uy(i,ny,k)-bfny*uy(i,ny-1,k)-cfny*uy(i,ny-2,k) - enddo + buffer(ny-1) = afmy*(uy(i,ny,k)-uy(i,ny-2,k)) + buffer(ny) = -afny*uy(i,ny,k)-bfny*uy(i,ny-1,k)-cfny*uy(i,ny-2,k) endif - enddo - ! Solve tri-diagonal system - call ythomas(ty, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, ny) + + ! Apply stretching if needed + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif - ! Apply stretching if needed - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) + do concurrent(j=1:ny) + ty(i,j,k) = buffer(j) enddo - endif + enddo end subroutine dery_ij @@ -555,6 +587,8 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx,ny,nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -563,6 +597,7 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -571,30 +606,34 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) return endif - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = afkz*(uz(i,j,2)-uz(i,j,nz )) & + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + + do concurrent (j=1:ny, i=1:nx) local(buffer) + ! Compute r.h.s. + buffer(1) = afkz*(uz(i,j,2)-uz(i,j,nz )) & + bfkz*(uz(i,j,3)-uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1 )) & + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1 )) & + bfkz*(uz(i,j,4)-uz(i,j,nz)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & - + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*(uz(i,j,nz)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & + + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) + enddo + buffer(nz-1) = afkz*(uz(i,j,nz)-uz(i,j,nz-2)) & + bfkz*(uz(i,j,1 )-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = afkz*(uz(i,j,1)-uz(i,j,nz-1)) & + buffer(nz) = afkz*(uz(i,j,1)-uz(i,j,nz-1)) & + bfkz*(uz(i,j,2)-uz(i,j,nz-2)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derz_00 @@ -606,6 +645,8 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -614,6 +655,7 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -622,70 +664,51 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) return endif - ! Compute r.h.s. - if (ncl1==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + do concurrent (j=1:ny, i=1:nx) local(buffer) + ! Compute r.h.s. + if (ncl1==1) then + if (npaire==1) then + buffer(1) = zero + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + bfkz*(uz(i,j,4)-uz(i,j,2)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = afkz*(uz(i,j,2)+uz(i,j,2)) & + else + buffer(1) = afkz*(uz(i,j,2)+uz(i,j,2)) & + bfkz*(uz(i,j,3)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + bfkz*(uz(i,j,4)+uz(i,j,2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = af1z*uz(i,j,1) + bf1z*uz(i,j,2) & + endif + else + buffer(1) = af1z*uz(i,j,1) + bf1z*uz(i,j,2) & + cf1z*uz(i,j,3) + buffer(2) = af2z*(uz(i,j,3)-uz(i,j,1)) + endif + do concurrent (k=3:nz-2) + buffer(k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & + + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = af2z*(uz(i,j,3)-uz(i,j,1)) - enddo - endif - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & - + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) - enddo - if (ncln==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*(uz(i,j,nz )-uz(i,j,nz-2)) & + if (ncln==1) then + if (npaire==1) then + buffer(nz-1) = afkz*(uz(i,j,nz )-uz(i,j,nz-2)) & + bfkz*(uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = zero - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*( uz(i,j,nz )-uz(i,j,nz-2)) & + buffer(nz) = zero + else + buffer(nz-1) = afkz*( uz(i,j,nz )-uz(i,j,nz-2)) & + bfkz*(-uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = afkz*(-uz(i,j,nz-1)-uz(i,j,nz-1)) & + buffer(nz) = afkz*(-uz(i,j,nz-1)-uz(i,j,nz-1)) & + bfkz*(-uz(i,j,nz-2)-uz(i,j,nz-2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afmz*(uz(i,j,nz)-uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = - afnz*uz(i,j,nz) - bfnz*uz(i,j,nz-1) & + endif + else + buffer(nz-1) = afmz*(uz(i,j,nz)-uz(i,j,nz-2)) + buffer(nz) = - afnz*uz(i,j,nz) - bfnz*uz(i,j,nz-1) & - cfnz*uz(i,j,nz-2) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derz_ij @@ -757,6 +780,8 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -765,10 +790,18 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do ! Compute r.h.s. - do concurrent (k=1:nz, j=1:ny) - tx(1,j,k) = asix*(ux(2,j,k)-ux(1 ,j,k) & + do concurrent (k=1:nz, j=1:ny) local(buffer) + buffer(1) = asix*(ux(2,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx ,j,k)) & + bsix*(ux(3,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx-1,j,k)) & @@ -776,7 +809,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(1,j,k)+ux(nx-2,j,k)) & + dsix*(ux(5,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx-3,j,k)) - tx(2,j,k) = asix*(ux(3,j,k)-ux(2 ,j,k) & + buffer(2) = asix*(ux(3,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(1 ,j,k)) & + bsix*(ux(4,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(nx ,j,k)) & @@ -784,7 +817,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(2,j,k)+ux(nx-1,j,k)) & + dsix*(ux(6,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(nx-2,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3 ,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(2 ,j,k)) & + bsix*(ux(5,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(1 ,j,k)) & @@ -792,7 +825,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(3,j,k)+ux(nx,j,k)) & + dsix*(ux(7,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(nx-1,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4 ,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(3 ,j,k)) & + bsix*(ux(6,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -801,7 +834,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) + dsix*(ux(8,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(nx,j,k)) do concurrent (i=5:nx-4) - tx(i,j,k) = asix*(ux(i+1,j,k)-ux(i ,j,k) & + buffer(i) = asix*(ux(i+1,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-1,j,k)) & + bsix*(ux(i+2,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-2,j,k)) & @@ -810,7 +843,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) + dsix*(ux(i+4,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-4,j,k)) enddo - tx(nx-3,j,k) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -818,7 +851,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(ux(1 ,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -826,7 +859,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(ux(2 ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(ux(1 ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -834,7 +867,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(ux(3 ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = asix*(ux(1 ,j,k)-ux(nx ,j,k) & + buffer(nx ) = asix*(ux(1 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-1,j,k)) & + bsix*(ux(2 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-2,j,k)) & @@ -842,10 +875,13 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx,j,k)+ux(nx-3,j,k)) & + dsix*(ux(4 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derxx_00 @@ -857,6 +893,8 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -865,13 +903,14 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - tx(1,j,k) = asix*(ux(2,j,k)-ux(1,j,k) & + buffer(1) = asix*(ux(2,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(2,j,k)) & + bsix*(ux(3,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(3,j,k)) & @@ -879,7 +918,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(1,j,k)+ux(4,j,k)) & + dsix*(ux(5,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(5,j,k)) - tx(2,j,k) = asix*(ux(3,j,k)-ux(2,j,k) & + buffer(2) = asix*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) & + bsix*(ux(4,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(2,j,k)) & @@ -887,7 +926,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(2,j,k)+ux(3,j,k)) & + dsix*(ux(6,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(4,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bsix*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) & @@ -895,7 +934,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(3,j,k)+ux(2,j,k)) & + dsix*(ux(7,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(3,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bsix*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -904,8 +943,8 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) + dsix*(ux(8,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) else - tx(1,j,k) = zero - tx(2,j,k) = asix*(ux(3,j,k)-ux(2,j,k) & + buffer(1) = zero + buffer(2) = asix*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) & + bsix*(ux(4,j,k)-ux(2,j,k) & -ux(2,j,k)-ux(2,j,k)) & @@ -913,7 +952,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(2,j,k)-ux(3,j,k)) & + dsix*(ux(6,j,k)-ux(2,j,k) & -ux(2,j,k)-ux(4,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bsix*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) & @@ -921,7 +960,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(3,j,k)-ux(2,j,k)) & + dsix*(ux(7,j,k)-ux(3,j,k) & -ux(3,j,k)-ux(3,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bsix*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -931,15 +970,15 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(4,j,k)-ux(2,j,k)) endif else - tx(1,j,k) = as1x*ux(1,j,k) + bs1x*ux(2,j,k) & + buffer(1) = as1x*ux(1,j,k) + bs1x*ux(2,j,k) & + cs1x*ux(3,j,k) + ds1x*ux(4,j,k) - tx(2,j,k) = as2x*(ux(3,j,k)-ux(2,j,k) & + buffer(2) = as2x*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) - tx(3,j,k) = as3x*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = as3x*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bs3x*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) - tx(4,j,k) = as4x*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = as4x*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bs4x*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -947,7 +986,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(4,j,k)+ux(1,j,k)) endif do concurrent (i=5:nx-4) - tx(i,j,k) = asix*(ux(i+1,j,k)-ux(i ,j,k) & + buffer(i) = asix*(ux(i+1,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-1,j,k)) & + bsix*(ux(i+2,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-2,j,k)) & @@ -958,7 +997,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo if (ncln == 1) then if (npaire==1) then - tx(nx-3,j,k) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -966,7 +1005,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -974,7 +1013,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(ux(nx-2,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -982,7 +1021,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(ux(nx-3,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = asix*(ux(nx-1,j,k)-ux(nx ,j,k) & + buffer(nx ) = asix*(ux(nx-1,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-1,j,k)) & + bsix*(ux(nx-2,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-2,j,k)) & @@ -991,7 +1030,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) + dsix*(ux(nx-4,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-4,j,k)) else - tx(nx-3,j,k) = asix*( ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*( ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*( ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -999,7 +1038,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(-ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*( ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*( ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*( ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -1007,7 +1046,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(-ux(nx-2,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*( ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*( ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(-ux(nx-1,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -1015,28 +1054,31 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(-ux(nx-3,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = zero + buffer(nx ) = zero endif else - tx(nx-3,j,k) = asttx*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asttx*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsttx*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & + csttx*(ux(nx,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-6,j,k)) - tx(nx-2,j,k) = astx*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = astx*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bstx*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) - tx(nx-1,j,k) = asmx*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asmx*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) - tx(nx ,j,k) = asnx*ux(nx ,j,k) + bsnx*ux(nx-1,j,k) & + buffer(nx ) = asnx*ux(nx ,j,k) + bsnx*ux(nx-1,j,k) & + csnx*ux(nx-2,j,k) + dsnx*ux(nx-3,j,k) endif - enddo - ! Solve tri-diagonal system - call xthomas(tx, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derxx_ij @@ -1108,6 +1150,8 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -1116,51 +1160,51 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do ! Compute r.h.s. - do concurrent (k=1:nz) - do concurrent (i=1:nx) - ty(i,1,k) = asjy*(uy(i,2,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny,k)) & - + bsjy*(uy(i,3,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-1,k)) & - + csjy*(uy(i,4,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-2,k)) & - + dsjy*(uy(i,5,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny-1,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,ny,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,ny,k)) - enddo - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = asjy*(uy(i,j+1,k)-uy(i,j,k) & + do concurrent (k=1:nz, i=1:nx) local(buffer) + buffer(1) = asjy*(uy(i,2,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny,k)) & + + bsjy*(uy(i,3,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-1,k)) & + + csjy*(uy(i,4,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-2,k)) & + + dsjy*(uy(i,5,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-3,k)) + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny-1,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny-2,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,ny,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,ny-1,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,ny,k)) + do concurrent (j=5:ny-4) + buffer(j) = asjy*(uy(i,j+1,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-1,k)) & + bsjy*(uy(i,j+2,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-2,k)) & @@ -1169,51 +1213,46 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) + dsjy*(uy(i,j+4,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(uy(i,1 ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(uy(i,1 ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(uy(i,2 ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(uy(i,1 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(uy(i,2 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(uy(i,3 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asjy*(uy(i,1 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-1,k)) & - + bsjy*(uy(i,2 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-2,k)) & - + csjy*(uy(i,3 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-3,k)) & - + dsjy*(uy(i,4 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-4,k)) + buffer(ny-3) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(uy(i,1 ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(uy(i,1 ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(uy(i,2 ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(uy(i,1 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(uy(i,2 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(uy(i,3 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = asjy*(uy(i,1 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-1,k)) & + + bsjy*(uy(i,2 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-2,k)) & + + csjy*(uy(i,3 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-3,k)) & + + dsjy*(uy(i,4 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - end subroutine deryy_00 !******************************************************************** @@ -1224,6 +1263,8 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -1232,112 +1273,89 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer - ! Compute r.h.s. - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) + ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,1,k) = asjy*(uy(i,2,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,2,k)) & - + bsjy*(uy(i,3,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,3,k)) & - + csjy*(uy(i,4,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,4,k)) & - + dsjy*(uy(i,5,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,5,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,2,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,3,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) - enddo + buffer(1) = asjy*(uy(i,2,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,2,k)) & + + bsjy*(uy(i,3,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,3,k)) & + + csjy*(uy(i,4,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,4,k)) & + + dsjy*(uy(i,5,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,5,k)) + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,2,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,3,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,4,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,3,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) else - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,2,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,3,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)-uy(i,2,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)-uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)-uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)-uy(i,2,k)) - enddo + buffer(1) = zero + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,2,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,3,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,4,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)-uy(i,2,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)-uy(i,3,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)-uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)-uy(i,2,k)) endif else - do concurrent (i=1:nx) - ty(i,1,k) = as1y*uy(i,1,k) + bs1y*uy(i,2,k) & - + cs1y*uy(i,3,k) + ds1y*uy(i,4,k) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = as2y*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = as3y*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bs3y*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = as4y*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4 ,k)+uy(i,3,k)) & - + bs4y*(uy(i,6,k)-uy(i,4 ,k) & - -uy(i,4 ,k)+uy(i,2,k)) & - + cs4y*(uy(i,7,k)-uy(i,4 ,k) & - -uy(i,4 ,k)+uy(i,1,k)) - enddo + buffer(1) = as1y*uy(i,1,k) + bs1y*uy(i,2,k) & + + cs1y*uy(i,3,k) + ds1y*uy(i,4,k) + buffer(2) = as2y*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) + buffer(3) = as3y*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bs3y*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) + buffer(4) = as4y*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4 ,k)+uy(i,3,k)) & + + bs4y*(uy(i,6,k)-uy(i,4 ,k) & + -uy(i,4 ,k)+uy(i,2,k)) & + + cs4y*(uy(i,7,k)-uy(i,4 ,k) & + -uy(i,4 ,k)+uy(i,1,k)) endif - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = asjy*(uy(i,j+1,k)-uy(i,j ,k) & + do concurrent (j=5:ny-4) + buffer(j) = asjy*(uy(i,j+1,k)-uy(i,j ,k) & -uy(i,j ,k)+uy(i,j-1,k)) & + bsjy*(uy(i,j+2,k)-uy(i,j ,k) & -uy(i,j ,k)+uy(i,j-2,k)) & @@ -1348,107 +1366,88 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo if (ncln==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(uy(i,ny-2,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(uy(i,ny-2,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(uy(i,ny-3,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asjy*(uy(i,ny-1,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-1,k)) & - + bsjy*(uy(i,ny-2,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-2,k)) & - + csjy*(uy(i,ny-3,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-3,k)) & - + dsjy*(uy(i,ny-4,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-4,k)) - enddo + buffer(ny-3) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(uy(i,ny-2,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(uy(i,ny-2,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(uy(i,ny-3,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = asjy*(uy(i,ny-1,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-1,k)) & + + bsjy*(uy(i,ny-2,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-2,k)) & + + csjy*(uy(i,ny-3,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-3,k)) & + + dsjy*(uy(i,ny-4,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-4,k)) else - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*( uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*( uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(-uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(-uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*( uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*( uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(-uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(-uy(i,ny-2,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*( uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(-uy(i,ny-1,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(-uy(i,ny-2,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(-uy(i,ny-3,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - ty(i,ny ,k) = zero - enddo + buffer(ny-3) = asjy*( uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*( uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(-uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(-uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*( uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*( uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(-uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(-uy(i,ny-2,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*( uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(-uy(i,ny-1,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(-uy(i,ny-2,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(-uy(i,ny-3,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = zero endif else - do concurrent (i=1:nx) - ty(i,ny-3,k) = astty*(uy(i,ny-2,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-4,k)) & - + bstty*(uy(i,ny-1,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-5,k)) & - + cstty*(uy(i,ny,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asty*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsty*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asmy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asny*uy(i,ny ,k) + bsny*uy(i,ny-1,k) & - + csny*uy(i,ny-2,k) + dsny*uy(i,ny-3,k) - enddo + buffer(ny-3) = astty*(uy(i,ny-2,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-4,k)) & + + bstty*(uy(i,ny-1,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-5,k)) & + + cstty*(uy(i,ny,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-6,k)) + buffer(ny-2) = asty*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsty*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) + buffer(ny-1) = asmy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) + buffer(ny ) = asny*uy(i,ny ,k) + bsny*uy(i,ny-1,k) & + + csny*uy(i,ny-2,k) + dsny*uy(i,ny-3,k) endif - enddo - ! Solve tri-diagonal system - call ythomas(ty, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) + enddo + enddo end subroutine deryy_ij @@ -1520,6 +1519,8 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1528,6 +1529,14 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp + + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1536,9 +1545,9 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) return endif - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = askz*(uz(i,j,2)-uz(i,j,1 ) & + do concurrent (j=1:ny, i=1:nx) local(buffer) + ! Compute r.h.s. + buffer(1) = askz*(uz(i,j,2)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz )) & + bskz*(uz(i,j,3)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz-1)) & @@ -1546,9 +1555,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,1)+uz(i,j,nz-2)) & + dskz*(uz(i,j,5)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2 ) & + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,1 )) & + bskz*(uz(i,j,4)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,nz)) & @@ -1556,9 +1563,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,2)+uz(i,j,nz-1)) & + dskz*(uz(i,j,6)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3 ) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,2 )) & + bskz*(uz(i,j,5)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,1 )) & @@ -1566,9 +1571,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,3)+uz(i,j,nz)) & + dskz*(uz(i,j,7)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4 ) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,3 )) & + bskz*(uz(i,j,6)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,2 )) & @@ -1576,19 +1579,17 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,4)+uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,nz)) - enddo - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-1)) & - + bskz*(uz(i,j,k+2)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-2)) & - + cskz*(uz(i,j,k+3)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-3)) & - + dskz*(uz(i,j,k+4)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & + do concurrent (k=5:nz-4) + buffer(k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-1)) & + + bskz*(uz(i,j,k+2)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-2)) & + + cskz*(uz(i,j,k+3)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-3)) & + + dskz*(uz(i,j,k+4)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-4)) + enddo + buffer(nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*(uz(i,j,nz-1 )-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1596,7 +1597,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(uz(i,j,1 )-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - tz(i,j,nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1604,7 +1605,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(uz(i,j,2 )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - tz(i,j,nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(uz(i,j,1 )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1612,7 +1613,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(uz(i,j,3 )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - tz(i,j,nz ) = askz*(uz(i,j,1 )-uz(i,j,nz ) & + buffer(nz ) = askz*(uz(i,j,1 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-1)) & + bskz*(uz(i,j,2 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-2)) & @@ -1620,10 +1621,13 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz)+uz(i,j,nz-3)) & + dskz*(uz(i,j,4 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derzz_00 @@ -1635,6 +1639,8 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1643,6 +1649,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1650,11 +1657,11 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo endif - ! Compute r.h.s. - if (ncl1==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = askz*(uz(i,j,2)-uz(i,j,1) & + do concurrent (j=1:ny, i=1:nx) local(buffer) + ! Compute r.h.s. + if (ncl1==1) then + if (npaire==1) then + buffer(1) = askz*(uz(i,j,2)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,2)) & + bskz*(uz(i,j,3)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,3)) & @@ -1662,9 +1669,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,1)+uz(i,j,4)) & + dskz*(uz(i,j,5)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2) & + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) & + bskz*(uz(i,j,4)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,2)) & @@ -1672,9 +1677,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,2)+uz(i,j,3)) & + dskz*(uz(i,j,6)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bskz*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) & @@ -1682,9 +1685,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,3)+uz(i,j,2)) & + dskz*(uz(i,j,7)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,3)) & + bskz*(uz(i,j,6)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) & @@ -1692,13 +1693,9 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,4)+uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2) & + else + buffer(1) = zero + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) & + bskz*(uz(i,j,4)-uz(i,j,2) & -uz(i,j,2)-uz(i,j,2)) & @@ -1706,9 +1703,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,2)-uz(i,j,3)) & + dskz*(uz(i,j,6)-uz(i,j,2) & -uz(i,j,2)-uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bskz*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) & @@ -1716,9 +1711,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,3)-uz(i,j,2)) & + dskz*(uz(i,j,7)-uz(i,j,3) & -uz(i,j,3)-uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,3)) & + bskz*(uz(i,j,6)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) & @@ -1726,46 +1719,36 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,4)-uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4) & -uz(i,j,4)-uz(i,j,2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = as1z*uz(i,j,1) + bs1z*uz(i,j,2) & + endif + else + buffer(1) = as1z*uz(i,j,1) + bs1z*uz(i,j,2) & + cs1z*uz(i,j,3) + ds1z*uz(i,j,4) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = as2z*(uz(i,j,3)-uz(i,j,2) & + buffer(2) = as2z*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = as3z*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = as3z*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bs3z*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = as4z*(uz(i,j,5)-uz(i,j,4 ) & + buffer(4) = as4z*(uz(i,j,5)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,3)) & + bs4z*(uz(i,j,6)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,2)) & + cs4z*(uz(i,j,7)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,1)) + endif + do concurrent (k=5:nz-4) + buffer(k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-1)) & + + bskz*(uz(i,j,k+2)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-2)) & + + cskz*(uz(i,j,k+3)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-3)) & + + dskz*(uz(i,j,k+4)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-4)) enddo - endif - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-1)) & - + bskz*(uz(i,j,k+2)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-2)) & - + cskz*(uz(i,j,k+3)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-3)) & - + dskz*(uz(i,j,k+4)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-4)) - enddo - if (ncln==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & + if (ncln==1) then + if (npaire==1) then + buffer(nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*(uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1773,9 +1756,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1783,9 +1764,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(uz(i,j,nz-2)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(uz(i,j,nz-1)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1793,9 +1772,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(uz(i,j,nz-3)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = askz*(uz(i,j,nz-1)-uz(i,j,nz ) & + buffer(nz ) = askz*(uz(i,j,nz-1)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-1)) & + bskz*(uz(i,j,nz-2)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-2)) & @@ -1803,10 +1780,8 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz )+uz(i,j,nz-3)) & + dskz*(uz(i,j,nz-4)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-4)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*( uz(i,j,nz-2)-uz(i,j,nz-3) & + else + buffer(nz-3) = askz*( uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*( uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1814,9 +1789,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(-uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = askz*( uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*( uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*( uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1824,9 +1797,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(-uz(i,j,nz-2)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = askz*( uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*( uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(-uz(i,j,nz-1)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1834,38 +1805,31 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(-uz(i,j,nz-3)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = zero - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = asttz*(uz(i,j,nz-2)-uz(i,j,nz-3 ) & + buffer(nz ) = zero + endif + else + buffer(nz-3) = asttz*(uz(i,j,nz-2)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-4)) & + bsttz*(uz(i,j,nz-1)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-5)) & + csttz*(uz(i,j,nz)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = astz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = astz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bstz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = asmz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = asmz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = asnz*uz(i,j,nz ) + bsnz*uz(i,j,nz-1) & + buffer(nz ) = asnz*uz(i,j,nz ) + bsnz*uz(i,j,nz-1) & + csnz*uz(i,j,nz-2) + dsnz*uz(i,j,nz-3) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, nz) + do concurrent(k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derzz_ij diff --git a/src/x3d_operator_1d.f90 b/src/x3d_operator_1d.f90 index e006fd2..fd0bdea 100644 --- a/src/x3d_operator_1d.f90 +++ b/src/x3d_operator_1d.f90 @@ -560,7 +560,7 @@ subroutine first_deriv_imp_(alfa1, af1, bf1, cf1, df1, alfa2, af2, alfan, afn, b integer, intent(in) :: n, ncl1, ncln real(mytype), dimension(n), intent(out) :: ff, fs, fw, ffp, fsp, fwp real(mytype), intent(in) :: alfa1, af1, bf1, cf1, df1, alfa2, af2, alfan, afn, bfn, & - cfn, dfn, alfam, afm, alfai, afi, bfi + cfn, dfn, alfam, afm, alfai, afi, bfi integer :: i real(mytype), dimension(n) :: fb, fc @@ -793,11 +793,11 @@ subroutine second_deriv_imp_(alsa1, as1, bs1, & integer, intent(in) :: n, ncl1, ncln real(mytype), dimension(n), intent(out) :: sf, ss, sw, sfp, ssp, swp real(mytype), intent(in) :: alsa1, as1, bs1, & - cs1, ds1, alsa2, as2, alsan, asn, bsn, csn, dsn, alsam, & - asm, alsa3, as3, bs3, alsat, ast, bst, & - alsa4, as4, bs4, cs4, & - alsatt, astt, bstt, cstt, & - alsai, asi, bsi, csi, dsi + cs1, ds1, alsa2, as2, alsan, asn, bsn, csn, dsn, alsam, & + asm, alsa3, as3, bs3, alsat, ast, bst, & + alsa4, as4, bs4, cs4, & + alsatt, astt, bstt, cstt, & + alsai, asi, bsi, csi, dsi integer :: i real(mytype), dimension(n) :: sb, sc @@ -993,14 +993,14 @@ subroutine interpol_imp_(dx, nxm, nx, nclx1, nclxn, & real(mytype), intent(in) :: dx integer, intent(in) :: nxm, nx, nclx1, nclxn - real(mytype) :: alcaix6, acix6, bcix6 - real(mytype) :: ailcaix6, aicix6, bicix6, cicix6, dicix6 - real(mytype), dimension(nxm) :: cfx6, ccx6, cbx6, cfxp6, ciwxp6, csxp6, & - cwxp6, csx6, cwx6, cifx6, cicx6, cisx6 - real(mytype), dimension(nxm) :: cibx6, cifxp6, cisxp6, ciwx6 - real(mytype), dimension(nx) :: cfi6, cci6, cbi6, cfip6, csip6, cwip6, csi6, & - cwi6, cifi6, cici6, cibi6, cifip6 - real(mytype), dimension(nx) :: cisip6, ciwip6, cisi6, ciwi6 + real(mytype), intent(in) :: alcaix6, acix6, bcix6 + real(mytype), intent(in) :: ailcaix6, aicix6, bicix6, cicix6, dicix6 + real(mytype), dimension(nxm), intent(out) :: cfx6, ccx6, cbx6, cfxp6, ciwxp6, csxp6, & + cwxp6, csx6, cwx6, cifx6, cicx6, cisx6 + real(mytype), dimension(nxm), intent(out) :: cibx6, cifxp6, cisxp6, ciwx6 + real(mytype), dimension(nx), intent(out) :: cfi6, cci6, cbi6, cfip6, csip6, cwip6, csi6, & + cwi6, cifi6, cici6, cibi6, cifip6 + real(mytype), dimension(nx), intent(out) :: cisip6, ciwip6, cisi6, ciwi6 integer :: i diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index 07936cf..ecbdc53 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -22,78 +22,89 @@ module x3d_staggered ! subroutine derxvp(tx,ux,x3dop,nx,nxm,ny,nz) - use x3d_operator_x_data - - implicit none - - ! Arguments - integer, intent(in) :: nx, nxm, ny, nz - real(mytype), intent(out), dimension(nxm,ny,nz) :: tx - real(mytype), intent(in), dimension(nx,ny,nz) :: ux - type(x3doperator1d), intent(in) :: x3dop - - ! Local variables - integer :: i, j, k - - if (nclx) then - ! nxm = nx - do concurrent (k=1:nz, j=1:ny) - - ! Compute r.h.s. - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & - + bcix6*(ux(3,j,k)-ux(nx,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & - + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) - enddo - tx(nx-1,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & - + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) - tx(nx ,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & - + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - enddo - - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - - else - ! nxm = nx-1 - do concurrent (k=1:nz, j=1:ny) - - ! Compute r.h.s. - if (x3dop%npaire==1) then - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & - + bcix6*(ux(3,j,k)-ux(2,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - else - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & - + bcix6*(ux(3,j,k)-two*ux(1,j,k)+ux(2,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - endif - do concurrent (i=3:nxm-2) - tx(i,j,k) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & - + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) - enddo - if (x3dop%npaire==1) then - tx(nxm-1,j,k) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & - + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - tx(nxm,j,k) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & - + bcix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) - else - tx(nxm-1,j,k) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + use x3d_operator_x_data + + implicit none + + !$acc routine(thomas1d_0, thomas1d_12) seq + + ! Arguments + integer, intent(in) :: nx, nxm, ny, nz + real(mytype), intent(out), dimension(nxm,ny,nz) :: tx + real(mytype), intent(in), dimension(nx,ny,nz) :: ux + type(x3doperator1d), intent(in) :: x3dop + + ! Local variables + integer :: i, j, k + real(mytype), dimension(nxm) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nxm) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do + + if (nclx) then + ! nxm = nx + do concurrent (k=1:nz, j=1:ny) local(buffer) + ! Compute r.h.s. + buffer(1) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + + bcix6*(ux(3,j,k)-ux(nx,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) + enddo + buffer(nx-1) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) + buffer(nx ) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo + + else + ! nxm = nx-1 + do concurrent (k=1:nz, j=1:ny) + + ! Compute r.h.s. + if (x3dop%npaire==1) then + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + + bcix6*(ux(3,j,k)-ux(2,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + else + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + + bcix6*(ux(3,j,k)-two*ux(1,j,k)+ux(2,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + endif + do concurrent (i=3:nxm-2) + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) + enddo + if (x3dop%npaire==1) then + buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) + buffer(nxm) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - tx(nxm,j,k) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & - + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) - endif - enddo - - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) - - endif + buffer(nxm) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & + + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) + endif + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nxm) + do concurrent (i=1:nxm) + tx(i,j,k) = buffer(i) + enddo + enddo + endif end subroutine derxvp @@ -105,6 +116,8 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nxm,ny,nz) :: tx @@ -113,91 +126,105 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nxm) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nxm) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + buffer(1) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + bicix6*(ux(3,j,k)+ux(nx,j,k)) & + cicix6*(ux(4,j,k)+ux(nx-1,j,k)) & + dicix6*(ux(5,j,k)+ux(nx-2,j,k)) - tx(2,j,k) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + buffer(2) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(nx,j,k)) & + dicix6*(ux(6,j,k)+ux(nx-1,j,k)) - tx(3,j,k) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + buffer(3) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(nx,j,k)) do concurrent (i=4:nx-4) - tx(i,j,k) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + buffer(i) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + bicix6*(ux(i+2,j,k)+ux(i-1,j,k)) & + cicix6*(ux(i+3,j,k)+ux(i-2,j,k)) & + dicix6*(ux(i+4,j,k)+ux(i-3,j,k)) enddo - tx(nx-3,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-3) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(1,j,k)+ux(nx-6,j,k)) - tx(nx-2,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx ,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(1,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(2,j,k)+ux(nx-5,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + buffer(nx-1) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(1 ,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(2,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) - tx(nx ,j,k) = aicix6*(ux(1,j,k)+ux(nx,j,k)) & + buffer(nx ) = aicix6*(ux(1,j,k)+ux(nx,j,k)) & + bicix6*(ux(2,j,k)+ux(nx-1,j,k)) & + cicix6*(ux(3,j,k)+ux(nx-2,j,k)) & + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + buffer(1) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + bicix6*(ux(3,j,k)+ux(2,j,k)) & + cicix6*(ux(4,j,k)+ux(3,j,k)) & + dicix6*(ux(5,j,k)+ux(4,j,k)) - tx(2,j,k) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + buffer(2) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(2,j,k)) & + dicix6*(ux(6,j,k)+ux(3,j,k)) - tx(3,j,k) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + buffer(3) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(2,j,k)) do concurrent (i=4:nxm-3) - tx(i,j,k) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + buffer(i) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + bicix6*(ux(i+2,j,k)+ux(i-1,j,k)) & + cicix6*(ux(i+3,j,k)+ux(i-2,j,k)) & + dicix6*(ux(i+4,j,k)+ux(i-3,j,k)) enddo - tx(nxm-2,j,k) = aicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + buffer(nxm-2) = aicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + bicix6*(ux(nxm,j,k)+ux(nxm-3,j,k)) & + cicix6*(ux(nx,j,k)+ux(nxm-4,j,k)) & + dicix6*(ux(nxm,j,k)+ux(nxm-5,j,k)) - tx(nxm-1,j,k) = aicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + buffer(nxm-1) = aicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + bicix6*(ux(nx,j,k)+ux(nxm-2,j,k)) & + cicix6*(ux(nxm,j,k)+ux(nxm-3,j,k)) & + dicix6*(ux(nxm-1,j,k)+ux(nxm-4,j,k)) - tx(nxm ,j,k) = aicix6*(ux(nx,j,k)+ux(nxm,j,k)) & + buffer(nxm ) = aicix6*(ux(nx,j,k)+ux(nxm,j,k)) & + bicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + cicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + dicix6*(ux(nxm-2,j,k)+ux(nxm-3,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nxm) + do concurrent (i=1:nxm) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -212,6 +239,8 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -220,49 +249,63 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + buffer(1) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - tx(2,j,k) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + buffer(2) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + bcix6*(ux(3,j,k)-ux(nx,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + buffer(i) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + bcix6*(ux(i+1,j,k)-ux(i-2,j,k)) enddo - tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nx-3,j,k)) - tx(nx ,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + buffer(nx ) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + bcix6*(ux(1,j,k)-ux(nx-2,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = zero - tx(2,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & + buffer(1) = zero + buffer(2) = acix6*(ux(2,j,k)-ux(1,j,k)) & + bcix6*(ux(3,j,k)-ux(1,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + buffer(i) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + bcix6*(ux(i+1,j,k)-ux(i-2,j,k)) enddo - tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + bcix6*(ux(nx-1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = zero - enddo + buffer(nx) = zero - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -277,6 +320,8 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -285,99 +330,113 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(1,j,k)+ux(nx ,j,k)) & + buffer(1) = aicix6*(ux(1,j,k)+ux(nx ,j,k)) & + bicix6*(ux(2,j,k)+ux(nx-1,j,k)) & + cicix6*(ux(3,j,k)+ux(nx-2,j,k)) & + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) - tx(2,j,k) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + buffer(2) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + bicix6*(ux(3,j,k)+ux(nx,j,k)) & + cicix6*(ux(4,j,k)+ux(nx-1,j,k)) & + dicix6*(ux(5,j,k)+ux(nx-2,j,k)) - tx(3,j,k) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + buffer(3) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(nx,j,k)) & + dicix6*(ux(6,j,k)+ux(nx-1,j,k)) - tx(4,j,k) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + buffer(4) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(nx,j,k)) do concurrent (i=5:nx-3) - tx(i,j,k) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + buffer(i) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + bicix6*(ux(i+1,j,k)+ux(i-2,j,k)) & + cicix6*(ux(i+2,j,k)+ux(i-3,j,k)) & + dicix6*(ux(i+3,j,k)+ux(i-4,j,k)) enddo - tx(nx-2,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(1,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-1) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx ,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(1,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(2,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + buffer(nx ) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(1,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(2,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(1,j,k)+ux(1,j,k)) & + buffer(1) = aicix6*(ux(1,j,k)+ux(1,j,k)) & + bicix6*(ux(2,j,k)+ux(2,j,k)) & + cicix6*(ux(3,j,k)+ux(3,j,k)) & + dicix6*(ux(4,j,k)+ux(4,j,k)) - tx(2,j,k) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + buffer(2) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + bicix6*(ux(3,j,k)+ux(1,j,k)) & + cicix6*(ux(4,j,k)+ux(2,j,k)) & + dicix6*(ux(5,j,k)+ux(3,j,k)) - tx(3,j,k) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + buffer(3) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(1,j,k)) & + dicix6*(ux(6,j,k)+ux(2,j,k)) - tx(4,j,k) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + buffer(4) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(1,j,k)) do concurrent (i=5:nx-4) - tx(i,j,k) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + buffer(i) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + bicix6*(ux(i+1,j,k)+ux(i-2,j,k)) & + cicix6*(ux(i+2,j,k)+ux(i-3,j,k)) & + dicix6*(ux(i+3,j,k)+ux(i-4,j,k)) enddo - tx(nx-3,j,k) = aicix6*(ux(nx-3,j,k)+ux(nx-4,j,k)) & + buffer(nx-3) = aicix6*(ux(nx-3,j,k)+ux(nx-4,j,k)) & + bicix6*(ux(nx-2,j,k)+ux(nx-5,j,k)) & + cicix6*(ux(nx-1,j,k)+ux(nx-6,j,k)) & + dicix6*(ux(nx-1,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx-1,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-1) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(nx-2,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(nx-3,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-1,j,k)) & + buffer(nx ) = aicix6*(ux(nx-1,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(nx-2,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(nx-3,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(nx-4,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -392,6 +451,8 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,nym,nz) :: ty @@ -400,118 +461,106 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nym) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:nym) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,ny,k)) & - + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & - + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & - + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,ny,k)) - enddo - do concurrent (j=4:ny-4, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,ny,k)) & + + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & + + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) + buffer(2) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & + + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) + buffer(3) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,ny,k)) + do concurrent (j=4:ny-4) + buffer(j) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + biciy6*(uy(i,j+2,k)+uy(i,j-1,k)) & + ciciy6*(uy(i,j+3,k)+uy(i,j-2,k)) & + diciy6*(uy(i,j+4,k)+uy(i,j-3,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & - + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & - + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & - + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + buffer(ny-3) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) + buffer(ny-1) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + buffer(ny ) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & + + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & + + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & + + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,2,k)) & - + ciciy6*(uy(i,4,k)+uy(i,3,k)) & - + diciy6*(uy(i,5,k)+uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,2,k)) & - + diciy6*(uy(i,6,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,2,k)) - enddo - do concurrent (j=4:nym-3, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,2,k)) & + + ciciy6*(uy(i,4,k)+uy(i,3,k)) & + + diciy6*(uy(i,5,k)+uy(i,4,k)) + buffer(2) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,2,k)) & + + diciy6*(uy(i,6,k)+uy(i,3,k)) + buffer(3) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,2,k)) + do concurrent (j=4:nym-3) + buffer(j) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + biciy6*(uy(i,j+2,k)+uy(i,j-1,k)) & + ciciy6*(uy(i,j+3,k)+uy(i,j-2,k)) & + diciy6*(uy(i,j+4,k)+uy(i,j-3,k)) enddo - do concurrent (i=1:nx) - ty(i,nym-2,k) = aiciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & - + biciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,nym-4,k)) & - + diciy6*(uy(i,nym,k)+uy(i,nym-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym-1,k) = aiciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & - + biciy6*(uy(i,ny,k)+uy(i,nym-2,k)) & - + ciciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & - + diciy6*(uy(i,nym-1,k)+uy(i,nym-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym ,k) = aiciy6*(uy(i,ny,k)+uy(i,nym,k)) & - + biciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & - + ciciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & - + diciy6*(uy(i,nym-2,k)+uy(i,nym-3,k)) + buffer(nym-2) = aiciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & + + biciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,nym-4,k)) & + + diciy6*(uy(i,nym,k)+uy(i,nym-5,k)) + buffer(nym-1) = aiciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & + + biciy6*(uy(i,ny,k)+uy(i,nym-2,k)) & + + ciciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & + + diciy6*(uy(i,nym-1,k)+uy(i,nym-4,k)) + buffer(nym ) = aiciy6*(uy(i,ny,k)+uy(i,nym,k)) & + + biciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & + + ciciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & + + diciy6*(uy(i,nym-2,k)+uy(i,nym-3,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nym) + do concurrent (j=1:nym) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, nym, nz) - endif endif @@ -525,6 +574,8 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,nym,nz) :: ty @@ -534,77 +585,79 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nym) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:nym) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,ny,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,3,k)-uy(i,2,k)) & - + bciy6*(uy(i,4,k)-uy(i,1,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,ny,k)) + buffer(2) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + + bciy6*(uy(i,4,k)-uy(i,1,k)) + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + bciy6*(uy(i,j+2,k)-uy(i,j-1,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & - + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & - + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + buffer(ny-1) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & + + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + buffer(ny ) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & + + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + if (istret /= 0) then + do concurrent (j=1:nym) + buffer(j) = buffer(j) * ppyi(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==0) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + bciy6*(uy(i,3,k)-two*uy(i,1,k)+uy(i,2,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + buffer(2) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + bciy6*(uy(i,4,k)-uy(i,1,k)) - enddo - do concurrent (j=3:nym-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + do concurrent (j=3:nym-2) + buffer(j) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + bciy6*(uy(i,j+2,k)-uy(i,j-1,k)) enddo - do concurrent (i=1:nx) - ty(i,nym-1,k) = aciy6*(uy(i,nym,k)-uy(i,nym-1,k)) & + buffer(nym-1) = aciy6*(uy(i,nym,k)-uy(i,nym-1,k)) & + bciy6*(uy(i,ny,k)-uy(i,nym-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym ,k) = aciy6*(uy(i,ny,k)-uy(i,nym,k)) & + buffer(nym ) = aciy6*(uy(i,ny,k)-uy(i,nym,k)) & + bciy6*(two*uy(i,ny,k)-uy(i,nym,k)-uy(i,nym-1,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nym) + if (istret /= 0) then + do concurrent (j=1:nym) + buffer(j) = buffer(j) * ppyi(j) + enddo + endif + do concurrent (j=1:nym) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, nym, nz) - endif endif - if (istret /= 0) then - do concurrent (k=1:nz, j=1:nym, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppyi(j) - enddo - endif - end subroutine deryvp !******************************************************************** @@ -615,6 +668,8 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -623,130 +678,114 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & - + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & - + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & - + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,ny,k)) & - + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & - + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & - + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,ny,k)) - enddo - do concurrent (j=5:ny-3, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + buffer(1) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & + + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & + + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & + + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + buffer(2) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,ny,k)) & + + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & + + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) + buffer(3) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & + + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) + buffer(4) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,ny,k)) + do concurrent (j=5:ny-3) + buffer(j) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + biciy6*(uy(i,j+1,k)+uy(i,j-2,k)) & + ciciy6*(uy(i,j+2,k)+uy(i,j-3,k)) & + diciy6*(uy(i,j+3,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) + buffer(ny-1) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) + buffer(ny ) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,1,k)+uy(i,1,k)) & - + biciy6*(uy(i,2,k)+uy(i,2,k)) & - + ciciy6*(uy(i,3,k)+uy(i,3,k)) & - + diciy6*(uy(i,4,k)+uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,1,k)) & - + ciciy6*(uy(i,4,k)+uy(i,2,k)) & - + diciy6*(uy(i,5,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,1,k)) & - + diciy6*(uy(i,6,k)+uy(i,2,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,1,k)) - enddo - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + buffer(1) = aiciy6*(uy(i,1,k)+uy(i,1,k)) & + + biciy6*(uy(i,2,k)+uy(i,2,k)) & + + ciciy6*(uy(i,3,k)+uy(i,3,k)) & + + diciy6*(uy(i,4,k)+uy(i,4,k)) + buffer(2) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,1,k)) & + + ciciy6*(uy(i,4,k)+uy(i,2,k)) & + + diciy6*(uy(i,5,k)+uy(i,3,k)) + buffer(3) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,1,k)) & + + diciy6*(uy(i,6,k)+uy(i,2,k)) + buffer(4) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,1,k)) + do concurrent (j=5:ny-4) + buffer(j) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + biciy6*(uy(i,j+1,k)+uy(i,j-2,k)) & + ciciy6*(uy(i,j+2,k)+uy(i,j-3,k)) & + diciy6*(uy(i,j+3,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = aiciy6*(uy(i,ny-3,k)+uy(i,ny-4,k)) & - + biciy6*(uy(i,ny-2,k)+uy(i,ny-5,k)) & - + ciciy6*(uy(i,ny-1,k)+uy(i,ny-6,k)) & - + diciy6*(uy(i,ny-1,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny-1,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,ny-2,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,ny-3,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,ny-2,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,ny-3,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,ny-4,k)+uy(i,ny-4,k)) + buffer(ny-3) = aiciy6*(uy(i,ny-3,k)+uy(i,ny-4,k)) & + + biciy6*(uy(i,ny-2,k)+uy(i,ny-5,k)) & + + ciciy6*(uy(i,ny-1,k)+uy(i,ny-6,k)) & + + diciy6*(uy(i,ny-1,k)+uy(i,ny-7,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny-1,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,ny-2,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,ny-3,k)+uy(i,ny-5,k)) + buffer(ny ) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,ny-2,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,ny-3,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,ny-4,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) - endif endif @@ -760,6 +799,8 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -769,75 +810,77 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & - + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,ny,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + buffer(1) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & + + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + buffer(2) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,ny,k)) + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + bciy6*(uy(i,j+1,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & - + bciy6*(uy(i,ny,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & - + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + buffer(ny-1) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & + + bciy6*(uy(i,ny,k)-uy(i,ny-3,k)) + buffer(ny) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & + + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,1,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + buffer(1) = zero + buffer(2) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,1,k)) + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + bciy6*(uy(i,j+1,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & - + bciy6*(uy(i,ny-1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = zero + buffer(ny-1) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & + + bciy6*(uy(i,ny-1,k)-uy(i,ny-3,k)) + buffer(ny) = zero + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, ny) + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) - endif endif - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) - enddo - endif - end subroutine derypv !******************************************************************** @@ -848,6 +891,8 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nzm) :: tz @@ -856,6 +901,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k + real(mytype), dimension(nzm) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -864,83 +910,76 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) return endif + do concurrent (k=1:nzm) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + ! Compute r.h.s. + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,nz)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + bciz6*(uz(i,j,4)-uz(i,j,1)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & - + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & + + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(uz(i,j,1)-uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + buffer(nz ) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + bciz6*(uz(i,j,2)-uz(i,j,nz-1)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + ! Compute r.h.s. + if (x3dop%npaire==1) then + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2))& + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2))& + bciz6*(uz(i,j,4)-uz(i,j,1)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + else + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-two*uz(i,j,1)+uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + bciz6*(uz(i,j,4)-uz(i,j,1)) + endif + do concurrent (k=3:nzm-2) + buffer(k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & + + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) enddo - endif - do concurrent (k=3:nzm-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & - + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) - enddo - if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) & + if (x3dop%npaire==1) then + buffer(nzm-1) = aciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) & + bciz6*(uz(i,j,nz)-uz(i,j,nzm-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nzm)) & + buffer(nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nzm)) & + bciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + else + buffer(nzm-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + buffer(nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(two*uz(i,j,nz)-uz(i,j,nz-1)-uz(i,j,nz-2)) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nzm) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nzm) + do concurrent (k=1:nzm) + tz(i,j,k) = buffer(k) + enddo + enddo endif @@ -954,6 +993,8 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nzm) :: tz @@ -962,6 +1003,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k + real(mytype), dimension(nzm) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -970,112 +1012,103 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) return endif + do concurrent (k=1:nzm) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,nz)) & + ciciz6*(uz(i,j,4)+uz(i,j,nz-1)) & + diciz6*(uz(i,j,5)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,nz)) & + diciz6*(uz(i,j,6)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,nz)) - enddo - do concurrent (k=4:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & - + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & - + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & - + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + do concurrent (k=4:nz-4) + buffer(k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & + + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & + + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & + + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) + enddo + buffer(nz-3) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,1)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-2) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,1)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,2)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,1)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,2)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,3)+uz(i,j,nz-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + buffer(nz ) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + biciz6*(uz(i,j,2)+uz(i,j,nz-1)) & + ciciz6*(uz(i,j,3)+uz(i,j,nz-2)) & + diciz6*(uz(i,j,4)+uz(i,j,nz-3)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,2)) & + ciciz6*(uz(i,j,4)+uz(i,j,3)) & + diciz6*(uz(i,j,5)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,2)) & + diciz6*(uz(i,j,6)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,2)) - enddo - do concurrent (k=4:nzm-3, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & - + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & - + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & - + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-2) = aiciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + do concurrent (k=4:nzm-3) + buffer(k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & + + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & + + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & + + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) + enddo + buffer(nzm-2) = aiciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + biciz6*(uz(i,j,nzm)+uz(i,j,nzm-3)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nzm-4)) & + diciz6*(uz(i,j,nzm)+uz(i,j,nzm-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aiciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + buffer(nzm-1) = aiciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + biciz6*(uz(i,j,nz)+uz(i,j,nzm-2)) & + ciciz6*(uz(i,j,nzm)+uz(i,j,nzm-3)) & + diciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm) = aiciz6*(uz(i,j,nz)+uz(i,j,nzm)) & + buffer(nzm) = aiciz6*(uz(i,j,nz)+uz(i,j,nzm)) & + biciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + ciciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + diciz6*(uz(i,j,nzm-2)+uz(i,j,nzm-3)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nzm) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nzm) + do concurrent (k=1:nzm) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif @@ -1090,6 +1123,8 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nzm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1098,6 +1133,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1106,60 +1142,61 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) return endif + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + ! Compute r.h.s. + buffer(1) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + bciz6*(uz(i,j,2)-uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + buffer(2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,nz)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & - + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & + + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + buffer(nz) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(uz(i,j,1)-uz(i,j,nz-2)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + ! Compute r.h.s. + buffer(1) = zero + buffer(2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,1)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & - + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & + + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = zero - enddo + buffer(nz) = zero - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif @@ -1174,6 +1211,8 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1182,6 +1221,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1190,124 +1230,111 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) return endif + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + biciz6*(uz(i,j,2)+uz(i,j,nz-1)) & + ciciz6*(uz(i,j,3)+uz(i,j,nz-2)) & + diciz6*(uz(i,j,4)+uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + buffer(2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,nz)) & + ciciz6*(uz(i,j,4)+uz(i,j,nz-1)) & + diciz6*(uz(i,j,5)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,nz)) & + diciz6*(uz(i,j,6)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,nz)) - enddo - do concurrent (k=5:nz-3, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & - + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & - + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & - + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + do concurrent (k=5:nz-3) + buffer(k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & + + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & + + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & + + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) + enddo + buffer(nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,1)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,1)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,2)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + buffer(nz ) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,1)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,2)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,3)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then + do concurrent (j=1:ny, i=1:nx) local(buffer) - ! Compute r.h.s. - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,1)+uz(i,j,1)) & + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,1)) & + biciz6*(uz(i,j,2)+uz(i,j,2)) & + ciciz6*(uz(i,j,3)+uz(i,j,3)) & + diciz6*(uz(i,j,4)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + buffer(2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,1))& + ciciz6*(uz(i,j,4)+uz(i,j,2))& + diciz6*(uz(i,j,5)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,1)) & + diciz6*(uz(i,j,6)+uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,1)) - enddo - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & - + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & - + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & - + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = aiciz6*(uz(i,j,nz-3)+uz(i,j,nz-4)) & + do concurrent (k=5:nz-4) + buffer(k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & + + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & + + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & + + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) + enddo + buffer(nz-3) = aiciz6*(uz(i,j,nz-3)+uz(i,j,nz-4)) & + biciz6*(uz(i,j,nz-2)+uz(i,j,nz-5)) & + ciciz6*(uz(i,j,nz-1)+uz(i,j,nz-6)) & + diciz6*(uz(i,j,nz-1)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + buffer(nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz-1)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,nz-2)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,nz-3)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-1)) & + buffer(nz ) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,nz-2)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,nz-3)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,nz-4)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif diff --git a/src/xcompact3d.f90 b/src/xcompact3d.f90 index 031fa64..0ecc95b 100644 --- a/src/xcompact3d.f90 +++ b/src/xcompact3d.f90 @@ -13,6 +13,7 @@ program xcompact3d use navier, only : solve_poisson, cor_vel use case use time_integrators, only : int_time + use nvtx implicit none @@ -23,9 +24,7 @@ program xcompact3d integer :: code call boot_xcompact3d() - call init_xcompact3d(ndt_max) - call case_init(ux1, uy1, uz1) telapsed = 0 @@ -36,12 +35,12 @@ program xcompact3d itr = 1 ! no inner iterations !call init_flowfield() - - tstart = MPI_Wtime() - call case_bc(ux1, uy1, uz1) - + call nvtxStartRange("transeq") + tstart = MPI_Wtime() call calculate_transeq_rhs(dux1,duy1,duz1,ux1,uy1,uz1) + tend = MPI_Wtime() + call nvtxEndRange call int_time(ux1,uy1,uz1,dux1,duy1,duz1) !do concurrent (k=1:zsize(3), j=1:zsize(2), i=1:zsize(1)) @@ -50,7 +49,6 @@ program xcompact3d call solve_poisson(pp3,px1,py1,pz1,ux1,uy1,uz1) call cor_vel(ux1,uy1,uz1,px1,py1,pz1) - tend = MPI_Wtime() telapsed = telapsed + (tend - tstart) tmin = telapsed tmax = telapsed