From 9d25bf6c73726bac9ee1628e98ec325d5d6f53df Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 26 Jun 2017 16:27:40 -0700 Subject: [PATCH] v_of_rho (in particular configuration) and and some more ffts ported to GPU. Also added some missing deallocations to PW cleanup. --- PW/src/clean_pw.f90 | 8 +- PW/src/electrons.f90 | 43 ++- PW/src/gradcorr.f90 | 606 ++++++++++++++++++++++++++++--------------- PW/src/scf_mod.f90 | 13 +- PW/src/set_vrs.f90 | 81 ++++++ PW/src/sum_band.f90 | 30 ++- PW/src/v_of_rho.f90 | 499 ++++++++++++++++++++--------------- 7 files changed, 836 insertions(+), 444 deletions(-) diff --git a/PW/src/clean_pw.f90 b/PW/src/clean_pw.f90 index e0709b3d3..763d4e82c 100644 --- a/PW/src/clean_pw.f90 +++ b/PW/src/clean_pw.f90 @@ -73,7 +73,7 @@ SUBROUTINE clean_pw( lflag ) eigts1_d, eigts2_d, eigts3_d USE gvecs, ONLY : nls_d USE wvfct, ONLY : g2kin_d, et_d, wg_d - USE us, ONLY : qrad_d + USE us, ONLY : qrad_d, tab_d, tab_d2y_d #endif IMPLICIT NONE ! @@ -150,6 +150,9 @@ SUBROUTINE clean_pw( lflag ) IF ( ALLOCATED( vrs ) ) DEALLOCATE( vrs ) if (spline_ps) then IF ( ALLOCATED( tab_d2y) ) DEALLOCATE( tab_d2y ) +#ifdef USE_CUDA + IF ( ALLOCATED( tab_d2y_d) ) DEALLOCATE( tab_d2y_d ) +#endif endif IF ( ALLOCATED( nls ) ) DEALLOCATE( nls ) IF ( ALLOCATED( nlsm ) ) DEALLOCATE( nlsm ) @@ -178,6 +181,9 @@ SUBROUTINE clean_pw( lflag ) IF ( ALLOCATED( qrad ) ) DEALLOCATE( qrad ) IF ( ALLOCATED( tab ) ) DEALLOCATE( tab ) IF ( ALLOCATED( tab_at ) ) DEALLOCATE( tab_at ) +#ifdef USE_CUDA + IF ( ALLOCATED( tab_d ) ) DEALLOCATE( tab_d ) +#endif IF ( lspinorb ) THEN IF ( ALLOCATED( fcoef ) ) DEALLOCATE( fcoef ) END IF diff --git a/PW/src/electrons.f90 b/PW/src/electrons.f90 index e34ef062e..dc47d94b1 100644 --- a/PW/src/electrons.f90 +++ b/PW/src/electrons.f90 @@ -372,9 +372,10 @@ SUBROUTINE electrons_scf ( printout, exxen ) USE plugin_variables, ONLY : plugin_etot ! #ifdef USE_CUDA + USE funct, ONLY : get_iexch, get_icorr, get_igcx, get_igcc USE dfunct, ONLY : newd_gpu USE cudafor - USE scf, ONLY : rho_core_d, rhog_core_d, vltot_d, vrs_d + USE scf, ONLY : rho_core_d, rhog_core_d, vltot_d, vrs_d #endif IMPLICIT NONE ! @@ -402,7 +403,7 @@ SUBROUTINE electrons_scf ( printout, exxen ) LOGICAL :: & first, exst #ifdef USE_CUDA - INTEGER :: istat + INTEGER :: istat, iexch, icorr, igcx, igcc #endif ! ! ... auxiliary variables for calculating and storing temporary copies of @@ -649,9 +650,32 @@ SUBROUTINE electrons_scf ( printout, exxen ) ehart, etxc, vtxc, eth, etotefield, charge, v) !CALL v_of_rho( rhoin, rho_core, rhog_core, & ! ehart, etxc, vtxc, eth, etotefield, charge, v) +#else +#ifdef USE_CUDA + iexch = get_iexch + icorr = get_icorr + igcx = get_igcx + igcc = get_igcc + + ! If calling PBE functional configuration, use GPU path + if (iexch .eq. 1 .and. icorr .eq. 4 .and. igcx .eq. 3 .and. igcc .eq. 4) then + rho_core_d = rho_core + rhog_core_d = rhog_core + v%of_r_d = v%of_r + rhoin%of_r_d = rhoin%of_r + rhoin%of_g_d = rhoin%of_g + CALL v_of_rho_gpu( rhoin, rho_core, rho_core_d, rhog_core, rhog_core_d,& + ehart, etxc, vtxc, eth, etotefield, charge, v) + + ! Otherwise, fallback to CPU path + else + CALL v_of_rho( rhoin, rho_core, rhog_core, & + ehart, etxc, vtxc, eth, etotefield, charge, v) + endif #else CALL v_of_rho( rhoin, rho_core, rhog_core, & ehart, etxc, vtxc, eth, etotefield, charge, v) +#endif #endif IF (okpaw) THEN CALL PAW_potential(rhoin%bec, ddd_paw, epaw,etot_cmp_paw) @@ -708,15 +732,24 @@ SUBROUTINE electrons_scf ( printout, exxen ) ! ! ... define the total local potential (external + scf) ! +#ifdef USE_CUDA + v%of_r_d = v%of_r + vltot_d = vltot + CALL sum_vrs_gpu( dfftp%nnr, nspin, vltot_d, v%of_r_d, vrs_d ) +#else CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs ) +#endif ! ! ... interpolate the total local potential ! - CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs ) - ! #ifdef USE_CUDA - vrs_d = vrs + CALL interpolate_vrs_gpu( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs_d) + vrs = vrs_d +#else + CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs ) #endif + !vrs_d = vrs + ! ! ... in the US case we have to recompute the self-consistent ! ... term in the nonlocal potential ! ... PAW: newd contains PAW updates of NL coefficients diff --git a/PW/src/gradcorr.f90 b/PW/src/gradcorr.f90 index e028f1902..00cc4dfac 100644 --- a/PW/src/gradcorr.f90 +++ b/PW/src/gradcorr.f90 @@ -1068,16 +1068,16 @@ subroutine pw_gpu (rs, ec, vc) end subroutine pw_gpu -SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d ) +SUBROUTINE gradcorr_gpu(rho_d, rhog_d, rho_core_d, rhog_core_d, etxc, vtxc, v_d ) !---------------------------------------------------------------------------- ! - USE constants, ONLY : e2 + USE constants, ONLY : e2, pi USE kinds, ONLY : DP - USE gvect, ONLY : nl, ngm, g + USE gvect, ONLY : nl, nl_d, ngm, g, g_d USE lsda_mod, ONLY : nspin USE cell_base, ONLY : omega, alat USE funct, ONLY : gcxc, gcx_spin, gcc_spin, igcc_is_lyp, & - gcc_spin_more, dft_is_gradient, get_igcc + gcc_spin_more, dft_is_gradient, get_igcc, get_igcx USE spin_orb, ONLY : domag USE noncollin_module, ONLY : ux USE wavefunctions_module, ONLY : psic @@ -1087,32 +1087,52 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co ! IMPLICIT NONE ! - REAL(DP), INTENT(IN) :: rho(dfftp%nnr,nspin), rho_core(dfftp%nnr) - COMPLEX(DP), INTENT(IN) :: rhog(ngm,nspin), rhog_core(ngm) - REAL(DP), INTENT(INOUT) :: v(dfftp%nnr,nspin) + REAL(DP), INTENT(IN), device :: rho_d(dfftp%nnr,nspin), rho_core_d(dfftp%nnr) + COMPLEX(DP), INTENT(IN), device :: rhog_d(ngm,nspin), rhog_core_d(ngm) + REAL(DP), INTENT(INOUT), device :: v_d(dfftp%nnr,nspin) REAL(DP), INTENT(INOUT) :: vtxc, etxc - ! - REAL(DP), DEVICE, INTENT(IN) :: rho_d(dfftp%nnr,nspin), rho_core_d(dfftp%nnr) - COMPLEX(DP), DEVICE, INTENT(IN) :: rhog_d(ngm,nspin), rhog_core_d(ngm) - REAL(DP), DEVICE, INTENT(INOUT) :: v_d(dfftp%nnr,nspin) + ! INTEGER :: k, ipol, is, nspin0, ir, jpol ! - REAL(DP), ALLOCATABLE :: grho(:,:,:), h(:,:,:), dh(:) - REAL(DP), ALLOCATABLE :: rhoout(:,:), segni(:), vgg(:,:), vsave(:,:) + REAL(DP), ALLOCATABLE :: segni(:), vgg(:,:), vsave(:,:) REAL(DP), ALLOCATABLE :: gmag(:,:,:) - COMPLEX(DP), ALLOCATABLE :: rhogsum(:,:) + REAL(DP), ALLOCATABLE, device :: grho_d(:,:,:), h_d(:,:,:), dh_d(:), rhoout_d(:,:) + + COMPLEX(DP), ALLOCATABLE, device :: rhogsum_d(:,:) ! - REAL(DP) :: grho2(2), sx, sc, v1x, v2x, v1c, v2c, & + REAL(DP) :: grho1, grho2(2), sx, sc, v1x, v2x, v1c, v2c, & v1xup, v1xdw, v2xup, v2xdw, v1cup, v1cdw , & etxcgc, vtxcgc, segno, arho, fac, zeta, rh, grh2, amag REAL(DP) :: v2cup, v2cdw, v2cud, rup, rdw, & grhoup, grhodw, grhoud, grup, grdw, seg ! REAL(DP), PARAMETER :: epsr = 1.D-6, epsg = 1.D-10 + integer :: i, j, igcx, igcc + + ! parameters from expanded gcxc subroutine + real(DP), parameter:: small = 1.E-10_DP + real(DP) :: kf, agrho, s1, s2, ds, dsg, exunif, fx + real(DP) :: dxunif, dfx, f1, f2, f3, dfx1 + real(DP) :: p, amu, ab, c, dfxdp, dfxds, upbe, uge, s, ak, aa + real(DP), parameter :: third = 1._DP / 3._DP, c1 = 0.75_DP / pi , & + c2 = 3.093667726280136_DP, c5 = 4._DP * third, & + c6 = c2*2.51984210, c7=5._DP/6._DP, c8=0.8_DP, & ! (3pi^2)^(1/3)*2^(4/3) + k1 = 0.804_DP, mu1 = 0.21951_DP ! + real(DP), parameter :: ga = 0.031091d0, be1 = 0.066725d0 + real(DP), parameter :: pi34 = 0.6203504908994d0 + real(DP), parameter :: xkf = 1.919158292677513d0, xks = 1.128379167095513d0 + real(DP) :: ks, rs, ec, vc, t, expe, af, bf, y, xy, qy + real(DP) :: h0, dh0, ddh0, sc2D, v1c2D, v2c2D + ! + real(DP) :: a, b1, b2, c0, d0, d1 + parameter (a = 0.031091d0, b1 = 7.5957d0, b2 = 3.5876d0, c0 = a) + real(DP) :: lnrs, rs12, rs32, rs2, om, dom, olog + real(DP), parameter :: a11 = 0.21370d0, b31 = 1.6382d0, b41 = 0.49294d0 + ! ! IF ( .NOT. dft_is_gradient() ) RETURN ! @@ -1126,199 +1146,333 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co if (nspin==4.and.domag) nspin0=2 fac = 1.D0 / DBLE( nspin0 ) ! - ALLOCATE( h( 3, dfftp%nnr, nspin0) ) - ALLOCATE( grho( 3, dfftp%nnr, nspin0) ) - ALLOCATE( rhoout( dfftp%nnr, nspin0) ) - IF (nspin==4.AND.domag) THEN - ALLOCATE( vgg( dfftp%nnr, nspin0 ) ) - ALLOCATE( vsave( dfftp%nnr, nspin ) ) - ALLOCATE( segni( dfftp%nnr ) ) - vsave=v - v=0.d0 - ENDIF + ALLOCATE( h_d( 3, dfftp%nnr, nspin0) ) + ALLOCATE( grho_d( 3, dfftp%nnr, nspin0) ) + ALLOCATE( rhoout_d( dfftp%nnr, nspin0) ) +! IF (nspin==4.AND.domag) THEN +! ALLOCATE( vgg( dfftp%nnr, nspin0 ) ) +! ALLOCATE( vsave( dfftp%nnr, nspin ) ) +! ALLOCATE( segni( dfftp%nnr ) ) +! vsave=v +! v=0.d0 +! ENDIF ! - ALLOCATE( rhogsum( ngm, nspin0 ) ) + ALLOCATE( rhogsum_d( ngm, nspin0 ) ) ! ! ... calculate the gradient of rho + rho_core in real space ! IF ( nspin == 4 .AND. domag ) THEN - ! - CALL compute_rho(rho,rhoout,segni,dfftp%nnr) - ! - ! ... bring starting rhoout to G-space - ! - DO is = 1, nspin0 - ! - psic(:) = rhoout(:,is) - ! - CALL fwfft ('Dense', psic, dfftp) - ! - rhogsum(:,is) = psic(nl(:)) - ! - END DO + print*, "gradcorr_gpu: nspin == 4 .and. domag not supported!"; flush(6); stop +! ! +! CALL compute_rho(rho,rhoout,segni,dfftp%nnr) +! ! +! ! ... bring starting rhoout to G-space +! ! +! DO is = 1, nspin0 +! ! +! psic(:) = rhoout(:,is) +! ! +! CALL fwfft ('Dense', psic, dfftp) +! ! +! rhogsum(:,is) = psic(nl(:)) +! ! +! END DO ELSE ! - rhoout(:,1:nspin0) = rho(:,1:nspin0) - rhogsum(:,1:nspin0) = rhog(:,1:nspin0) + !$cuf kernel do(2) <<<*,*>>> + do j = 1, nspin0 + do i = lbound(rhoout_d,1), ubound(rhoout_d,1) + rhoout_d(i,j) = rho_d(i,j) + end do + end do + + !$cuf kernel do(2) <<<*,*>>> + do j = 1, nspin0 + do i = lbound(rhogsum_d,1), ubound(rhogsum_d,1) + rhogsum_d(i,j) = rhog_d(i,j) + end do + end do ! ENDIF DO is = 1, nspin0 ! - rhoout(:,is) = fac * rho_core(:) + rhoout(:,is) - rhogsum(:,is) = fac * rhog_core(:) + rhogsum(:,is) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(rhoout_d,1), ubound(rhoout_d,1) + rhoout_d(i,is) = fac * rho_core_d(i) + rhoout_d(i,is) + end do + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(rhogsum_d,1), ubound(rhogsum_d,1) + rhogsum_d(i,is) = fac * rhog_core_d(i) + rhogsum_d(i,is) + end do ! call start_clock( 'gradrho' ) - CALL gradrho( dfftp%nnr, rhogsum(1,is), ngm, g, nl, grho(1,1,is) ) + CALL gradrho_gpu( dfftp%nnr, rhogsum_d(1,is), ngm, g_d, nl_d, grho_d(1,1,is) ) call stop_clock( 'gradrho' ) + ! END DO - ! - DEALLOCATE( rhogsum ) + DEALLOCATE( rhogsum_d ) ! IF ( nspin0 == 1 ) THEN - ! - ! ... This is the spin-unpolarised case - ! - DO k = 1, dfftp%nnr + igcx = get_igcx + igcc = get_igcc + + ! + IF (igcx .eq. 3 .and. igcc .eq. 4) THEN + ! + ! ... This is the spin-unpolarised case (on GPU, only for igcx == 3 and igcc == 4 config) + ! + !$cuf kernel do(1) <<<*, *>>> + DO k = 1, dfftp%nnr ! - arho = ABS( rhoout(k,1) ) + arho = ABS( rhoout_d(k,1) ) ! IF ( arho > epsr ) THEN ! - grho2(1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 + grho1 = grho_d(1,k,1)**2 + grho_d(2,k,1)**2 + grho_d(3,k,1)**2 ! - IF ( grho2(1) > epsg ) THEN + IF ( grho1 > epsg ) THEN ! - segno = SIGN( 1.D0, rhoout(k,1) ) + segno = SIGN( 1.D0, rhoout_d(k,1) ) ! - CALL gcxc_gpu( arho, grho2(1), sx, sc, v1x, v2x, v1c, v2c ) + !CALL gcxc_gpu( arho, grho1, sx, sc, v1x, v2x, v1c, v2c ) + if (arho <= small) then + sx = 0.0_DP + v1x = 0.0_DP + v2x = 0.0_DP + else + ! call pbex (rho, grho, 1, sx, v1x, v2x) + agrho = sqrt (grho1) + kf = c2 * arho**third + dsg = 0.5_DP / kf + s1 = agrho * dsg / arho + s2 = s1 * s1 + ds = - c5 * s1 + f1 = s2 * mu1 / k1 + f2 = 1._DP + f1 + f3 = k1 / f2 + fx = k1 - f3 + exunif = - c1 * kf + sx = exunif * fx + dxunif = exunif * third + dfx1 = f2 * f2 + dfx = 2._DP * mu1 * s1 / dfx1 + v1x = sx + dxunif * fx + exunif * dfx * ds + v2x = exunif * dfx * dsg / agrho + sx = sx * arho + endif + + if (arho.le.small) then + sc = 0.0_DP + v1c = 0.0_DP + v2c = 0.0_DP + else + !call pbec (rho, grho, 1, sc, v1c, v2c) + rs = pi34 / arho**third + ! + !call pw_gpu (rs, ec, vc) + rs12 = sqrt (rs) + rs32 = rs * rs12 + rs2 = rs**2 + om = 2.d0 * a * (b1 * rs12 + b2 * rs + b31 * rs32 + b41 * rs2) + dom = 2.d0 * a * (0.5d0 * b1 * rs12 + b2 * rs + 1.5d0 * b31 * rs32 + 2.d0 * b41 * rs2) + olog = log (1.d0 + 1.0d0 / om) + ec = - 2.d0 * a * (1.d0 + a11 * rs) * olog + vc = - 2.d0 * a * (1.d0 + 2.d0 / 3.d0 * a11 * rs) & + * olog - 2.d0 / 3.d0 * a * (1.d0 + a11 * rs) * dom / (om * (om + 1.d0) ) + ! + kf = xkf / rs + ks = xks * sqrt (kf) + t = sqrt (grho1) / (2.d0 * ks * arho) + expe = exp ( - ec / ga) + af = be1 / ga * (1.d0 / (expe-1.d0) ) + bf = expe * (vc - ec) + y = af * t * t + xy = (1.d0 + y) / (1.d0 + y + y * y) + qy = y * y * (2.d0 + y) / (1.d0 + y + y * y) **2 + s1 = 1.d0 + be1 / ga * t * t * xy + h0 = ga * log (s1) + dh0 = be1 * t * t / s1 * ( - 7.d0 / 3.d0 * xy - qy * (af * bf / & + be1-7.d0 / 3.d0) ) + ddh0 = be1 / (2.d0 * ks * ks * arho) * (xy - qy) / s1 + sc = arho * h0 + v1c = h0 + dh0 + v2c = ddh0 + endif ! ! ... first term of the gradient correction : D(rho*Exc)/D(rho) ! - v(k,1) = v(k,1) + e2 * ( v1x + v1c ) + v_d(k,1) = v_d(k,1) + e2 * ( v1x + v1c ) ! ! ... h contains : ! ! ... D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho| ! - h(:,k,1) = e2 * ( v2x + v2c ) * grho(:,k,1) + h_d(:,k,1) = e2 * ( v2x + v2c ) * grho_d(:,k,1) ! - vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout(k,1) - rho_core(k) ) + vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout_d(k,1) - rho_core_d(k) ) etxcgc = etxcgc+e2*( sx + sc ) * segno ! ELSE - h(:,k,1)=0.D0 + h_d(:,k,1)=0.D0 END IF ! - ELSE - ! - h(:,k,1) = 0.D0 - ! - END IF - ! - END DO - ! + ELSE + ! + h_d(:,k,1) = 0.D0 + ! + END IF + ! + END DO + + ELSE + print*, "gradcorr_gpu: should not be here!"; flush(6); stop +! ! +! ! ... This is the spin-unpolarised case +! ! +! DO k = 1, dfftp%nnr +! ! +! arho = ABS( rhoout(k,1) ) +! ! +! IF ( arho > epsr ) THEN +! ! +! grho2(1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 +! ! +! IF ( grho2(1) > epsg ) THEN +! ! +! segno = SIGN( 1.D0, rhoout(k,1) ) +! ! +! CALL gcxc( arho, grho2(1), sx, sc, v1x, v2x, v1c, v2c ) +! ! +! ! ... first term of the gradient correction : D(rho*Exc)/D(rho) +! ! +! v(k,1) = v(k,1) + e2 * ( v1x + v1c ) +! ! +! ! ... h contains : +! ! +! ! ... D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho| +! ! +! h(:,k,1) = e2 * ( v2x + v2c ) * grho(:,k,1) +! ! +! vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout(k,1) - rho_core(k) ) +! etxcgc = etxcgc+e2*( sx + sc ) * segno +! ! +! ELSE +! h(:,k,1)=0.D0 +! END IF +! ! +! ELSE +! ! +! h(:,k,1) = 0.D0 +! ! +! END IF +! ! +! END DO + ENDIF +! ! ELSE ! ! ... spin-polarised case ! -!$omp parallel do private( rh, grho2, sx, v1xup, v1xdw, v2xup, v2xdw, rup, rdw, & -!$omp grhoup, grhodw, grhoud, sc, v1cup, v1cdw, v2cup, v2cdw, v2cud, & -!$omp zeta, grh2, v2c, grup, grdw ), & -!$omp reduction(+:etxcgc,vtxcgc) - DO k = 1, dfftp%nnr - ! - rh = rhoout(k,1) + rhoout(k,2) - ! - grho2(:) = grho(1,k,:)**2 + grho(2,k,:)**2 + grho(3,k,:)**2 - ! - CALL gcx_spin( rhoout(k,1), rhoout(k,2), grho2(1), & - grho2(2), sx, v1xup, v1xdw, v2xup, v2xdw ) - ! - IF ( rh > epsr ) THEN - ! - IF ( igcc_is_lyp() ) THEN - ! - rup = rhoout(k,1) - rdw = rhoout(k,2) - ! - grhoup = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 - grhodw = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2 - ! - grhoud = grho(1,k,1) * grho(1,k,2) + & - grho(2,k,1) * grho(2,k,2) + & - grho(3,k,1) * grho(3,k,2) - ! - CALL gcc_spin_more( rup, rdw, grhoup, grhodw, grhoud, & - sc, v1cup, v1cdw, v2cup, v2cdw, v2cud ) - ! - ELSE - ! - zeta = ( rhoout(k,1) - rhoout(k,2) ) / rh - if (nspin.eq.4.and.domag) zeta=abs(zeta)*segni(k) - ! - grh2 = ( grho(1,k,1) + grho(1,k,2) )**2 + & - ( grho(2,k,1) + grho(2,k,2) )**2 + & - ( grho(3,k,1) + grho(3,k,2) )**2 - ! - CALL gcc_spin( rh, zeta, grh2, sc, v1cup, v1cdw, v2c ) - ! - v2cup = v2c - v2cdw = v2c - v2cud = v2c - ! - END IF - ! - ELSE - ! - sc = 0.D0 - v1cup = 0.D0 - v1cdw = 0.D0 - v2c = 0.D0 - v2cup = 0.D0 - v2cdw = 0.D0 - v2cud = 0.D0 - ! - ENDIF - ! - ! ... first term of the gradient correction : D(rho*Exc)/D(rho) - ! - v(k,1) = v(k,1) + e2 * ( v1xup + v1cup ) - v(k,2) = v(k,2) + e2 * ( v1xdw + v1cdw ) - ! - ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| - ! - DO ipol = 1, 3 - ! - grup = grho(ipol,k,1) - grdw = grho(ipol,k,2) - h(ipol,k,1) = e2 * ( ( v2xup + v2cup ) * grup + v2cud * grdw ) - h(ipol,k,2) = e2 * ( ( v2xdw + v2cdw ) * grdw + v2cud * grup ) - ! - END DO - ! - vtxcgc = vtxcgc + & - e2 * ( v1xup + v1cup ) * ( rhoout(k,1) - rho_core(k) * fac ) - vtxcgc = vtxcgc + & - e2 * ( v1xdw + v1cdw ) * ( rhoout(k,2) - rho_core(k) * fac ) - etxcgc = etxcgc + e2 * ( sx + sc ) - ! - END DO -!$omp end parallel do + print*, "gradcorr_gpu: SPIN POLARISED not supported!"; flush(6); stop +!!$omp parallel do private( rh, grho2, sx, v1xup, v1xdw, v2xup, v2xdw, rup, rdw, & +!!$omp grhoup, grhodw, grhoud, sc, v1cup, v1cdw, v2cup, v2cdw, v2cud, & +!!$omp zeta, grh2, v2c, grup, grdw ), & +!!$omp reduction(+:etxcgc,vtxcgc) +! DO k = 1, dfftp%nnr +! ! +! rh = rhoout(k,1) + rhoout(k,2) +! ! +! grho2(:) = grho(1,k,:)**2 + grho(2,k,:)**2 + grho(3,k,:)**2 +! ! +! CALL gcx_spin( rhoout(k,1), rhoout(k,2), grho2(1), & +! grho2(2), sx, v1xup, v1xdw, v2xup, v2xdw ) +! ! +! IF ( rh > epsr ) THEN +! ! +! IF ( igcc_is_lyp() ) THEN +! ! +! rup = rhoout(k,1) +! rdw = rhoout(k,2) +! ! +! grhoup = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 +! grhodw = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2 +! ! +! grhoud = grho(1,k,1) * grho(1,k,2) + & +! grho(2,k,1) * grho(2,k,2) + & +! grho(3,k,1) * grho(3,k,2) +! ! +! CALL gcc_spin_more( rup, rdw, grhoup, grhodw, grhoud, & +! sc, v1cup, v1cdw, v2cup, v2cdw, v2cud ) +! ! +! ELSE +! ! +! zeta = ( rhoout(k,1) - rhoout(k,2) ) / rh +! if (nspin.eq.4.and.domag) zeta=abs(zeta)*segni(k) +! ! +! grh2 = ( grho(1,k,1) + grho(1,k,2) )**2 + & +! ( grho(2,k,1) + grho(2,k,2) )**2 + & +! ( grho(3,k,1) + grho(3,k,2) )**2 +! ! +! CALL gcc_spin( rh, zeta, grh2, sc, v1cup, v1cdw, v2c ) +! ! +! v2cup = v2c +! v2cdw = v2c +! v2cud = v2c +! ! +! END IF +! ! +! ELSE +! ! +! sc = 0.D0 +! v1cup = 0.D0 +! v1cdw = 0.D0 +! v2c = 0.D0 +! v2cup = 0.D0 +! v2cdw = 0.D0 +! v2cud = 0.D0 +! ! +! ENDIF +! ! +! ! ... first term of the gradient correction : D(rho*Exc)/D(rho) +! ! +! v(k,1) = v(k,1) + e2 * ( v1xup + v1cup ) +! v(k,2) = v(k,2) + e2 * ( v1xdw + v1cdw ) +! ! +! ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| +! ! +! DO ipol = 1, 3 +! ! +! grup = grho(ipol,k,1) +! grdw = grho(ipol,k,2) +! h(ipol,k,1) = e2 * ( ( v2xup + v2cup ) * grup + v2cud * grdw ) +! h(ipol,k,2) = e2 * ( ( v2xdw + v2cdw ) * grdw + v2cud * grup ) +! ! +! END DO +! ! +! vtxcgc = vtxcgc + & +! e2 * ( v1xup + v1cup ) * ( rhoout(k,1) - rho_core(k) * fac ) +! vtxcgc = vtxcgc + & +! e2 * ( v1xdw + v1cdw ) * ( rhoout(k,2) - rho_core(k) * fac ) +! etxcgc = etxcgc + e2 * ( sx + sc ) +! ! +! END DO +!!$omp end parallel do ! END IF ! - DO is = 1, nspin0 - ! - rhoout(:,is) = rhoout(:,is) - fac * rho_core(:) - ! - END DO + !$cuf kernel do(2) + do is = 1, nspin0 + do i = lbound(rhoout_d, 1), ubound(rhoout_d,1) + rhoout_d(i,is) = rhoout_d(i,is) - fac * rho_core_d(i) + end do + end do ! - DEALLOCATE( grho ) + DEALLOCATE( grho_d ) ! - ALLOCATE( dh( dfftp%nnr ) ) + ALLOCATE( dh_d( dfftp%nnr ) ) ! ! ... second term of the gradient correction : ! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) ) @@ -1326,44 +1480,50 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co DO is = 1, nspin0 ! call start_clock( 'graddot' ) - CALL grad_dot( dfftp%nnr, h(1,1,is), ngm, g, nl, alat, dh ) + CALL grad_dot_gpu( dfftp%nnr, h_d(1,1,is), ngm, g_d, nl_d, alat, dh_d ) call stop_clock( 'graddot' ) ! - v(:,is) = v(:,is) - dh(:) + !$cuf kernel do(1) + do i = lbound(v_d, 1), ubound(v_d,1) + v_d(i,is) = v_d(i,is) - dh_d(i) + end do ! - vtxcgc = vtxcgc - SUM( dh(:) * rhoout(:,is) ) + !$cuf kernel do(1) + do i = lbound(dh_d, 1), ubound(dh_d,1) + vtxcgc = vtxcgc - dh_d(i) * rhoout_d(i,is) + end do ! END DO + ! vtxc = vtxc + omega * vtxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 ) etxc = etxc + omega * etxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 ) - IF (nspin==4.AND.domag) THEN - DO is=1,nspin0 - vgg(:,is)=v(:,is) - ENDDO - v=vsave - DO k=1,dfftp%nnr - v(k,1)=v(k,1)+0.5d0*(vgg(k,1)+vgg(k,2)) - amag=sqrt(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2) - IF (amag.GT.1.d-12) THEN - v(k,2)=v(k,2)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,2)/amag - v(k,3)=v(k,3)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,3)/amag - v(k,4)=v(k,4)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag - ENDIF - ENDDO - ENDIF - ! - DEALLOCATE( dh ) - DEALLOCATE( h ) - DEALLOCATE( rhoout ) +! IF (nspin==4.AND.domag) THEN +! DO is=1,nspin0 +! vgg(:,is)=v(:,is) +! ENDDO +! v=vsave +! DO k=1,dfftp%nnr +! v(k,1)=v(k,1)+0.5d0*(vgg(k,1)+vgg(k,2)) +! amag=sqrt(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2) +! IF (amag.GT.1.d-12) THEN +! v(k,2)=v(k,2)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,2)/amag +! v(k,3)=v(k,3)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,3)/amag +! v(k,4)=v(k,4)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag +! ENDIF +! ENDDO +! ENDIF + ! + DEALLOCATE( dh_d ) + DEALLOCATE( h_d ) + DEALLOCATE( rhoout_d ) IF (nspin==4.and.domag) THEN DEALLOCATE( vgg ) DEALLOCATE( vsave ) DEALLOCATE( segni ) ENDIF ! - call stop_clock( 'gradcorr' ) RETURN ! @@ -1387,16 +1547,17 @@ SUBROUTINE gradrho_gpu( nrxx, a, ngm, g, nl, ga ) IMPLICIT NONE ! INTEGER, INTENT(IN) :: nrxx - INTEGER, INTENT(IN) :: ngm, nl(ngm) - COMPLEX(DP), INTENT(IN) :: a(ngm) - REAL(DP), INTENT(IN) :: g(3,ngm) - REAL(DP), INTENT(OUT) :: ga(3,nrxx) + INTEGER, INTENT(IN) :: ngm + INTEGER, INTENT(IN), device :: nl(ngm) + COMPLEX(DP), INTENT(IN), device :: a(ngm) + REAL(DP), INTENT(IN), device :: g(3,ngm) + REAL(DP), INTENT(OUT), device :: ga(3,nrxx) ! - INTEGER :: ipol - COMPLEX(DP), ALLOCATABLE :: gaux(:) + INTEGER :: i, ipol + COMPLEX(DP), ALLOCATABLE, device :: gaux_d(:) ! ! - ALLOCATE( gaux( nrxx ) ) + ALLOCATE( gaux_d (nrxx) ) ! ! ... multiply by (iG) to get (\grad_ipol a)(G) ... ! @@ -1404,27 +1565,35 @@ SUBROUTINE gradrho_gpu( nrxx, a, ngm, g, nl, ga ) ! DO ipol = 1, 3 ! - gaux(:) = CMPLX(0.d0,0.d0,kind=dp) + gaux_d(:) = CMPLX(0.d0,0.d0,kind=dp) ! - gaux(nl(:)) = g(ipol,:) * CMPLX( -AIMAG( a(:) ), REAL( a(:) ) ,kind=DP) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(a,1), ubound(a,1) + gaux_d(nl(i)) = g(ipol,i) * CMPLX( -AIMAG( a(i) ), REAL( a(i) ) ,kind=DP) + end do ! IF ( gamma_only ) THEN + print*, "gradrho_gpu: GAMMA_ONLY not implemented!" + flush(6); stop ! - gaux(nlm(:)) = CMPLX( REAL( gaux(nl(:)) ), -AIMAG( gaux(nl(:)) ) ,kind=DP) + !gaux(nlm(:)) = CMPLX( REAL( gaux(nl(:)) ), -AIMAG( gaux(nl(:)) ) ,kind=DP) ! END IF ! ! ... bring back to R-space, (\grad_ipol a)(r) ... ! - CALL invfft ('Dense', gaux, dfftp) + CALL invfft ('Dense', gaux_d, dfftp) ! ! ...and add the factor 2\pi/a missing in the definition of G ! - ga(ipol,:) = ga(ipol,:) + tpiba * REAL( gaux(:) ) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(ga,2), ubound(ga,2) + ga(ipol,i) = ga(ipol,i) + tpiba * REAL( gaux_d(i) ) + end do ! END DO ! - DEALLOCATE( gaux ) + DEALLOCATE( gaux_d ) ! RETURN ! @@ -1512,54 +1681,65 @@ SUBROUTINE grad_dot_gpu( nrxx, a, ngm, g, nl, alat, da ) ! IMPLICIT NONE ! - INTEGER, INTENT(IN) :: nrxx, ngm, nl(ngm) - REAL(DP), INTENT(IN) :: a(3,nrxx), g(3,ngm), alat - REAL(DP), INTENT(OUT) :: da(nrxx) + INTEGER, INTENT(IN) :: nrxx, ngm + INTEGER, INTENT(IN), device :: nl(ngm) + REAL(DP), INTENT(IN) :: alat + REAL(DP), INTENT(IN), device :: a(3,nrxx), g(3,ngm) + REAL(DP), INTENT(OUT), device :: da(nrxx) ! - INTEGER :: n, ipol - COMPLEX(DP), ALLOCATABLE :: aux(:), gaux(:) + INTEGER :: n, ipol, i + COMPLEX(DP), ALLOCATABLE, device :: aux_d(:), gaux_d(:) ! ! - ALLOCATE( aux( nrxx ), gaux( nrxx ) ) + ALLOCATE( aux_d( nrxx ), gaux_d( nrxx ) ) ! - gaux(:) = CMPLX(0.d0,0.d0, kind=dp) + gaux_d(:) = CMPLX(0.d0,0.d0, kind=dp) ! DO ipol = 1, 3 ! - aux = CMPLX( a(ipol,:), 0.D0 ,kind=DP) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(a,2), ubound(a,2) + aux_d(i) = CMPLX( a(ipol,i), 0.D0 ,kind=DP) + end do ! ! ... bring a(ipol,r) to G-space, a(G) ... ! - CALL fwfft ('Dense', aux, dfftp) + CALL fwfft ('Dense', aux_d, dfftp) ! + !$cuf kernel do(1) <<<*,*>>> DO n = 1, ngm ! - gaux(nl(n)) = gaux(nl(n)) + g(ipol,n) * & - CMPLX( -AIMAG( aux(nl(n)) ), REAL( aux(nl(n)) ) ,kind=DP) + gaux_d(nl(n)) = gaux_d(nl(n)) + g(ipol,n) * & + CMPLX( -AIMAG( aux_d(nl(n)) ), REAL( aux_d(nl(n)) ) ,kind=DP) ! END DO ! END DO ! IF ( gamma_only ) THEN + print*, "grad_dot_gpu: GAMMA_ONLY not implemented!!" + flush(6); stop ! - DO n = 1, ngm - ! - gaux(nlm(n)) = CONJG( gaux(nl(n)) ) - ! - END DO + !DO n = 1, ngm + ! ! + ! gaux(nlm(n)) = CONJG( gaux(nl(n)) ) + ! ! + !END DO ! END IF ! ! ... bring back to R-space, (\grad_ipol a)(r) ... ! - CALL invfft ('Dense', gaux, dfftp) + CALL invfft ('Dense', gaux_d, dfftp) ! ! ... add the factor 2\pi/a missing in the definition of G and sum ! - da(:) = tpiba * REAL( gaux(:) ) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(da,1), ubound(da,1) + da(i) = tpiba * REAL( gaux_d(i) ) + end do ! - DEALLOCATE( aux, gaux ) + DEALLOCATE( aux_d, gaux_d ) ! RETURN ! diff --git a/PW/src/scf_mod.f90 b/PW/src/scf_mod.f90 index 296bf62dd..9b1f057d2 100644 --- a/PW/src/scf_mod.f90 +++ b/PW/src/scf_mod.f90 @@ -66,7 +66,7 @@ MODULE scf REAL(DP) :: el_dipole ! electrons dipole END TYPE mix_type - type (scf_type) :: rho ! the charge density and its other components + type (scf_type), TARGET :: rho ! the charge density and its other components type (scf_type) :: v ! the scf potential type (scf_type) :: vnew ! used to correct the forces @@ -83,7 +83,7 @@ MODULE scf rho_core_d(:), &! the core charge in real space kedtau_d(:,:) ! position dependent kinetic energy enhancement factor - COMPLEX(DP), ALLOCATABLE :: & + COMPLEX(DP), DEVICE, ALLOCATABLE :: & rhog_core_d(:) ! the core charge in reciprocal space #endif @@ -220,7 +220,11 @@ subroutine assign_scf_to_mix_type(rho_s, rho_m) end subroutine assign_scf_to_mix_type ! subroutine assign_mix_to_scf_type(rho_m, rho_s) +#ifdef USE_CUDA + USE wavefunctions_module, ONLY : psic => psic_d +#else USE wavefunctions_module, ONLY : psic +#endif USE control_flags, ONLY : gamma_only USE gvect, ONLY : nl, nlm IMPLICIT NONE @@ -337,7 +341,11 @@ subroutine mix_type_SCAL (A,X) end subroutine mix_type_SCAL ! subroutine high_frequency_mixing ( rhoin, input_rhout, alphamix ) +#ifdef USE_CUDA + USE wavefunctions_module, ONLY : psic => psic_d +#else USE wavefunctions_module, ONLY : psic +#endif USE control_flags, ONLY : gamma_only USE gvect, ONLY : nl, nlm IMPLICIT NONE @@ -383,7 +391,6 @@ subroutine high_frequency_mixing ( rhoin, input_rhout, alphamix ) return end subroutine high_frequency_mixing - subroutine open_mix_file( iunit, extension, exst ) USE control_flags, ONLY : io_level implicit none diff --git a/PW/src/set_vrs.f90 b/PW/src/set_vrs.f90 index afc19c347..b6341f13a 100644 --- a/PW/src/set_vrs.f90 +++ b/PW/src/set_vrs.f90 @@ -115,3 +115,84 @@ subroutine interpolate_vrs ( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs ) return end subroutine interpolate_vrs + +#ifdef USE_CUDA +!-------------------------------------------------------------------- +subroutine sum_vrs_gpu ( nrxx, nspin, vltot, vr, vrs ) + !-------------------------------------------------------------------- + ! accumulates local potential contributions in to vrs + ! + USE kinds + ! + implicit none + + integer :: nspin, nrxx + ! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear + ! input: the fft grid dimension + real(DP), device :: vrs (nrxx, nspin), vltot (nrxx), vr (nrxx, nspin) + ! output: total local potential on the smooth grid + ! vrs=vltot+vr + ! input: the total local pseudopotential + ! input: the scf(H+xc) part of the local potential + + integer:: is, i + + do is = 1, nspin + ! + ! define the total local potential (external + scf) for each spin ... + ! + if (is > 1 .and. nspin == 4) then + ! + ! noncolinear case: only the first component contains vltot + ! + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(vrs,1), ubound(vrs, 1) + vrs (i, is) = vr (i, is) + end do + else + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(vrs,1), ubound(vrs, 1) + vrs (i, is) = vltot (i) + vr (i, is) + end do + end if + ! + enddo + return + +end subroutine sum_vrs_gpu +! +!-------------------------------------------------------------------- +subroutine interpolate_vrs_gpu ( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs ) + !-------------------------------------------------------------------- + ! set the total local potential vrs on the smooth mesh to be used in + ! h_psi, adding the (spin dependent) scf (H+xc) part and the sum of + ! all the local pseudopotential contributions. + ! + USE kinds + USE funct, only : dft_is_meta + USE fft_base, only : dffts + implicit none + + integer :: nspin, nrxx + ! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear + ! input: the fft grid dimension + real(DP) :: kedtau(dffts%nnr,nspin), kedtaur(nrxx,nspin) + real(DP), device :: vrs (nrxx, nspin) + ! output: total local potential interpolated on the smooth grid + ! input: the scf(H+xc) part of the local potential + logical :: doublegrid + ! input: true if a doublegrid is used + + integer:: is + + do is = 1, nspin + ! + ! ... and interpolate it on the smooth mesh if necessary + ! + if (doublegrid) call interpolate_gpu (vrs (1, is), vrs (1, is), - 1) + if (dft_is_meta()) call interpolate(kedtaur(1,is),kedtau(1,is),-1) + enddo + return + +end subroutine interpolate_vrs_gpu +#endif diff --git a/PW/src/sum_band.f90 b/PW/src/sum_band.f90 index 33ae7d8e3..fc44776f9 100644 --- a/PW/src/sum_band.f90 +++ b/PW/src/sum_band.f90 @@ -1280,6 +1280,7 @@ SUBROUTINE sum_band_gpu() ik, &! counter on k points ibnd_start, ibnd_end, this_bgrp_nbnd ! first, last and number of band in this bgrp REAL (DP), ALLOCATABLE :: kplusg (:) + integer :: i ! ! CALL start_clock( 'sum_band' ) @@ -1405,10 +1406,23 @@ SUBROUTINE sum_band_gpu() ! ! ... bring the (unsymmetrized) rho(r) to G-space (use psic as work array) ! + !DO is = 1, nspin + ! psic(:) = rho%of_r(:,is) + ! CALL fwfft ('Dense', psic, dfftp) + ! rho%of_g(:,is) = psic(nl(:)) + !END DO + + !rho%of_r_d(:,:) = rho%of_r(:,:) + DO is = 1, nspin - psic(:) = rho%of_r(:,is) - CALL fwfft ('Dense', psic, dfftp) - rho%of_g(:,is) = psic(nl(:)) + psic_d(:) = rho%of_r(:,is) !JR: This memcopy is very slow + !!$cuf kernel do(1) <<<*, *>>> + !do i = lbound(psic_d,1), ubound(psic_d,1) + ! psic_d(i) = rho%of_r_d(i,is) + !end do + + CALL fwfft ('Dense', psic_d, dfftp) + rho%of_g(:,is) = psic_d(nl(:)) END DO ! ! ... symmetrize rho(G) @@ -1430,11 +1444,11 @@ SUBROUTINE sum_band_gpu() ! DO is = 1, nspin_mag ! - psic(:) = ( 0.D0, 0.D0 ) - psic(nl(:)) = rho%of_g(:,is) - IF ( gamma_only ) psic(nlm(:)) = CONJG( rho%of_g(:,is) ) - CALL invfft ('Dense', psic, dfftp) - rho%of_r(:,is) = psic(:) + psic_d(:) = ( 0.D0, 0.D0 ) + psic_d(nl(:)) = rho%of_g(:,is) + IF ( gamma_only ) psic_d(nlm(:)) = CONJG( rho%of_g(:,is) ) + CALL invfft ('Dense', psic_d, dfftp) + rho%of_r(:,is) = psic_d(:) ! END DO ! diff --git a/PW/src/v_of_rho.f90 b/PW/src/v_of_rho.f90 index 314ac2e62..b8c2cb213 100644 --- a/PW/src/v_of_rho.f90 +++ b/PW/src/v_of_rho.f90 @@ -513,7 +513,7 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) ! ! ... add gradient corrections (if any) ! - CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v ) + CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v ) ! ! ... add non local corrections (if any) ! @@ -1198,10 +1198,8 @@ SUBROUTINE gradv_h_of_rho_r( rho, gradv ) END SUBROUTINE gradv_h_of_rho_r #ifdef USE_CUDA -!#if 0 - -SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, & - ehart, etxc, vtxc, eth, etotefield, charge, v ) +SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, & + rhog_core_d, ehart, etxc, vtxc, eth, etotefield, charge, v ) !---------------------------------------------------------------------------- ! ! ... This routine computes the Hartree and Exchange and Correlation @@ -1229,15 +1227,11 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, & !!!!!!!!!!!!!!!!! not just OUT because otherwise their allocatable or pointer !!!!!!!!!!!!!!!!! components are NOT defined !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! REAL(DP), INTENT(IN) :: rho_core(dfftp%nnr) + REAL(DP), INTENT(IN), device :: rho_core_d(dfftp%nnr) ! the core charge COMPLEX(DP), INTENT(IN) :: rhog_core(ngm) + COMPLEX(DP), INTENT(IN), device :: rhog_core_d(ngm) ! the core charge in reciprocal space - - REAL(DP), device, INTENT(IN) :: rho_core_d(dfftp%nnr) - ! the core charge - COMPLEX(DP), device, INTENT(IN) :: rhog_core_d(ngm) - ! the core charge in reciprocal space - REAL(DP), INTENT(OUT) :: vtxc, etxc, ehart, eth, charge ! the integral V_xc * rho ! the E_xc energy @@ -1254,20 +1248,30 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, & ! ... calculate exchange-correlation potential ! if (dft_is_meta()) then - print *,"dft_is_meta NOT implemented!!!!" - STOP - !call v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r ) + print*, "v_of_rho_gpu: meta not implemented!"; flush(6); stop + call v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r ) else - CALL v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v%of_r, v%of_r_d ) + CALL v_xc_gpu( rho, rho_core_d, rhog_core_d, etxc, vtxc, v%of_r_d ) endif ! ! ... add a magnetic field (if any) ! + !JR Need to uncomment memcopies if bfield present + !v%of_r = v%of_r_d CALL add_bfield( v%of_r, rho%of_r ) + !v%of_r_d = v%of_r + ! ! ... calculate hartree potential ! - CALL v_h( rho%of_g, ehart, charge, v%of_r ) + !JR Note: Using v_h_gpu does impact last printed digits of total and HF estimate energy results. + CALL v_h_gpu(rho%of_g_d, ehart, charge, v%of_r_d ) + v%of_r = v%of_r_d + + !JR Replace with CPU call here if desired + !v%of_r = v%of_r_d + !CALL v_h(rho%of_g, ehart, charge, v%of_r ) + ! ! ... LDA+U: build up Hubbard potential ! @@ -1301,6 +1305,7 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, & RETURN ! END SUBROUTINE v_of_rho_gpu +! !---------------------------------------------------------------------------- SUBROUTINE v_xc_meta_gpu( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) !---------------------------------------------------------------------------- @@ -1526,8 +1531,8 @@ SUBROUTINE v_xc_meta_gpu( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) RETURN ! END SUBROUTINE v_xc_meta_gpu -! -SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d ) + +SUBROUTINE v_xc_gpu( rho, rho_core_d, rhog_core_d, etxc, vtxc, v_d ) !---------------------------------------------------------------------------- ! ! ... Exchange-Correlation potential Vxc(r) from n(r) @@ -1540,39 +1545,28 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt USE lsda_mod, ONLY : nspin USE cell_base, ONLY : omega USE spin_orb, ONLY : domag - USE funct, ONLY : xc, xc_spin, nlc, dft_is_nonlocc + USE funct, ONLY : xc, xc_spin, nlc, dft_is_nonlocc, get_iexch, get_icorr USE scf, ONLY : scf_type USE mp_bands, ONLY : intra_bgrp_comm USE mp, ONLY : mp_sum - USE cudafor - ! IMPLICIT NONE ! - TYPE (scf_type), INTENT(INOUT) :: rho - REAL(DP), INTENT(IN) :: rho_core(dfftp%nnr) - ! the core charge - COMPLEX(DP), INTENT(IN) :: rhog_core(ngm) - ! input: the core charge in reciprocal space - REAL(DP), INTENT(OUT) :: v(dfftp%nnr,nspin), vtxc, etxc - ! V_xc potential - ! integral V_xc * rho - ! E_xc energy - - REAL(DP), DEVICE, INTENT(IN) :: rho_core_d(dfftp%nnr) + TYPE (scf_type), INTENT(INOUT), target :: rho + REAL(DP), INTENT(INOUT), device :: rho_core_d(dfftp%nnr) ! the core charge - COMPLEX(DP), DEVICE, INTENT(IN) :: rhog_core_d(ngm) + COMPLEX(DP), INTENT(IN), device :: rhog_core_d(ngm) ! input: the core charge in reciprocal space - REAL(DP), DEVICE, INTENT(OUT) :: v_d(dfftp%nnr,nspin) + REAL(DP), INTENT(OUT) :: vtxc, etxc + REAL(DP), INTENT(OUT), device :: v_d(dfftp%nnr,nspin) ! V_xc potential ! integral V_xc * rho ! E_xc energy - ! ! ... local variables ! - REAL(DP) :: rhox, arhox, zeta, amag, vs, ex, ec, vx(2), vc(2), rhoneg(2) + REAL(DP) :: rhox, arhox, zeta, amag, vs, ex, ec, vx(2), vc(2), rhoneg(2), vx1, vc1, rhoneg1 ! the total charge in each point ! the absolute value of the charge ! the absolute value of the charge @@ -1586,160 +1580,218 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt ! REAL(DP), PARAMETER :: vanishing_charge = 1.D-10, & vanishing_mag = 1.D-20 + + integer :: iexch, icorr + real(DP), pointer, device :: rho_of_r_d(:,:) + + ! parameters from expanded subroutines from xc + real(DP), parameter :: small = 1.E-10_DP, third = 1.0_DP / 3.0_DP, & + pi34 = 0.6203504908994_DP ! pi34=(3/4pi)^(1/3) + real(DP) :: rs + real(dp), parameter :: f= -0.687247939924714d0, alpha = 2.0d0/3.0d0 + ! f = -9/8*(3/2pi)^(2/3) + real(DP) :: a, b1, b2, c0, c1, c2, c3, d0, d1 + parameter (a = 0.031091d0, b1 = 7.5957d0, b2 = 3.5876d0, c0 = a, & + c1 = 0.046644d0, c2 = 0.00664d0, c3 = 0.01043d0, d0 = 0.4335d0, & + d1 = 1.4408d0) + real(DP) :: lnrs, rs12, rs32, rs2, om, dom, olog + real(DP) :: a11, b31, b41 + parameter (a11 = 0.21370d0, b31 = 1.6382d0, b41 = 0.49294d0) ! ! CALL start_clock( 'v_xc' ) ! etxc = 0.D0 vtxc = 0.D0 - - v(:,:) = 0.D0 - !v_d(:,:) = 0.D0 - + v_d(:,:) = 0.D0 rhoneg = 0.D0 ! - - ! + CALL start_clock( 'spin' ) IF ( nspin == 1 .OR. ( nspin == 4 .AND. .NOT. domag ) ) THEN - ! - ! ... spin-unpolarized case - ! -!$omp parallel do private( rhox, arhox, ex, ec, vx, vc ), & -!$omp reduction(+:etxc,vtxc), reduction(-:rhoneg) - DO ir = 1, dfftp%nnr - ! - rhox = rho%of_r(ir,1) + rho_core(ir) - ! - arhox = ABS( rhox ) - ! - IF ( arhox > vanishing_charge ) THEN - ! - CALL xc( arhox, ex, ec, vx(1), vc(1) ) - ! - v(ir,1) = e2*( vx(1) + vc(1) ) - ! - etxc = etxc + e2*( ex + ec ) * rhox - ! - vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) - ! - ENDIF - ! - IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) - ! - END DO -!$omp end parallel do + iexch = get_iexch + icorr = get_icorr -#if 0 -!$cuf kernel do(1) <<<*,*>>> - DO ir = 1, dfftp%nnr - ! - rhox = rho%of_r_d(ir,1) + rho_core_d(ir) - ! - arhox = ABS( rhox ) - ! - IF ( arhox > vanishing_charge ) THEN - ! - CALL xc( arhox, ex, ec, vx(1), vc(1) ) - ! - v(ir,1) = e2*( vx(1) + vc(1) ) - ! - etxc = etxc + e2*( ex + ec ) * rhox - ! - vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) - ! - ENDIF - ! - IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) - ! - END DO -#endif + IF (iexch .eq. 1 .and. icorr .eq. 4) THEN + + rho_of_r_d => rho%of_r_d + + ! + ! ... spin-unpolarized case + ! + + !$cuf kernel do (1) <<<*,*>>> + DO ir = 1, dfftp%nnr + ! + rhox = rho_of_r_d(ir,1) + rho_core_d(ir) + ! + arhox = ABS( rhox ) + ! + IF ( arhox > vanishing_charge ) THEN + if (arhox <= small) then + ec = 0.0_DP + vc1 = 0.0_DP + ex = 0.0_DP + vx1 = 0.0_DP + else + rs = pi34 / arhox**third + ! + !CALL xc( arhox, ex, ec, vx(1), vc(1) ) + !call slater (rs, ex, vx(1)) + ex = f * alpha / rs + vx1 = 4.d0 / 3.d0 * f * alpha / rs + + !call pw (rs, 1, ec, vc(1)) + ! interpolation formula + rs12 = sqrt (rs) + rs32 = rs * rs12 + rs2 = rs**2 + om = 2.d0 * a * (b1 * rs12 + b2 * rs + b31 * rs32 + b41 & + * rs2) + dom = 2.d0 * a * (0.5d0 * b1 * rs12 + b2 * rs + 1.5d0 * b31 & + * rs32 + 2.d0 * b41 * rs2) + olog = log (1.d0 + 1.0d0 / om) + ec = - 2.d0 * a * (1.d0 + a11 * rs) * olog + vc1 = - 2.d0 * a * (1.d0 + 2.d0 / 3.d0 * a11 * rs) & + * olog - 2.d0 / 3.d0 * a * (1.d0 + a11 * rs) * dom / & + (om * (om + 1.d0) ) + endif + ! + v_d(ir,1) = e2*( vx1 + vc1 ) + ! + etxc = etxc + e2*( ex + ec ) * rhox + ! + vtxc = vtxc + v_d(ir,1) * rho_of_r_d(ir,1) + ! + ENDIF + ! + IF ( rho_of_r_d(ir,1) < 0.D0 ) rhoneg1 = rhoneg1 - rho_of_r_d(ir,1) + ! + END DO + + rhoneg(1) = rhoneg1 ! copy reduced variable + ELSE + print*, "v_xc_gpu: Should not be here!"; flush(6); stop + +! ! +! ! ... spin-unpolarized case +! ! +!!$omp parallel do private( rhox, arhox, ex, ec, vx, vc ), & +!!$omp reduction(+:etxc,vtxc), reduction(-:rhoneg) +! DO ir = 1, dfftp%nnr +! ! +! rhox = rho%of_r(ir,1) + rho_core(ir) +! ! +! arhox = ABS( rhox ) +! ! +! IF ( arhox > vanishing_charge ) THEN +! ! +! CALL xc( arhox, ex, ec, vx(1), vc(1) ) +! ! +! v(ir,1) = e2*( vx(1) + vc(1) ) +! ! +! etxc = etxc + e2*( ex + ec ) * rhox +! ! +! vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) +! ! +! ENDIF +! ! +! IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) +! ! +! END DO +!!$omp end parallel do + END IF ! ELSE IF ( nspin == 2 ) THEN ! ! ... spin-polarized case ! -!$omp parallel do private( rhox, arhox, zeta, ex, ec, vx, vc ), & -!$omp reduction(+:etxc,vtxc), reduction(-:rhoneg) - DO ir = 1, dfftp%nnr - ! - rhox = rho%of_r(ir,1) + rho%of_r(ir,2) + rho_core(ir) - ! - arhox = ABS( rhox ) - ! - IF ( arhox > vanishing_charge ) THEN - ! - zeta = ( rho%of_r(ir,1) - rho%of_r(ir,2) ) / arhox - ! - IF ( ABS( zeta ) > 1.D0 ) zeta = SIGN( 1.D0, zeta ) - ! - IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) - IF ( rho%of_r(ir,2) < 0.D0 ) rhoneg(2) = rhoneg(2) - rho%of_r(ir,2) - ! - CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) ) - ! - v(ir,:) = e2*( vx(:) + vc(:) ) - ! - etxc = etxc + e2*( ex + ec ) * rhox - ! - vtxc = vtxc + ( v(ir,1)*rho%of_r(ir,1) + v(ir,2)*rho%of_r(ir,2) ) - ! - END IF - ! - END DO -!$omp end parallel do + print*, "v_xc_gpu: SPIN POLOARIZED not supported!"; flush(6); STOP + +!!$omp parallel do private( rhox, arhox, zeta, ex, ec, vx, vc ), & +!!$omp reduction(+:etxc,vtxc), reduction(-:rhoneg) +! DO ir = 1, dfftp%nnr +! ! +! rhox = rho%of_r(ir,1) + rho%of_r(ir,2) + rho_core(ir) +! ! +! arhox = ABS( rhox ) +! ! +! IF ( arhox > vanishing_charge ) THEN +! ! +! zeta = ( rho%of_r(ir,1) - rho%of_r(ir,2) ) / arhox +! ! +! IF ( ABS( zeta ) > 1.D0 ) zeta = SIGN( 1.D0, zeta ) +! ! +! IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) +! IF ( rho%of_r(ir,2) < 0.D0 ) rhoneg(2) = rhoneg(2) - rho%of_r(ir,2) +! ! +! CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) ) +! ! +! v(ir,:) = e2*( vx(:) + vc(:) ) +! ! +! etxc = etxc + e2*( ex + ec ) * rhox +! ! +! vtxc = vtxc + ( v(ir,1)*rho%of_r(ir,1) + v(ir,2)*rho%of_r(ir,2) ) +! ! +! END IF +! ! +! END DO +!!$omp end parallel do ! ELSE IF ( nspin == 4 ) THEN ! ! ... noncolinear case ! - DO ir = 1,dfftp%nnr - ! - amag = SQRT( rho%of_r(ir,2)**2 + rho%of_r(ir,3)**2 + rho%of_r(ir,4)**2 ) - ! - rhox = rho%of_r(ir,1) + rho_core(ir) - ! - IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) - ! - arhox = ABS( rhox ) - ! - IF ( arhox > vanishing_charge ) THEN - ! - zeta = amag / arhox - ! - IF ( ABS( zeta ) > 1.D0 ) THEN - ! - rhoneg(2) = rhoneg(2) + 1.D0 / omega - ! - zeta = SIGN( 1.D0, zeta ) - ! - END IF - ! - CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) ) - ! - vs = 0.5D0*( vx(1) + vc(1) - vx(2) - vc(2) ) - ! - v(ir,1) = e2*( 0.5D0*( vx(1) + vc(1) + vx(2) + vc(2 ) ) ) - ! - IF ( amag > vanishing_mag ) THEN - ! - DO ipol = 2, 4 - ! - v(ir,ipol) = e2 * vs * rho%of_r(ir,ipol) / amag - ! - vtxc = vtxc + v(ir,ipol) * rho%of_r(ir,ipol) - ! - END DO - ! - END IF - ! - etxc = etxc + e2*( ex + ec ) * rhox - vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) - ! - END IF - ! - END DO + print*, "v_xc_gpu: NONCOLINEAR not supported!"; flush(6); STOP + +! DO ir = 1,dfftp%nnr +! ! +! amag = SQRT( rho%of_r(ir,2)**2 + rho%of_r(ir,3)**2 + rho%of_r(ir,4)**2 ) +! ! +! rhox = rho%of_r(ir,1) + rho_core(ir) +! ! +! IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) +! ! +! arhox = ABS( rhox ) +! ! +! IF ( arhox > vanishing_charge ) THEN +! ! +! zeta = amag / arhox +! ! +! IF ( ABS( zeta ) > 1.D0 ) THEN +! ! +! rhoneg(2) = rhoneg(2) + 1.D0 / omega +! ! +! zeta = SIGN( 1.D0, zeta ) +! ! +! END IF +! ! +! CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) ) +! ! +! vs = 0.5D0*( vx(1) + vc(1) - vx(2) - vc(2) ) +! ! +! v(ir,1) = e2*( 0.5D0*( vx(1) + vc(1) + vx(2) + vc(2 ) ) ) +! ! +! IF ( amag > vanishing_mag ) THEN +! ! +! DO ipol = 2, 4 +! ! +! v(ir,ipol) = e2 * vs * rho%of_r(ir,ipol) / amag +! ! +! vtxc = vtxc + v(ir,ipol) * rho%of_r(ir,ipol) +! ! +! END DO +! ! +! END IF +! ! +! etxc = etxc + e2*( ex + ec ) * rhox +! vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) +! ! +! END IF +! ! +! END DO ! END IF + CALL stop_clock( 'spin' ) ! CALL mp_sum( rhoneg , intra_bgrp_comm ) ! @@ -1755,16 +1807,15 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt ! ! ... add gradient corrections (if any) ! - !rho%of_r_d = rho%of_r - !rho%of_g_d = rho%of_g - !rho_core_d = rho_core - !rhog_core_d = rhog_core - !v_d = v - CALL gradcorr_gpu( rho%of_r, rho%of_r_d, rho%of_g, rho%of_g_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d ) + !CALL gradcorr_gpu( rho%of_r, rho%of_r_d, rho%of_g, rho%of_g_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d ) + CALL gradcorr_gpu(rho%of_r_d, rho%of_g_d, rho_core_d, rhog_core_d, etxc, vtxc, v_d ) ! ! ... add non local corrections (if any) ! - IF ( dft_is_nonlocc() ) CALL nlc( rho%of_r, rho_core, nspin, etxc, vtxc, v ) + IF ( dft_is_nonlocc() ) THEN + print*, "v_xc_gpu: dft_is_nonlocc not implemented!"; flush(6); stop + !CALL nlc( rho%of_r, rho_core, nspin, etxc, vtxc, v ) + END IF ! CALL mp_sum( vtxc , intra_bgrp_comm ) CALL mp_sum( etxc , intra_bgrp_comm ) @@ -1776,7 +1827,7 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt END SUBROUTINE v_xc_gpu ! !---------------------------------------------------------------------------- -SUBROUTINE v_h_gpu( rhog, ehart, charge, v ) +SUBROUTINE v_h_gpu( rhog_d, ehart, charge, v_d ) !---------------------------------------------------------------------------- ! ! ... Hartree potential VH(r) from n(G) @@ -1785,7 +1836,7 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v ) USE kinds, ONLY : DP USE fft_base, ONLY : dfftp USE fft_interfaces,ONLY : invfft - USE gvect, ONLY : nl, nlm, ngm, gg, gstart + USE gvect, ONLY : nl, nl_d, nlm, ngm, gg, gg_d, gstart USE lsda_mod, ONLY : nspin USE cell_base, ONLY : omega, tpiba2 USE control_flags, ONLY : gamma_only @@ -1796,27 +1847,33 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v ) ! IMPLICIT NONE ! - COMPLEX(DP), INTENT(IN) :: rhog(ngm,nspin) - REAL(DP), INTENT(INOUT) :: v(dfftp%nnr,nspin) + COMPLEX(DP), INTENT(IN), device :: rhog_d(ngm,nspin) + REAL(DP), INTENT(INOUT), device :: v_d(dfftp%nnr,nspin) REAL(DP), INTENT(OUT) :: ehart, charge ! REAL(DP) :: fac - REAL(DP), ALLOCATABLE :: aux1(:,:) + REAL(DP), ALLOCATABLE, device :: aux1_d(:,:) REAL(DP) :: rgtot_re, rgtot_im, eh_corr INTEGER :: is, ig - COMPLEX(DP), ALLOCATABLE :: aux(:), rgtot(:), vaux(:) - INTEGER :: nt + COMPLEX(DP), ALLOCATABLE :: rgtot(:), vaux(:) + COMPLEX(DP), ALLOCATABLE, DEVICE :: aux_d(:) + INTEGER :: nt, i + complex(DP) :: rhog11, rhog12 ! CALL start_clock( 'v_h' ) ! - ALLOCATE( aux( dfftp%nnr ), aux1( 2, ngm ) ) + ALLOCATE( aux_d( dfftp%nnr ), aux1_d(2, ngm)) charge = 0.D0 ! IF ( gstart == 2 ) THEN ! - charge = omega*REAL( rhog(1,1) ) + rhog11 = rhog_d(1,1) + charge = omega*REAL( rhog11 ) ! - IF ( nspin == 2 ) charge = charge + omega*REAL( rhog(1,2) ) + IF ( nspin == 2 ) THEN + rhog12 = rhog_d(1,2) + charge = charge + omega*REAL( rhog12 ) + END IF ! END IF ! @@ -1825,44 +1882,48 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v ) ! ... calculate hartree potential in G-space (NB: V(G=0)=0 ) ! IF ( do_comp_esm .and. ( esm_bc .ne. 'pbc' ) ) THEN + print*, "vh_gpu: do_comp_esm not implemented!"; flush(6); stop ! ! ... calculate modified Hartree potential for ESM ! - CALL esm_hartree (rhog, ehart, aux) + !CALL esm_hartree (rhog, ehart, aux) ! ELSE ! ehart = 0.D0 - aux1(:,:) = 0.D0 + aux1_d(:,:) = 0.D0 ! -!$omp parallel do private( fac, rgtot_re, rgtot_im ), reduction(+:ehart) + !$cuf kernel do(1) <<<*,*>>> DO ig = gstart, ngm ! - fac = 1.D0 / gg(ig) + fac = 1.D0 / gg_d(ig) ! - rgtot_re = REAL( rhog(ig,1) ) - rgtot_im = AIMAG( rhog(ig,1) ) + rgtot_re = REAL( rhog_d(ig,1) ) + rgtot_im = AIMAG( rhog_d(ig,1) ) ! IF ( nspin == 2 ) THEN ! - rgtot_re = rgtot_re + REAL( rhog(ig,2) ) - rgtot_im = rgtot_im + AIMAG( rhog(ig,2) ) + rgtot_re = rgtot_re + REAL( rhog_d(ig,2) ) + rgtot_im = rgtot_im + AIMAG( rhog_d(ig,2) ) ! END IF ! ehart = ehart + ( rgtot_re**2 + rgtot_im**2 ) * fac ! - aux1(1,ig) = rgtot_re * fac - aux1(2,ig) = rgtot_im * fac + aux1_d(1,ig) = rgtot_re * fac + aux1_d(2,ig) = rgtot_im * fac ! ENDDO -!$omp end parallel do ! fac = e2 * fpi / tpiba2 ! ehart = ehart * fac ! - aux1 = aux1 * fac + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(aux1_d,2), ubound(aux1_d,2) + aux1_d(1,i) = aux1_d(1,i) * fac + aux1_d(2,i) = aux1_d(2,i) * fac + end do ! IF ( gamma_only ) THEN ! @@ -1875,50 +1936,60 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v ) END IF ! if (do_comp_mt) then - ALLOCATE( vaux( ngm ), rgtot(ngm) ) - rgtot(:) = rhog(:,1) - if (nspin==2) rgtot(:) = rgtot(:) + rhog(:,2) - CALL wg_corr_h (omega, ngm, rgtot, vaux, eh_corr) - aux1(1,1:ngm) = aux1(1,1:ngm) + REAL( vaux(1:ngm)) - aux1(2,1:ngm) = aux1(2,1:ngm) + AIMAG(vaux(1:ngm)) - ehart = ehart + eh_corr - DEALLOCATE( rgtot, vaux ) + print*, "v_h_gpu: do_comp_mt not implemented!"; flush(6); stop + !ALLOCATE( vaux( ngm ), rgtot(ngm) ) + !rgtot(:) = rhog(:,1) + !if (nspin==2) rgtot(:) = rgtot(:) + rhog(:,2) + !CALL wg_corr_h (omega, ngm, rgtot, vaux, eh_corr) + !aux1(1,1:ngm) = aux1(1,1:ngm) + REAL( vaux(1:ngm)) + !aux1(2,1:ngm) = aux1(2,1:ngm) + AIMAG(vaux(1:ngm)) + !ehart = ehart + eh_corr + !DEALLOCATE( rgtot, vaux ) end if ! CALL mp_sum( ehart , intra_bgrp_comm ) ! - aux(:) = 0.D0 + aux_d(:) = 0.D0 ! - aux(nl(1:ngm)) = CMPLX ( aux1(1,1:ngm), aux1(2,1:ngm), KIND=dp ) + !$cuf kernel do(1) <<<*,*>>> + do i = 1, ngm + aux_d(nl_d(i)) = CMPLX ( aux1_d(1,i), aux1_d(2,i), KIND=dp ) + end do ! IF ( gamma_only ) THEN ! - aux(nlm(1:ngm)) = CMPLX ( aux1(1,1:ngm), -aux1(2,1:ngm), KIND=dp ) + print*, "v_h_gpu: gamma only not implemented!"; flush(6); stop + !aux(nlm(1:ngm)) = CMPLX ( aux1_d(1,1:ngm), -aux1_d(2,1:ngm), KIND=dp ) ! END IF END IF ! ! ... transform hartree potential to real space ! - CALL invfft ('Dense', aux, dfftp) + CALL invfft ('Dense', aux_d, dfftp) ! ! ... add hartree potential to the xc potential ! IF ( nspin == 4 ) THEN ! - v(:,1) = v(:,1) + DBLE (aux(:)) + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(v_d, 1), ubound(v_d, 1) + v_d(i,1) = v_d(i,1) + DBLE (aux_d(i)) + end do ! ELSE ! DO is = 1, nspin - ! - v(:,is) = v(:,is) + DBLE (aux(:)) - ! + !$cuf kernel do(1) <<<*,*>>> + do i = lbound(v_d, 1), ubound(v_d, 1) + v_d(i,is) = v_d(i,is) + DBLE (aux_d(i)) + end do END DO ! END IF + ! - DEALLOCATE( aux, aux1 ) + DEALLOCATE( aux_d, aux1_d) ! CALL stop_clock( 'v_h' ) !