From 9d25bf6c73726bac9ee1628e98ec325d5d6f53df Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Mon, 26 Jun 2017 16:27:40 -0700
Subject: [PATCH] v_of_rho (in particular configuration) and and some more ffts
 ported to GPU. Also added some missing deallocations to PW cleanup.

---
 PW/src/clean_pw.f90  |   8 +-
 PW/src/electrons.f90 |  43 ++-
 PW/src/gradcorr.f90  | 606 ++++++++++++++++++++++++++++---------------
 PW/src/scf_mod.f90   |  13 +-
 PW/src/set_vrs.f90   |  81 ++++++
 PW/src/sum_band.f90  |  30 ++-
 PW/src/v_of_rho.f90  | 499 ++++++++++++++++++++---------------
 7 files changed, 836 insertions(+), 444 deletions(-)

diff --git a/PW/src/clean_pw.f90 b/PW/src/clean_pw.f90
index e0709b3d3..763d4e82c 100644
--- a/PW/src/clean_pw.f90
+++ b/PW/src/clean_pw.f90
@@ -73,7 +73,7 @@ SUBROUTINE clean_pw( lflag )
                                    eigts1_d, eigts2_d, eigts3_d
   USE gvecs,                ONLY : nls_d
   USE wvfct,                ONLY : g2kin_d, et_d, wg_d
-  USE us,                   ONLY : qrad_d
+  USE us,                   ONLY : qrad_d, tab_d, tab_d2y_d
 #endif
   IMPLICIT NONE
   !
@@ -150,6 +150,9 @@ SUBROUTINE clean_pw( lflag )
   IF ( ALLOCATED( vrs ) )        DEALLOCATE( vrs )
   if (spline_ps) then
     IF ( ALLOCATED( tab_d2y) )     DEALLOCATE( tab_d2y )
+#ifdef USE_CUDA
+    IF ( ALLOCATED( tab_d2y_d) )     DEALLOCATE( tab_d2y_d )
+#endif
   endif
   IF ( ALLOCATED( nls ) )     DEALLOCATE( nls )
   IF ( ALLOCATED( nlsm ) )   DEALLOCATE( nlsm )
@@ -178,6 +181,9 @@ SUBROUTINE clean_pw( lflag )
   IF ( ALLOCATED( qrad ) )       DEALLOCATE( qrad )
   IF ( ALLOCATED( tab ) )        DEALLOCATE( tab )
   IF ( ALLOCATED( tab_at ) )     DEALLOCATE( tab_at )
+#ifdef USE_CUDA
+  IF ( ALLOCATED( tab_d ) )        DEALLOCATE( tab_d )
+#endif
   IF ( lspinorb ) THEN
      IF ( ALLOCATED( fcoef ) )   DEALLOCATE( fcoef )
   END IF
diff --git a/PW/src/electrons.f90 b/PW/src/electrons.f90
index e34ef062e..dc47d94b1 100644
--- a/PW/src/electrons.f90
+++ b/PW/src/electrons.f90
@@ -372,9 +372,10 @@ SUBROUTINE electrons_scf ( printout, exxen )
   USE plugin_variables,     ONLY : plugin_etot
   !
 #ifdef USE_CUDA
+  USE funct,                ONLY : get_iexch, get_icorr, get_igcx, get_igcc
   USE dfunct,               ONLY : newd_gpu
   USE cudafor
-  USE scf,                  ONLY : rho_core_d, rhog_core_d, vltot_d, vrs_d 
+  USE scf,                  ONLY : rho_core_d, rhog_core_d, vltot_d, vrs_d
 #endif
   IMPLICIT NONE
   !
@@ -402,7 +403,7 @@ SUBROUTINE electrons_scf ( printout, exxen )
   LOGICAL :: &
       first, exst
 #ifdef USE_CUDA
-   INTEGER :: istat
+   INTEGER :: istat, iexch, icorr, igcx, igcc
 #endif
   !
   ! ... auxiliary variables for calculating and storing temporary copies of
@@ -649,9 +650,32 @@ SUBROUTINE electrons_scf ( printout, exxen )
                           ehart, etxc, vtxc, eth, etotefield, charge, v)
            !CALL v_of_rho( rhoin, rho_core, rhog_core, &
            !               ehart, etxc, vtxc, eth, etotefield, charge, v)
+#else
+#ifdef USE_CUDA
+           iexch = get_iexch
+           icorr = get_icorr
+           igcx = get_igcx
+           igcc = get_igcc
+
+           ! If calling PBE functional configuration, use GPU path
+           if (iexch .eq. 1 .and. icorr .eq. 4 .and. igcx .eq. 3 .and. igcc .eq. 4) then
+             rho_core_d = rho_core
+             rhog_core_d = rhog_core
+             v%of_r_d = v%of_r
+             rhoin%of_r_d = rhoin%of_r
+             rhoin%of_g_d = rhoin%of_g
+             CALL v_of_rho_gpu( rhoin, rho_core, rho_core_d, rhog_core, rhog_core_d,&
+                            ehart, etxc, vtxc, eth, etotefield, charge, v)
+
+           ! Otherwise, fallback to CPU path
+           else
+             CALL v_of_rho( rhoin, rho_core, rhog_core, &
+                            ehart, etxc, vtxc, eth, etotefield, charge, v)
+           endif
 #else
            CALL v_of_rho( rhoin, rho_core, rhog_core, &
                           ehart, etxc, vtxc, eth, etotefield, charge, v)
+#endif
 #endif
            IF (okpaw) THEN
               CALL PAW_potential(rhoin%bec, ddd_paw, epaw,etot_cmp_paw)
@@ -708,15 +732,24 @@ SUBROUTINE electrons_scf ( printout, exxen )
      !
      ! ... define the total local potential (external + scf)
      !
+#ifdef USE_CUDA
+     v%of_r_d = v%of_r
+     vltot_d = vltot
+     CALL sum_vrs_gpu( dfftp%nnr, nspin, vltot_d, v%of_r_d, vrs_d )
+#else
      CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs )
+#endif
      !
      ! ... interpolate the total local potential
      !
-     CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
-     !
 #ifdef USE_CUDA
-     vrs_d = vrs
+       CALL interpolate_vrs_gpu( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs_d)
+       vrs = vrs_d
+#else
+       CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
 #endif
+     !vrs_d = vrs
+     !
      ! ... in the US case we have to recompute the self-consistent
      ! ... term in the nonlocal potential
      ! ... PAW: newd contains PAW updates of NL coefficients
diff --git a/PW/src/gradcorr.f90 b/PW/src/gradcorr.f90
index e028f1902..00cc4dfac 100644
--- a/PW/src/gradcorr.f90
+++ b/PW/src/gradcorr.f90
@@ -1068,16 +1068,16 @@ subroutine pw_gpu (rs, ec, vc)
 end subroutine pw_gpu
 
 
-SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d )
+SUBROUTINE gradcorr_gpu(rho_d, rhog_d, rho_core_d, rhog_core_d, etxc, vtxc, v_d )
   !----------------------------------------------------------------------------
   !
-  USE constants,            ONLY : e2
+  USE constants,            ONLY : e2, pi
   USE kinds,                ONLY : DP
-  USE gvect,                ONLY : nl, ngm, g
+  USE gvect,                ONLY : nl, nl_d, ngm, g, g_d
   USE lsda_mod,             ONLY : nspin
   USE cell_base,            ONLY : omega, alat
   USE funct,                ONLY : gcxc, gcx_spin, gcc_spin, igcc_is_lyp, &
-                                   gcc_spin_more, dft_is_gradient, get_igcc
+                                   gcc_spin_more, dft_is_gradient, get_igcc, get_igcx
   USE spin_orb,             ONLY : domag
   USE noncollin_module,     ONLY : ux
   USE wavefunctions_module, ONLY : psic
@@ -1087,32 +1087,52 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co
   !
   IMPLICIT NONE
   !
-  REAL(DP),    INTENT(IN)    :: rho(dfftp%nnr,nspin), rho_core(dfftp%nnr)
-  COMPLEX(DP), INTENT(IN)    :: rhog(ngm,nspin), rhog_core(ngm)
-  REAL(DP),    INTENT(INOUT) :: v(dfftp%nnr,nspin)
+  REAL(DP),    INTENT(IN), device    :: rho_d(dfftp%nnr,nspin), rho_core_d(dfftp%nnr)
+  COMPLEX(DP), INTENT(IN), device    :: rhog_d(ngm,nspin), rhog_core_d(ngm)
+  REAL(DP),    INTENT(INOUT), device :: v_d(dfftp%nnr,nspin)
   REAL(DP),    INTENT(INOUT) :: vtxc, etxc
-  !
-  REAL(DP),    DEVICE, INTENT(IN)    :: rho_d(dfftp%nnr,nspin), rho_core_d(dfftp%nnr)
-  COMPLEX(DP), DEVICE, INTENT(IN)    :: rhog_d(ngm,nspin), rhog_core_d(ngm)
-  REAL(DP),    DEVICE, INTENT(INOUT) :: v_d(dfftp%nnr,nspin)
+
   !
   INTEGER :: k, ipol, is, nspin0, ir, jpol
   !
-  REAL(DP),    ALLOCATABLE :: grho(:,:,:), h(:,:,:), dh(:)
-  REAL(DP),    ALLOCATABLE :: rhoout(:,:), segni(:), vgg(:,:), vsave(:,:)
+  REAL(DP),    ALLOCATABLE :: segni(:), vgg(:,:), vsave(:,:)
   REAL(DP),    ALLOCATABLE :: gmag(:,:,:)
 
-  COMPLEX(DP), ALLOCATABLE :: rhogsum(:,:)
+  REAL(DP),    ALLOCATABLE, device :: grho_d(:,:,:), h_d(:,:,:), dh_d(:), rhoout_d(:,:)
+
+  COMPLEX(DP), ALLOCATABLE, device :: rhogsum_d(:,:)
   !
-  REAL(DP) :: grho2(2), sx, sc, v1x, v2x, v1c, v2c, &
+  REAL(DP) :: grho1, grho2(2), sx, sc, v1x, v2x, v1c, v2c, &
               v1xup, v1xdw, v2xup, v2xdw, v1cup, v1cdw , &
               etxcgc, vtxcgc, segno, arho, fac, zeta, rh, grh2, amag 
   REAL(DP) :: v2cup, v2cdw,  v2cud, rup, rdw, &
               grhoup, grhodw, grhoud, grup, grdw, seg
   !
   REAL(DP), PARAMETER :: epsr = 1.D-6, epsg = 1.D-10
+  integer :: i, j, igcx, igcc
+
+  ! parameters from expanded gcxc subroutine
+  real(DP), parameter:: small = 1.E-10_DP
+  real(DP) :: kf, agrho, s1, s2, ds, dsg, exunif, fx
+  real(DP) :: dxunif, dfx, f1, f2, f3, dfx1
+  real(DP) :: p,  amu, ab, c, dfxdp, dfxds, upbe, uge, s, ak, aa
+  real(DP), parameter :: third = 1._DP / 3._DP, c1 = 0.75_DP / pi , &
+       c2 = 3.093667726280136_DP, c5 = 4._DP * third, &
+       c6 = c2*2.51984210, c7=5._DP/6._DP, c8=0.8_DP, & ! (3pi^2)^(1/3)*2^(4/3)
+       k1 = 0.804_DP, mu1 = 0.21951_DP 
   !
+  real(DP), parameter :: ga = 0.031091d0, be1 = 0.066725d0
+  real(DP), parameter :: pi34 = 0.6203504908994d0
+  real(DP), parameter :: xkf = 1.919158292677513d0, xks = 1.128379167095513d0
+  real(DP) :: ks, rs, ec, vc, t, expe, af, bf, y, xy, qy
+  real(DP) :: h0, dh0, ddh0, sc2D, v1c2D, v2c2D
+  !
+  real(DP) :: a, b1, b2, c0, d0, d1
+  parameter (a = 0.031091d0, b1 = 7.5957d0, b2 = 3.5876d0, c0 = a)
+  real(DP) :: lnrs, rs12, rs32, rs2, om, dom, olog
+  real(DP), parameter :: a11 = 0.21370d0, b31 = 1.6382d0, b41 = 0.49294d0
 
+  !
   !
   IF ( .NOT. dft_is_gradient() ) RETURN
   !
@@ -1126,199 +1146,333 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co
   if (nspin==4.and.domag) nspin0=2
   fac = 1.D0 / DBLE( nspin0 )
   !
-  ALLOCATE(    h( 3, dfftp%nnr, nspin0) )
-  ALLOCATE( grho( 3, dfftp%nnr, nspin0) )
-  ALLOCATE( rhoout( dfftp%nnr, nspin0) )
-  IF (nspin==4.AND.domag) THEN
-     ALLOCATE( vgg( dfftp%nnr, nspin0 ) )
-     ALLOCATE( vsave( dfftp%nnr, nspin ) )
-     ALLOCATE( segni( dfftp%nnr ) )
-     vsave=v
-     v=0.d0
-  ENDIF
+  ALLOCATE(    h_d( 3, dfftp%nnr, nspin0) )
+  ALLOCATE( grho_d( 3, dfftp%nnr, nspin0) )
+  ALLOCATE( rhoout_d( dfftp%nnr, nspin0) )
+!  IF (nspin==4.AND.domag) THEN
+!     ALLOCATE( vgg( dfftp%nnr, nspin0 ) )
+!     ALLOCATE( vsave( dfftp%nnr, nspin ) )
+!     ALLOCATE( segni( dfftp%nnr ) )
+!     vsave=v
+!     v=0.d0
+!  ENDIF
   !
-  ALLOCATE( rhogsum( ngm, nspin0 ) )
+  ALLOCATE( rhogsum_d( ngm, nspin0 ) )
   !
   ! ... calculate the gradient of rho + rho_core in real space
   !
   IF ( nspin == 4 .AND. domag ) THEN
-     !
-     CALL compute_rho(rho,rhoout,segni,dfftp%nnr)
-     !
-     ! ... bring starting rhoout to G-space
-     !
-     DO is = 1, nspin0
-        !
-        psic(:) = rhoout(:,is)
-        !
-        CALL fwfft ('Dense', psic, dfftp)
-        !
-        rhogsum(:,is) = psic(nl(:))
-        !
-     END DO
+     print*, "gradcorr_gpu: nspin == 4 .and. domag not supported!"; flush(6); stop
+!     !
+!     CALL compute_rho(rho,rhoout,segni,dfftp%nnr)
+!     !
+!     ! ... bring starting rhoout to G-space
+!     !
+!     DO is = 1, nspin0
+!        !
+!        psic(:) = rhoout(:,is)
+!        !
+!        CALL fwfft ('Dense', psic, dfftp)
+!        !
+!        rhogsum(:,is) = psic(nl(:))
+!        !
+!     END DO
   ELSE
      !
-     rhoout(:,1:nspin0)  = rho(:,1:nspin0)
-     rhogsum(:,1:nspin0) = rhog(:,1:nspin0)
+     !$cuf kernel do(2) <<<*,*>>>
+     do j = 1, nspin0
+       do i = lbound(rhoout_d,1), ubound(rhoout_d,1)
+         rhoout_d(i,j)  = rho_d(i,j)
+       end do
+     end do
+
+     !$cuf kernel do(2) <<<*,*>>>
+     do j = 1, nspin0
+       do i = lbound(rhogsum_d,1), ubound(rhogsum_d,1)
+         rhogsum_d(i,j)  = rhog_d(i,j)
+       end do
+     end do
      !
   ENDIF
 
   DO is = 1, nspin0
      !
-     rhoout(:,is)  = fac * rho_core(:)  + rhoout(:,is)
-     rhogsum(:,is) = fac * rhog_core(:) + rhogsum(:,is)
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(rhoout_d,1), ubound(rhoout_d,1)
+       rhoout_d(i,is)  = fac * rho_core_d(i)  + rhoout_d(i,is)
+     end do
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(rhogsum_d,1), ubound(rhogsum_d,1)
+       rhogsum_d(i,is) = fac * rhog_core_d(i) + rhogsum_d(i,is)
+     end do
      !
 call start_clock( 'gradrho' )
-     CALL gradrho( dfftp%nnr, rhogsum(1,is), ngm, g, nl, grho(1,1,is) )
+     CALL gradrho_gpu( dfftp%nnr, rhogsum_d(1,is), ngm, g_d, nl_d, grho_d(1,1,is) )
 call stop_clock( 'gradrho' )
+     
      !
   END DO
-
   !
-  DEALLOCATE( rhogsum )
+  DEALLOCATE( rhogsum_d )
   !
   IF ( nspin0 == 1 ) THEN
-     !
-     ! ... This is the spin-unpolarised case
-     !
-     DO k = 1, dfftp%nnr
+    igcx = get_igcx
+    igcc = get_igcc
+
+    !
+    IF (igcx .eq. 3 .and. igcc .eq. 4) THEN
+      !
+      ! ... This is the spin-unpolarised case (on GPU, only for igcx == 3 and igcc == 4 config)
+      !
+      !$cuf kernel do(1) <<<*, *>>>
+      DO k = 1, dfftp%nnr
         !
-        arho = ABS( rhoout(k,1) )
+        arho = ABS( rhoout_d(k,1) )
         !
         IF ( arho > epsr ) THEN
            !
-           grho2(1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
+           grho1 = grho_d(1,k,1)**2 + grho_d(2,k,1)**2 + grho_d(3,k,1)**2
            !
-           IF ( grho2(1) > epsg ) THEN
+           IF ( grho1 > epsg ) THEN
               !
-              segno = SIGN( 1.D0, rhoout(k,1) )
+              segno = SIGN( 1.D0, rhoout_d(k,1) )
               !
-              CALL gcxc_gpu( arho, grho2(1), sx, sc, v1x, v2x, v1c, v2c )
+              !CALL gcxc_gpu( arho, grho1, sx, sc, v1x, v2x, v1c, v2c )
+              if (arho <= small) then
+                 sx = 0.0_DP
+                 v1x = 0.0_DP
+                 v2x = 0.0_DP
+              else
+            !     call pbex (rho, grho, 1, sx, v1x, v2x)
+                 agrho = sqrt (grho1)
+                 kf = c2 * arho**third
+                 dsg = 0.5_DP / kf
+                 s1 = agrho * dsg / arho
+                 s2 = s1 * s1
+                 ds = - c5 * s1
+                 f1 = s2 * mu1 / k1
+                 f2 = 1._DP + f1
+                 f3 = k1 / f2
+                 fx = k1 - f3
+                 exunif = - c1 * kf
+                 sx = exunif * fx
+                 dxunif = exunif * third
+                 dfx1 = f2 * f2
+                 dfx = 2._DP * mu1 * s1 / dfx1
+                 v1x = sx + dxunif * fx + exunif * dfx * ds
+                 v2x = exunif * dfx * dsg / agrho
+                 sx = sx * arho
+              endif
+
+              if (arho.le.small) then
+                 sc = 0.0_DP
+                 v1c = 0.0_DP
+                 v2c = 0.0_DP
+              else
+                 !call pbec (rho, grho, 1, sc, v1c, v2c)
+                 rs = pi34 / arho**third
+                 !
+                 !call pw_gpu (rs, ec, vc)
+                 rs12 = sqrt (rs)
+                 rs32 = rs * rs12
+                 rs2 = rs**2
+                 om = 2.d0 * a * (b1 * rs12 + b2 * rs + b31 * rs32 + b41 * rs2)
+                 dom = 2.d0 * a * (0.5d0 * b1 * rs12 + b2 * rs + 1.5d0 * b31 * rs32 + 2.d0 * b41 * rs2)
+                 olog = log (1.d0 + 1.0d0 / om)
+                 ec = - 2.d0 * a * (1.d0 + a11 * rs) * olog
+                 vc = - 2.d0 * a * (1.d0 + 2.d0 / 3.d0 * a11 * rs) &
+                      * olog - 2.d0 / 3.d0 * a * (1.d0 + a11 * rs) * dom / (om * (om + 1.d0) )
+                 !
+                 kf = xkf / rs
+                 ks = xks * sqrt (kf)
+                 t = sqrt (grho1) / (2.d0 * ks * arho)
+                 expe = exp ( - ec / ga)
+                 af = be1 / ga * (1.d0 / (expe-1.d0) )
+                 bf = expe * (vc - ec)
+                 y = af * t * t
+                 xy = (1.d0 + y) / (1.d0 + y + y * y)
+                 qy = y * y * (2.d0 + y) / (1.d0 + y + y * y) **2
+                 s1 = 1.d0 + be1 / ga * t * t * xy
+                 h0 = ga * log (s1)
+                 dh0 = be1 * t * t / s1 * ( - 7.d0 / 3.d0 * xy - qy * (af * bf / &
+                      be1-7.d0 / 3.d0) )
+                 ddh0 = be1 / (2.d0 * ks * ks * arho) * (xy - qy) / s1
+                 sc = arho * h0
+                 v1c = h0 + dh0
+                 v2c = ddh0
+              endif
               !
               ! ... first term of the gradient correction : D(rho*Exc)/D(rho)
               !
-              v(k,1) = v(k,1) + e2 * ( v1x + v1c )
+              v_d(k,1) = v_d(k,1) + e2 * ( v1x + v1c )
               !
               ! ... h contains :
               !
               ! ...    D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho|
               !
-              h(:,k,1) = e2 * ( v2x + v2c ) * grho(:,k,1)
+              h_d(:,k,1) = e2 * ( v2x + v2c ) * grho_d(:,k,1)
               !
-              vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout(k,1) - rho_core(k) )
+              vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout_d(k,1) - rho_core_d(k) )
               etxcgc = etxcgc+e2*( sx + sc ) * segno
               !
            ELSE
-              h(:,k,1)=0.D0
+              h_d(:,k,1)=0.D0
            END IF
            !
-        ELSE
-           !
-           h(:,k,1) = 0.D0
-           !
-        END IF
-        !
-     END DO
-     !
+          ELSE
+             !
+             h_d(:,k,1) = 0.D0
+             !
+          END IF
+          !
+       END DO
+
+    ELSE
+      print*, "gradcorr_gpu: should not be here!"; flush(6); stop
+!      !
+!      ! ... This is the spin-unpolarised case 
+!      !
+!      DO k = 1, dfftp%nnr
+!         !
+!         arho = ABS( rhoout(k,1) )
+!         !
+!         IF ( arho > epsr ) THEN
+!            !
+!            grho2(1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
+!            !
+!            IF ( grho2(1) > epsg ) THEN
+!               !
+!               segno = SIGN( 1.D0, rhoout(k,1) )
+!               !
+!               CALL gcxc( arho, grho2(1), sx, sc, v1x, v2x, v1c, v2c )
+!               !
+!               ! ... first term of the gradient correction : D(rho*Exc)/D(rho)
+!               !
+!               v(k,1) = v(k,1) + e2 * ( v1x + v1c )
+!               !
+!               ! ... h contains :
+!               !
+!               ! ...    D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho|
+!               !
+!               h(:,k,1) = e2 * ( v2x + v2c ) * grho(:,k,1)
+!               !
+!               vtxcgc = vtxcgc+e2*( v1x + v1c ) * ( rhoout(k,1) - rho_core(k) )
+!               etxcgc = etxcgc+e2*( sx + sc ) * segno
+!               !
+!            ELSE
+!               h(:,k,1)=0.D0
+!            END IF
+!            !
+!         ELSE
+!            !
+!            h(:,k,1) = 0.D0
+!            !
+!         END IF
+!         !
+!      END DO
+    ENDIF
+!     !
   ELSE
      !
      ! ... spin-polarised case
      !
-!$omp parallel do private( rh, grho2, sx, v1xup, v1xdw, v2xup, v2xdw, rup, rdw, &
-!$omp             grhoup, grhodw, grhoud, sc, v1cup, v1cdw, v2cup, v2cdw, v2cud, &
-!$omp             zeta, grh2, v2c, grup, grdw  ), &
-!$omp             reduction(+:etxcgc,vtxcgc)
-     DO k = 1, dfftp%nnr
-        !
-        rh = rhoout(k,1) + rhoout(k,2)
-        !
-        grho2(:) = grho(1,k,:)**2 + grho(2,k,:)**2 + grho(3,k,:)**2
-        !
-        CALL gcx_spin( rhoout(k,1), rhoout(k,2), grho2(1), &
-                       grho2(2), sx, v1xup, v1xdw, v2xup, v2xdw )
-        !
-        IF ( rh > epsr ) THEN
-           !
-           IF ( igcc_is_lyp() ) THEN
-              !
-              rup = rhoout(k,1)
-              rdw = rhoout(k,2)
-              !
-              grhoup = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
-              grhodw = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2
-              !
-              grhoud = grho(1,k,1) * grho(1,k,2) + &
-                       grho(2,k,1) * grho(2,k,2) + &
-                       grho(3,k,1) * grho(3,k,2)
-              !
-              CALL gcc_spin_more( rup, rdw, grhoup, grhodw, grhoud, &
-                                  sc, v1cup, v1cdw, v2cup, v2cdw, v2cud )
-              !
-           ELSE
-              !
-              zeta = ( rhoout(k,1) - rhoout(k,2) ) / rh
-              if (nspin.eq.4.and.domag) zeta=abs(zeta)*segni(k)
-              !
-              grh2 = ( grho(1,k,1) + grho(1,k,2) )**2 + &
-                     ( grho(2,k,1) + grho(2,k,2) )**2 + &
-                     ( grho(3,k,1) + grho(3,k,2) )**2
-              !
-              CALL gcc_spin( rh, zeta, grh2, sc, v1cup, v1cdw, v2c )
-              !
-              v2cup = v2c
-              v2cdw = v2c
-              v2cud = v2c
-              !
-           END IF
-           !
-        ELSE
-           !
-           sc    = 0.D0
-           v1cup = 0.D0
-           v1cdw = 0.D0
-           v2c   = 0.D0
-           v2cup = 0.D0
-           v2cdw = 0.D0
-           v2cud = 0.D0
-           !
-        ENDIF
-        !
-        ! ... first term of the gradient correction : D(rho*Exc)/D(rho)
-        !
-        v(k,1) = v(k,1) + e2 * ( v1xup + v1cup )
-        v(k,2) = v(k,2) + e2 * ( v1xdw + v1cdw )
-        !
-        ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
-        !
-        DO ipol = 1, 3
-           !
-           grup = grho(ipol,k,1)
-           grdw = grho(ipol,k,2)
-           h(ipol,k,1) = e2 * ( ( v2xup + v2cup ) * grup + v2cud * grdw )
-           h(ipol,k,2) = e2 * ( ( v2xdw + v2cdw ) * grdw + v2cud * grup )
-           !
-        END DO
-        !
-        vtxcgc = vtxcgc + &
-                 e2 * ( v1xup + v1cup ) * ( rhoout(k,1) - rho_core(k) * fac )
-        vtxcgc = vtxcgc + &
-                 e2 * ( v1xdw + v1cdw ) * ( rhoout(k,2) - rho_core(k) * fac )
-        etxcgc = etxcgc + e2 * ( sx + sc )
-        !
-     END DO
-!$omp end parallel do
+     print*, "gradcorr_gpu: SPIN POLARISED not supported!"; flush(6); stop
+!!$omp parallel do private( rh, grho2, sx, v1xup, v1xdw, v2xup, v2xdw, rup, rdw, &
+!!$omp             grhoup, grhodw, grhoud, sc, v1cup, v1cdw, v2cup, v2cdw, v2cud, &
+!!$omp             zeta, grh2, v2c, grup, grdw  ), &
+!!$omp             reduction(+:etxcgc,vtxcgc)
+!     DO k = 1, dfftp%nnr
+!        !
+!        rh = rhoout(k,1) + rhoout(k,2)
+!        !
+!        grho2(:) = grho(1,k,:)**2 + grho(2,k,:)**2 + grho(3,k,:)**2
+!        !
+!        CALL gcx_spin( rhoout(k,1), rhoout(k,2), grho2(1), &
+!                       grho2(2), sx, v1xup, v1xdw, v2xup, v2xdw )
+!        !
+!        IF ( rh > epsr ) THEN
+!           !
+!           IF ( igcc_is_lyp() ) THEN
+!              !
+!              rup = rhoout(k,1)
+!              rdw = rhoout(k,2)
+!              !
+!              grhoup = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
+!              grhodw = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2
+!              !
+!              grhoud = grho(1,k,1) * grho(1,k,2) + &
+!                       grho(2,k,1) * grho(2,k,2) + &
+!                       grho(3,k,1) * grho(3,k,2)
+!              !
+!              CALL gcc_spin_more( rup, rdw, grhoup, grhodw, grhoud, &
+!                                  sc, v1cup, v1cdw, v2cup, v2cdw, v2cud )
+!              !
+!           ELSE
+!              !
+!              zeta = ( rhoout(k,1) - rhoout(k,2) ) / rh
+!              if (nspin.eq.4.and.domag) zeta=abs(zeta)*segni(k)
+!              !
+!              grh2 = ( grho(1,k,1) + grho(1,k,2) )**2 + &
+!                     ( grho(2,k,1) + grho(2,k,2) )**2 + &
+!                     ( grho(3,k,1) + grho(3,k,2) )**2
+!              !
+!              CALL gcc_spin( rh, zeta, grh2, sc, v1cup, v1cdw, v2c )
+!              !
+!              v2cup = v2c
+!              v2cdw = v2c
+!              v2cud = v2c
+!              !
+!           END IF
+!           !
+!        ELSE
+!           !
+!           sc    = 0.D0
+!           v1cup = 0.D0
+!           v1cdw = 0.D0
+!           v2c   = 0.D0
+!           v2cup = 0.D0
+!           v2cdw = 0.D0
+!           v2cud = 0.D0
+!           !
+!        ENDIF
+!        !
+!        ! ... first term of the gradient correction : D(rho*Exc)/D(rho)
+!        !
+!        v(k,1) = v(k,1) + e2 * ( v1xup + v1cup )
+!        v(k,2) = v(k,2) + e2 * ( v1xdw + v1cdw )
+!        !
+!        ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
+!        !
+!        DO ipol = 1, 3
+!           !
+!           grup = grho(ipol,k,1)
+!           grdw = grho(ipol,k,2)
+!           h(ipol,k,1) = e2 * ( ( v2xup + v2cup ) * grup + v2cud * grdw )
+!           h(ipol,k,2) = e2 * ( ( v2xdw + v2cdw ) * grdw + v2cud * grup )
+!           !
+!        END DO
+!        !
+!        vtxcgc = vtxcgc + &
+!                 e2 * ( v1xup + v1cup ) * ( rhoout(k,1) - rho_core(k) * fac )
+!        vtxcgc = vtxcgc + &
+!                 e2 * ( v1xdw + v1cdw ) * ( rhoout(k,2) - rho_core(k) * fac )
+!        etxcgc = etxcgc + e2 * ( sx + sc )
+!        !
+!     END DO
+!!$omp end parallel do
      !
   END IF
   !
-  DO is = 1, nspin0
-     !
-     rhoout(:,is) = rhoout(:,is) - fac * rho_core(:)
-     !
-  END DO
+  !$cuf kernel do(2)
+  do is = 1, nspin0
+    do i = lbound(rhoout_d, 1), ubound(rhoout_d,1)
+      rhoout_d(i,is) = rhoout_d(i,is) - fac * rho_core_d(i)
+    end do
+  end do
   !
-  DEALLOCATE( grho )
+  DEALLOCATE( grho_d )
   !
-  ALLOCATE( dh( dfftp%nnr ) )    
+  ALLOCATE( dh_d( dfftp%nnr ) )    
   !
   ! ... second term of the gradient correction :
   ! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) )
@@ -1326,44 +1480,50 @@ SUBROUTINE gradcorr_gpu( rho, rho_d, rhog, rhog_d, rho_core, rho_core_d, rhog_co
   DO is = 1, nspin0
      !
 call start_clock( 'graddot' )
-     CALL grad_dot( dfftp%nnr, h(1,1,is), ngm, g, nl, alat, dh )
+     CALL grad_dot_gpu( dfftp%nnr, h_d(1,1,is), ngm, g_d, nl_d, alat, dh_d )
 call stop_clock( 'graddot' )
      !
-     v(:,is) = v(:,is) - dh(:)
+     !$cuf kernel do(1)
+     do i = lbound(v_d, 1), ubound(v_d,1)
+       v_d(i,is) = v_d(i,is) - dh_d(i)
+     end do
      !
-     vtxcgc = vtxcgc - SUM( dh(:) * rhoout(:,is) )
+     !$cuf kernel do(1)
+     do i = lbound(dh_d, 1), ubound(dh_d,1)
+       vtxcgc = vtxcgc - dh_d(i) * rhoout_d(i,is)
+     end do
      !
   END DO
+
   !
   vtxc = vtxc + omega * vtxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 )
   etxc = etxc + omega * etxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 )
 
-  IF (nspin==4.AND.domag) THEN
-     DO is=1,nspin0
-        vgg(:,is)=v(:,is)
-     ENDDO
-     v=vsave
-     DO k=1,dfftp%nnr
-        v(k,1)=v(k,1)+0.5d0*(vgg(k,1)+vgg(k,2))
-        amag=sqrt(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2)
-        IF (amag.GT.1.d-12) THEN
-           v(k,2)=v(k,2)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,2)/amag
-           v(k,3)=v(k,3)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,3)/amag
-           v(k,4)=v(k,4)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag
-        ENDIF
-     ENDDO
-  ENDIF
-  !
-  DEALLOCATE( dh )
-  DEALLOCATE( h )
-  DEALLOCATE( rhoout )
+!  IF (nspin==4.AND.domag) THEN
+!     DO is=1,nspin0
+!        vgg(:,is)=v(:,is)
+!     ENDDO
+!     v=vsave
+!     DO k=1,dfftp%nnr
+!        v(k,1)=v(k,1)+0.5d0*(vgg(k,1)+vgg(k,2))
+!        amag=sqrt(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2)
+!        IF (amag.GT.1.d-12) THEN
+!           v(k,2)=v(k,2)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,2)/amag
+!           v(k,3)=v(k,3)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,3)/amag
+!           v(k,4)=v(k,4)+segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag
+!        ENDIF
+!     ENDDO
+!  ENDIF
+  !
+  DEALLOCATE( dh_d )
+  DEALLOCATE( h_d )
+  DEALLOCATE( rhoout_d )
   IF (nspin==4.and.domag) THEN
      DEALLOCATE( vgg )
      DEALLOCATE( vsave )
      DEALLOCATE( segni )
   ENDIF
   !
-
   call stop_clock( 'gradcorr' )
   RETURN
   !
@@ -1387,16 +1547,17 @@ SUBROUTINE gradrho_gpu( nrxx, a, ngm, g, nl, ga )
   IMPLICIT NONE
   !
   INTEGER,     INTENT(IN)  :: nrxx
-  INTEGER,     INTENT(IN)  :: ngm, nl(ngm)
-  COMPLEX(DP), INTENT(IN)  :: a(ngm)
-  REAL(DP),    INTENT(IN)  :: g(3,ngm)
-  REAL(DP),    INTENT(OUT) :: ga(3,nrxx)
+  INTEGER,     INTENT(IN)  :: ngm
+  INTEGER,     INTENT(IN), device  :: nl(ngm)
+  COMPLEX(DP), INTENT(IN), device  :: a(ngm)
+  REAL(DP),    INTENT(IN), device  :: g(3,ngm)
+  REAL(DP),    INTENT(OUT), device :: ga(3,nrxx)
   !
-  INTEGER                  :: ipol
-  COMPLEX(DP), ALLOCATABLE :: gaux(:)
+  INTEGER                  :: i, ipol
+  COMPLEX(DP), ALLOCATABLE, device :: gaux_d(:)
   !
   !
-  ALLOCATE( gaux( nrxx ) )
+  ALLOCATE( gaux_d (nrxx) )
   !
   ! ... multiply by (iG) to get (\grad_ipol a)(G) ...
   !
@@ -1404,27 +1565,35 @@ SUBROUTINE gradrho_gpu( nrxx, a, ngm, g, nl, ga )
   !
   DO ipol = 1, 3
      !
-     gaux(:) = CMPLX(0.d0,0.d0,kind=dp)
+     gaux_d(:) = CMPLX(0.d0,0.d0,kind=dp)
      !
-     gaux(nl(:)) = g(ipol,:) * CMPLX( -AIMAG( a(:) ), REAL( a(:) ) ,kind=DP)
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(a,1), ubound(a,1)
+       gaux_d(nl(i)) = g(ipol,i) * CMPLX( -AIMAG( a(i) ), REAL( a(i) ) ,kind=DP)
+     end do
      !
      IF ( gamma_only ) THEN
+        print*, "gradrho_gpu: GAMMA_ONLY not implemented!"
+        flush(6); stop
         !
-        gaux(nlm(:)) = CMPLX( REAL( gaux(nl(:)) ), -AIMAG( gaux(nl(:)) ) ,kind=DP)
+        !gaux(nlm(:)) = CMPLX( REAL( gaux(nl(:)) ), -AIMAG( gaux(nl(:)) ) ,kind=DP)
         !
      END IF
      !
      ! ... bring back to R-space, (\grad_ipol a)(r) ...
      !
-     CALL invfft ('Dense', gaux, dfftp)
+     CALL invfft ('Dense', gaux_d, dfftp)
      !
      ! ...and add the factor 2\pi/a  missing in the definition of G
      !
-     ga(ipol,:) = ga(ipol,:) + tpiba * REAL( gaux(:) )
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(ga,2), ubound(ga,2)
+       ga(ipol,i) = ga(ipol,i) + tpiba * REAL( gaux_d(i) )
+     end do
      !
   END DO
   !
-  DEALLOCATE( gaux )
+  DEALLOCATE( gaux_d )
   !
   RETURN
   !
@@ -1512,54 +1681,65 @@ SUBROUTINE grad_dot_gpu( nrxx, a, ngm, g, nl, alat, da )
   !
   IMPLICIT NONE
   !
-  INTEGER,  INTENT(IN)     :: nrxx, ngm, nl(ngm)
-  REAL(DP), INTENT(IN)     :: a(3,nrxx), g(3,ngm), alat
-  REAL(DP), INTENT(OUT)    :: da(nrxx)
+  INTEGER,  INTENT(IN)     :: nrxx, ngm
+  INTEGER,  INTENT(IN), device     :: nl(ngm)
+  REAL(DP), INTENT(IN)     ::  alat
+  REAL(DP), INTENT(IN), device     :: a(3,nrxx), g(3,ngm)
+  REAL(DP), INTENT(OUT), device    :: da(nrxx)
   !
-  INTEGER                  :: n, ipol
-  COMPLEX(DP), ALLOCATABLE :: aux(:), gaux(:)
+  INTEGER                  :: n, ipol, i
+  COMPLEX(DP), ALLOCATABLE, device :: aux_d(:), gaux_d(:)
   !
   !
-  ALLOCATE( aux( nrxx ), gaux( nrxx ) )
+  ALLOCATE( aux_d( nrxx ), gaux_d( nrxx ) )
   !
-  gaux(:) = CMPLX(0.d0,0.d0, kind=dp)
+  gaux_d(:) = CMPLX(0.d0,0.d0, kind=dp)
   !
   DO ipol = 1, 3
      !
-     aux = CMPLX( a(ipol,:), 0.D0 ,kind=DP)
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(a,2), ubound(a,2)
+       aux_d(i) = CMPLX( a(ipol,i), 0.D0 ,kind=DP)
+     end do
      !
      ! ... bring a(ipol,r) to G-space, a(G) ...
      !
-     CALL fwfft ('Dense', aux, dfftp)
+     CALL fwfft ('Dense', aux_d, dfftp)
      !
+     !$cuf kernel do(1) <<<*,*>>>
      DO n = 1, ngm
         !
-        gaux(nl(n)) = gaux(nl(n)) + g(ipol,n) * &
-                      CMPLX( -AIMAG( aux(nl(n)) ), REAL( aux(nl(n)) ) ,kind=DP)
+        gaux_d(nl(n)) = gaux_d(nl(n)) + g(ipol,n) * &
+                      CMPLX( -AIMAG( aux_d(nl(n)) ), REAL( aux_d(nl(n)) ) ,kind=DP)
         !
      END DO
     !
   END DO
   !
   IF ( gamma_only ) THEN
+     print*, "grad_dot_gpu: GAMMA_ONLY not implemented!!"
+     flush(6); stop
      !
-     DO n = 1, ngm
-        !
-        gaux(nlm(n)) = CONJG( gaux(nl(n)) )
-        !
-     END DO
+     !DO n = 1, ngm
+     !   !
+     !   gaux(nlm(n)) = CONJG( gaux(nl(n)) )
+     !   !
+     !END DO
      !
   END IF
   !
   ! ... bring back to R-space, (\grad_ipol a)(r) ...
   !
-  CALL invfft ('Dense', gaux, dfftp)
+  CALL invfft ('Dense', gaux_d, dfftp)
   !
   ! ... add the factor 2\pi/a  missing in the definition of G and sum
   !
-  da(:) = tpiba * REAL( gaux(:) )
+  !$cuf kernel do(1) <<<*,*>>>
+  do i = lbound(da,1), ubound(da,1)
+    da(i) = tpiba * REAL( gaux_d(i) )
+  end do
   !
-  DEALLOCATE( aux, gaux )
+  DEALLOCATE( aux_d, gaux_d )
   !
   RETURN
   !
diff --git a/PW/src/scf_mod.f90 b/PW/src/scf_mod.f90
index 296bf62dd..9b1f057d2 100644
--- a/PW/src/scf_mod.f90
+++ b/PW/src/scf_mod.f90
@@ -66,7 +66,7 @@ MODULE scf
      REAL(DP)                 :: el_dipole  ! electrons dipole
   END TYPE mix_type
 
-  type (scf_type) :: rho  ! the charge density and its other components
+  type (scf_type), TARGET :: rho  ! the charge density and its other components
   type (scf_type) :: v    ! the scf potential
   type (scf_type) :: vnew ! used to correct the forces
 
@@ -83,7 +83,7 @@ MODULE scf
        rho_core_d(:),    &! the core charge in real space
        kedtau_d(:,:)      ! position dependent kinetic energy enhancement factor
 
-  COMPLEX(DP), ALLOCATABLE :: &
+  COMPLEX(DP), DEVICE, ALLOCATABLE :: &
        rhog_core_d(:)     ! the core charge in reciprocal space
 
 #endif
@@ -220,7 +220,11 @@ subroutine assign_scf_to_mix_type(rho_s, rho_m)
  end subroutine assign_scf_to_mix_type
  !
  subroutine assign_mix_to_scf_type(rho_m, rho_s)
+#ifdef USE_CUDA
+   USE wavefunctions_module, ONLY : psic => psic_d
+#else
    USE wavefunctions_module, ONLY : psic
+#endif
    USE control_flags,        ONLY : gamma_only
    USE gvect,                ONLY : nl, nlm
    IMPLICIT NONE
@@ -337,7 +341,11 @@ subroutine mix_type_SCAL (A,X)
  end subroutine mix_type_SCAL
  !
  subroutine high_frequency_mixing ( rhoin, input_rhout, alphamix )
+#ifdef USE_CUDA
+   USE wavefunctions_module, ONLY : psic => psic_d
+#else
    USE wavefunctions_module, ONLY : psic
+#endif
    USE control_flags,        ONLY : gamma_only
    USE gvect,                ONLY : nl, nlm
  IMPLICIT NONE
@@ -383,7 +391,6 @@ subroutine high_frequency_mixing ( rhoin, input_rhout, alphamix )
    return
  end subroutine high_frequency_mixing 
 
-
  subroutine open_mix_file( iunit, extension, exst )
    USE control_flags,        ONLY : io_level
    implicit none
diff --git a/PW/src/set_vrs.f90 b/PW/src/set_vrs.f90
index afc19c347..b6341f13a 100644
--- a/PW/src/set_vrs.f90
+++ b/PW/src/set_vrs.f90
@@ -115,3 +115,84 @@ subroutine interpolate_vrs ( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
   return
 
 end subroutine interpolate_vrs
+
+#ifdef USE_CUDA
+!--------------------------------------------------------------------
+subroutine sum_vrs_gpu ( nrxx, nspin, vltot, vr, vrs )
+  !--------------------------------------------------------------------
+  ! accumulates local potential contributions in to vrs 
+  !
+  USE kinds
+  !
+  implicit none
+
+  integer :: nspin, nrxx
+  ! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
+  ! input: the fft grid dimension
+  real(DP), device :: vrs (nrxx, nspin), vltot (nrxx), vr (nrxx, nspin)
+  ! output: total local potential on the smooth grid
+  !         vrs=vltot+vr
+  ! input: the total local pseudopotential
+  ! input: the scf(H+xc) part of the local potential
+
+  integer:: is, i
+
+  do is = 1, nspin
+     !
+     ! define the total local potential (external + scf) for each spin ...
+     !
+     if (is > 1 .and. nspin == 4) then
+        !
+        ! noncolinear case: only the first component contains vltot
+        !
+        !$cuf kernel do(1) <<<*,*>>>
+        do i = lbound(vrs,1), ubound(vrs, 1)
+          vrs (i, is) = vr (i, is)
+        end do
+     else
+        !$cuf kernel do(1) <<<*,*>>>
+        do i = lbound(vrs,1), ubound(vrs, 1)
+          vrs (i, is) = vltot (i) + vr (i, is)
+        end do
+     end if
+     !
+  enddo
+  return
+
+end subroutine sum_vrs_gpu
+!
+!--------------------------------------------------------------------
+subroutine interpolate_vrs_gpu ( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
+  !--------------------------------------------------------------------
+  ! set the total local potential vrs on the smooth mesh to be used in 
+  ! h_psi, adding the (spin dependent) scf (H+xc) part and the sum of 
+  ! all the local pseudopotential contributions.
+  !
+  USE kinds
+  USE funct, only : dft_is_meta
+  USE fft_base, only : dffts 
+  implicit none
+
+  integer :: nspin, nrxx
+  ! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
+  ! input: the fft grid dimension
+  real(DP) :: kedtau(dffts%nnr,nspin), kedtaur(nrxx,nspin)
+  real(DP), device :: vrs (nrxx, nspin)
+  ! output: total local potential interpolated on the smooth grid
+  ! input: the scf(H+xc) part of the local potential
+  logical :: doublegrid
+  ! input: true if a doublegrid is used
+
+  integer:: is
+
+  do is = 1, nspin
+     !
+     ! ... and interpolate it on the smooth mesh if necessary
+     !
+     if (doublegrid) call interpolate_gpu (vrs (1, is), vrs (1, is), - 1)
+     if (dft_is_meta()) call interpolate(kedtaur(1,is),kedtau(1,is),-1)
+  enddo
+  return
+
+end subroutine interpolate_vrs_gpu
+#endif
diff --git a/PW/src/sum_band.f90 b/PW/src/sum_band.f90
index 33ae7d8e3..fc44776f9 100644
--- a/PW/src/sum_band.f90
+++ b/PW/src/sum_band.f90
@@ -1280,6 +1280,7 @@ SUBROUTINE sum_band_gpu()
              ik,   &! counter on k points
              ibnd_start, ibnd_end, this_bgrp_nbnd ! first, last and number of band in this bgrp
   REAL (DP), ALLOCATABLE :: kplusg (:)
+  integer :: i
   !
   !
   CALL start_clock( 'sum_band' )
@@ -1405,10 +1406,23 @@ SUBROUTINE sum_band_gpu()
   !
   ! ... bring the (unsymmetrized) rho(r) to G-space (use psic as work array)
   !
+  !DO is = 1, nspin
+  !   psic(:) = rho%of_r(:,is)
+  !   CALL fwfft ('Dense', psic, dfftp)
+  !   rho%of_g(:,is) = psic(nl(:))
+  !END DO
+
+  !rho%of_r_d(:,:) = rho%of_r(:,:)
+
   DO is = 1, nspin
-     psic(:) = rho%of_r(:,is)
-     CALL fwfft ('Dense', psic, dfftp)
-     rho%of_g(:,is) = psic(nl(:))
+     psic_d(:) = rho%of_r(:,is) !JR: This memcopy is very slow
+     !!$cuf kernel do(1) <<<*, *>>>
+     !do i = lbound(psic_d,1), ubound(psic_d,1)
+     !  psic_d(i) = rho%of_r_d(i,is)
+     !end do
+
+     CALL fwfft ('Dense', psic_d, dfftp)
+     rho%of_g(:,is) = psic_d(nl(:))
   END DO
   !
   ! ... symmetrize rho(G)
@@ -1430,11 +1444,11 @@ SUBROUTINE sum_band_gpu()
   !
   DO is = 1, nspin_mag
      !
-     psic(:) = ( 0.D0, 0.D0 )
-     psic(nl(:)) = rho%of_g(:,is)
-     IF ( gamma_only ) psic(nlm(:)) = CONJG( rho%of_g(:,is) )
-     CALL invfft ('Dense', psic, dfftp)
-     rho%of_r(:,is) = psic(:)
+     psic_d(:) = ( 0.D0, 0.D0 )
+     psic_d(nl(:)) = rho%of_g(:,is)
+     IF ( gamma_only ) psic_d(nlm(:)) = CONJG( rho%of_g(:,is) )
+     CALL invfft ('Dense', psic_d, dfftp)
+     rho%of_r(:,is) = psic_d(:)
      !
   END DO
   !
diff --git a/PW/src/v_of_rho.f90 b/PW/src/v_of_rho.f90
index 314ac2e62..b8c2cb213 100644
--- a/PW/src/v_of_rho.f90
+++ b/PW/src/v_of_rho.f90
@@ -513,7 +513,7 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
   !
   ! ... add gradient corrections (if any)
   !
-  CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v )
+    CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v )
   !
   ! ... add non local corrections (if any)
   !
@@ -1198,10 +1198,8 @@ SUBROUTINE gradv_h_of_rho_r( rho, gradv )
 END SUBROUTINE gradv_h_of_rho_r
 
 #ifdef USE_CUDA
-!#if 0
-
-SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, &
-                     ehart, etxc, vtxc, eth, etotefield, charge, v )
+SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, &
+                     rhog_core_d, ehart, etxc, vtxc, eth, etotefield, charge, v )
   !----------------------------------------------------------------------------
   !
   ! ... This routine computes the Hartree and Exchange and Correlation
@@ -1229,15 +1227,11 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, &
   !!!!!!!!!!!!!!!!! not just OUT because otherwise their allocatable or pointer
   !!!!!!!!!!!!!!!!! components are NOT defined !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   REAL(DP), INTENT(IN) :: rho_core(dfftp%nnr)
+  REAL(DP), INTENT(IN), device :: rho_core_d(dfftp%nnr)
     ! the core charge
   COMPLEX(DP), INTENT(IN) :: rhog_core(ngm)
+  COMPLEX(DP), INTENT(IN), device :: rhog_core_d(ngm)
     ! the core charge in reciprocal space
-
-  REAL(DP), device, INTENT(IN) :: rho_core_d(dfftp%nnr)
-    ! the core charge
-  COMPLEX(DP), device, INTENT(IN) :: rhog_core_d(ngm)
-    ! the core charge in reciprocal space
-
   REAL(DP), INTENT(OUT) :: vtxc, etxc, ehart, eth, charge
     ! the integral V_xc * rho
     ! the E_xc energy
@@ -1254,20 +1248,30 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, &
   ! ... calculate exchange-correlation potential
   !
   if (dft_is_meta()) then
-     print *,"dft_is_meta NOT implemented!!!!"
-     STOP
-     !call v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r )
+     print*, "v_of_rho_gpu: meta not implemented!"; flush(6); stop
+     call v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r )
   else
-     CALL v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v%of_r, v%of_r_d )
+     CALL v_xc_gpu( rho, rho_core_d, rhog_core_d, etxc, vtxc, v%of_r_d )
   endif
   !
   ! ... add a magnetic field  (if any)
   !
+  !JR Need to uncomment memcopies if bfield present
+  !v%of_r = v%of_r_d
   CALL add_bfield( v%of_r, rho%of_r )
+  !v%of_r_d = v%of_r
+
   !
   ! ... calculate hartree potential
   !
-  CALL v_h( rho%of_g, ehart, charge, v%of_r )
+  !JR Note: Using v_h_gpu does impact last printed digits of total and HF estimate energy results.
+  CALL v_h_gpu(rho%of_g_d, ehart, charge, v%of_r_d )
+  v%of_r = v%of_r_d
+
+  !JR Replace with CPU call here if desired
+  !v%of_r = v%of_r_d
+  !CALL v_h(rho%of_g, ehart, charge, v%of_r )
+
   !
   ! ... LDA+U: build up Hubbard potential 
   !
@@ -1301,6 +1305,7 @@ SUBROUTINE v_of_rho_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, &
   RETURN
   !
 END SUBROUTINE v_of_rho_gpu
+!
 !----------------------------------------------------------------------------
 SUBROUTINE v_xc_meta_gpu( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
   !----------------------------------------------------------------------------
@@ -1526,8 +1531,8 @@ SUBROUTINE v_xc_meta_gpu( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
   RETURN
   !
 END SUBROUTINE v_xc_meta_gpu
-!
-SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d )
+
+SUBROUTINE v_xc_gpu( rho, rho_core_d, rhog_core_d,  etxc, vtxc, v_d )
   !----------------------------------------------------------------------------
   !
   ! ... Exchange-Correlation potential Vxc(r) from n(r)
@@ -1540,39 +1545,28 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt
   USE lsda_mod,         ONLY : nspin
   USE cell_base,        ONLY : omega
   USE spin_orb,         ONLY : domag
-  USE funct,            ONLY : xc, xc_spin, nlc, dft_is_nonlocc
+  USE funct,            ONLY : xc, xc_spin, nlc, dft_is_nonlocc, get_iexch, get_icorr
   USE scf,              ONLY : scf_type
   USE mp_bands,         ONLY : intra_bgrp_comm
   USE mp,               ONLY : mp_sum
 
-  USE cudafor
-
   !
   IMPLICIT NONE
   !
-  TYPE (scf_type), INTENT(INOUT) :: rho
-  REAL(DP), INTENT(IN) :: rho_core(dfftp%nnr)
-    ! the core charge
-  COMPLEX(DP), INTENT(IN) :: rhog_core(ngm)
-    ! input: the core charge in reciprocal space
-  REAL(DP), INTENT(OUT) :: v(dfftp%nnr,nspin), vtxc, etxc
-    ! V_xc potential
-    ! integral V_xc * rho
-    ! E_xc energy
-
-  REAL(DP), DEVICE, INTENT(IN) :: rho_core_d(dfftp%nnr)
+  TYPE (scf_type), INTENT(INOUT), target :: rho
+  REAL(DP), INTENT(INOUT), device :: rho_core_d(dfftp%nnr)
     ! the core charge
-  COMPLEX(DP), DEVICE, INTENT(IN) :: rhog_core_d(ngm)
+  COMPLEX(DP), INTENT(IN), device :: rhog_core_d(ngm)
     ! input: the core charge in reciprocal space
-  REAL(DP), DEVICE, INTENT(OUT) :: v_d(dfftp%nnr,nspin)
+  REAL(DP), INTENT(OUT) :: vtxc, etxc
+  REAL(DP), INTENT(OUT), device :: v_d(dfftp%nnr,nspin)
     ! V_xc potential
     ! integral V_xc * rho
     ! E_xc energy
-
   !
   ! ... local variables
   !
-  REAL(DP) :: rhox, arhox, zeta, amag, vs, ex, ec, vx(2), vc(2), rhoneg(2)
+  REAL(DP) :: rhox, arhox, zeta, amag, vs, ex, ec, vx(2), vc(2), rhoneg(2), vx1, vc1, rhoneg1
     ! the total charge in each point
     ! the absolute value of the charge
     ! the absolute value of the charge
@@ -1586,160 +1580,218 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt
   !
   REAL(DP), PARAMETER :: vanishing_charge = 1.D-10, &
                          vanishing_mag    = 1.D-20
+
+  integer :: iexch, icorr
+  real(DP), pointer, device :: rho_of_r_d(:,:)
+
+  ! parameters from expanded subroutines from xc
+  real(DP), parameter :: small = 1.E-10_DP,  third = 1.0_DP / 3.0_DP, &
+       pi34 = 0.6203504908994_DP  ! pi34=(3/4pi)^(1/3)
+  real(DP) :: rs
+  real(dp), parameter  :: f= -0.687247939924714d0, alpha = 2.0d0/3.0d0
+  ! f = -9/8*(3/2pi)^(2/3)
+  real(DP) :: a, b1, b2, c0, c1, c2, c3, d0, d1
+  parameter (a = 0.031091d0, b1 = 7.5957d0, b2 = 3.5876d0, c0 = a, &
+       c1 = 0.046644d0, c2 = 0.00664d0, c3 = 0.01043d0, d0 = 0.4335d0, &
+       d1 = 1.4408d0)
+  real(DP) :: lnrs, rs12, rs32, rs2, om, dom, olog
+  real(DP) :: a11, b31, b41
+  parameter (a11 = 0.21370d0, b31 = 1.6382d0, b41 = 0.49294d0)
   !
   !
   CALL start_clock( 'v_xc' )
   !
   etxc   = 0.D0
   vtxc   = 0.D0
-
-  v(:,:) = 0.D0
-  !v_d(:,:) = 0.D0
-
+  v_d(:,:) = 0.D0
   rhoneg = 0.D0
   !
-
-  !
+  CALL start_clock( 'spin' )
   IF ( nspin == 1 .OR. ( nspin == 4 .AND. .NOT. domag ) ) THEN
-     !
-     ! ... spin-unpolarized case
-     !
-!$omp parallel do private( rhox, arhox, ex, ec, vx, vc ), &
-!$omp             reduction(+:etxc,vtxc), reduction(-:rhoneg)
-     DO ir = 1, dfftp%nnr
-        !
-        rhox = rho%of_r(ir,1) + rho_core(ir)
-        !
-        arhox = ABS( rhox )
-        !
-        IF ( arhox > vanishing_charge ) THEN
-           !
-           CALL xc( arhox, ex, ec, vx(1), vc(1) )
-           !
-           v(ir,1) = e2*( vx(1) + vc(1) )
-           !
-           etxc = etxc + e2*( ex + ec ) * rhox
-           !
-           vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
-           !
-        ENDIF
-        !
-        IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
-        !
-     END DO
-!$omp end parallel do
+     iexch = get_iexch
+     icorr = get_icorr
 
-#if 0
-!$cuf kernel do(1) <<<*,*>>>
-     DO ir = 1, dfftp%nnr
-        !
-        rhox = rho%of_r_d(ir,1) + rho_core_d(ir)
-        !
-        arhox = ABS( rhox )
-        !
-        IF ( arhox > vanishing_charge ) THEN
-           !
-           CALL xc( arhox, ex, ec, vx(1), vc(1) )
-           !
-           v(ir,1) = e2*( vx(1) + vc(1) )
-           !
-           etxc = etxc + e2*( ex + ec ) * rhox
-           !
-           vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
-           !
-        ENDIF
-        !
-        IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
-        !
-     END DO
-#endif
+     IF (iexch .eq. 1 .and. icorr .eq. 4) THEN
+
+       rho_of_r_d => rho%of_r_d
+
+       !
+       ! ... spin-unpolarized case
+       !
+
+       !$cuf kernel do (1) <<<*,*>>>
+       DO ir = 1, dfftp%nnr
+          !
+          rhox = rho_of_r_d(ir,1) + rho_core_d(ir)
+          !
+          arhox = ABS( rhox )
+          !
+          IF ( arhox > vanishing_charge ) THEN
+            if (arhox <= small) then
+               ec = 0.0_DP
+               vc1 = 0.0_DP
+               ex = 0.0_DP
+               vx1 = 0.0_DP
+            else
+               rs = pi34 / arhox**third
+               !
+               !CALL xc( arhox, ex, ec, vx(1), vc(1) )
+               !call slater (rs, ex, vx(1))
+               ex = f * alpha / rs
+               vx1 = 4.d0 / 3.d0 * f * alpha / rs
+
+               !call pw (rs, 1, ec, vc(1))
+               ! interpolation formula
+               rs12 = sqrt (rs)
+               rs32 = rs * rs12
+               rs2 = rs**2
+               om = 2.d0 * a * (b1 * rs12 + b2 * rs + b31 * rs32 + b41 &
+                    * rs2)
+               dom = 2.d0 * a * (0.5d0 * b1 * rs12 + b2 * rs + 1.5d0 * b31 &
+                    * rs32 + 2.d0 * b41 * rs2)
+               olog = log (1.d0 + 1.0d0 / om)
+               ec = - 2.d0 * a * (1.d0 + a11 * rs) * olog
+               vc1 = - 2.d0 * a * (1.d0 + 2.d0 / 3.d0 * a11 * rs) &
+                    * olog - 2.d0 / 3.d0 * a * (1.d0 + a11 * rs) * dom / &
+                    (om * (om + 1.d0) )
+             endif
+             !
+             v_d(ir,1) = e2*( vx1 + vc1 )
+             !
+             etxc = etxc + e2*( ex + ec ) * rhox
+             !
+             vtxc = vtxc + v_d(ir,1) * rho_of_r_d(ir,1)
+             !
+          ENDIF
+          !
+          IF ( rho_of_r_d(ir,1) < 0.D0 ) rhoneg1 = rhoneg1 - rho_of_r_d(ir,1)
+          !
+       END DO
+
+       rhoneg(1) = rhoneg1 ! copy reduced variable
+     ELSE
+       print*, "v_xc_gpu: Should not be here!"; flush(6); stop
+
+!       !
+!       ! ... spin-unpolarized case
+!       !
+!!$omp parallel do private( rhox, arhox, ex, ec, vx, vc ), &
+!!$omp             reduction(+:etxc,vtxc), reduction(-:rhoneg)
+!       DO ir = 1, dfftp%nnr
+!          !
+!          rhox = rho%of_r(ir,1) + rho_core(ir)
+!          !
+!          arhox = ABS( rhox )
+!          !
+!          IF ( arhox > vanishing_charge ) THEN
+!             !
+!             CALL xc( arhox, ex, ec, vx(1), vc(1) )
+!             !
+!             v(ir,1) = e2*( vx(1) + vc(1) )
+!             !
+!             etxc = etxc + e2*( ex + ec ) * rhox
+!             !
+!             vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
+!             !
+!          ENDIF
+!          !
+!          IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
+!          !
+!       END DO
+!!$omp end parallel do
+     END IF
      !
   ELSE IF ( nspin == 2 ) THEN
      !
      ! ... spin-polarized case
      !
-!$omp parallel do private( rhox, arhox, zeta, ex, ec, vx, vc ), &
-!$omp             reduction(+:etxc,vtxc), reduction(-:rhoneg)
-     DO ir = 1, dfftp%nnr
-        !
-        rhox = rho%of_r(ir,1) + rho%of_r(ir,2) + rho_core(ir)
-        !
-        arhox = ABS( rhox )
-        !
-        IF ( arhox > vanishing_charge ) THEN
-           !
-           zeta = ( rho%of_r(ir,1) - rho%of_r(ir,2) ) / arhox
-           !
-           IF ( ABS( zeta ) > 1.D0 ) zeta = SIGN( 1.D0, zeta )
-           !
-           IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
-           IF ( rho%of_r(ir,2) < 0.D0 ) rhoneg(2) = rhoneg(2) - rho%of_r(ir,2)
-           !
-           CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) )
-           !
-           v(ir,:) = e2*( vx(:) + vc(:) )
-           !
-           etxc = etxc + e2*( ex + ec ) * rhox
-           !
-           vtxc = vtxc + ( v(ir,1)*rho%of_r(ir,1) + v(ir,2)*rho%of_r(ir,2) )
-           !
-        END IF
-        !
-     END DO
-!$omp end parallel do
+     print*,  "v_xc_gpu: SPIN POLOARIZED not supported!"; flush(6); STOP
+
+!!$omp parallel do private( rhox, arhox, zeta, ex, ec, vx, vc ), &
+!!$omp             reduction(+:etxc,vtxc), reduction(-:rhoneg)
+!     DO ir = 1, dfftp%nnr
+!        !
+!        rhox = rho%of_r(ir,1) + rho%of_r(ir,2) + rho_core(ir)
+!        !
+!        arhox = ABS( rhox )
+!        !
+!        IF ( arhox > vanishing_charge ) THEN
+!           !
+!           zeta = ( rho%of_r(ir,1) - rho%of_r(ir,2) ) / arhox
+!           !
+!           IF ( ABS( zeta ) > 1.D0 ) zeta = SIGN( 1.D0, zeta )
+!           !
+!           IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
+!           IF ( rho%of_r(ir,2) < 0.D0 ) rhoneg(2) = rhoneg(2) - rho%of_r(ir,2)
+!           !
+!           CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) )
+!           !
+!           v(ir,:) = e2*( vx(:) + vc(:) )
+!           !
+!           etxc = etxc + e2*( ex + ec ) * rhox
+!           !
+!           vtxc = vtxc + ( v(ir,1)*rho%of_r(ir,1) + v(ir,2)*rho%of_r(ir,2) )
+!           !
+!        END IF
+!        !
+!     END DO
+!!$omp end parallel do
      !
   ELSE IF ( nspin == 4 ) THEN
      !
      ! ... noncolinear case
      !
-     DO ir = 1,dfftp%nnr
-        !
-        amag = SQRT( rho%of_r(ir,2)**2 + rho%of_r(ir,3)**2 + rho%of_r(ir,4)**2 )
-        !
-        rhox = rho%of_r(ir,1) + rho_core(ir)
-        !
-        IF ( rho%of_r(ir,1) < 0.D0 )  rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
-        !
-        arhox = ABS( rhox )
-        !
-        IF ( arhox > vanishing_charge ) THEN
-           !
-           zeta = amag / arhox
-           !
-           IF ( ABS( zeta ) > 1.D0 ) THEN
-              !
-              rhoneg(2) = rhoneg(2) + 1.D0 / omega
-              !
-              zeta = SIGN( 1.D0, zeta )
-              !
-           END IF
-           !
-           CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) )
-           !
-           vs = 0.5D0*( vx(1) + vc(1) - vx(2) - vc(2) )
-           !
-           v(ir,1) = e2*( 0.5D0*( vx(1) + vc(1) + vx(2) + vc(2 ) ) )
-           !
-           IF ( amag > vanishing_mag ) THEN
-              !
-              DO ipol = 2, 4
-                 !
-                 v(ir,ipol) = e2 * vs * rho%of_r(ir,ipol) / amag
-                 !
-                 vtxc = vtxc + v(ir,ipol) * rho%of_r(ir,ipol)
-                 !
-              END DO
-              !
-           END IF
-           !
-           etxc = etxc + e2*( ex + ec ) * rhox
-           vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
-           !
-        END IF
-        !
-     END DO
+     print*,  "v_xc_gpu: NONCOLINEAR not supported!"; flush(6); STOP
+
+!     DO ir = 1,dfftp%nnr
+!        !
+!        amag = SQRT( rho%of_r(ir,2)**2 + rho%of_r(ir,3)**2 + rho%of_r(ir,4)**2 )
+!        !
+!        rhox = rho%of_r(ir,1) + rho_core(ir)
+!        !
+!        IF ( rho%of_r(ir,1) < 0.D0 )  rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
+!        !
+!        arhox = ABS( rhox )
+!        !
+!        IF ( arhox > vanishing_charge ) THEN
+!           !
+!           zeta = amag / arhox
+!           !
+!           IF ( ABS( zeta ) > 1.D0 ) THEN
+!              !
+!              rhoneg(2) = rhoneg(2) + 1.D0 / omega
+!              !
+!              zeta = SIGN( 1.D0, zeta )
+!              !
+!           END IF
+!           !
+!           CALL xc_spin( arhox, zeta, ex, ec, vx(1), vx(2), vc(1), vc(2) )
+!           !
+!           vs = 0.5D0*( vx(1) + vc(1) - vx(2) - vc(2) )
+!           !
+!           v(ir,1) = e2*( 0.5D0*( vx(1) + vc(1) + vx(2) + vc(2 ) ) )
+!           !
+!           IF ( amag > vanishing_mag ) THEN
+!              !
+!              DO ipol = 2, 4
+!                 !
+!                 v(ir,ipol) = e2 * vs * rho%of_r(ir,ipol) / amag
+!                 !
+!                 vtxc = vtxc + v(ir,ipol) * rho%of_r(ir,ipol)
+!                 !
+!              END DO
+!              !
+!           END IF
+!           !
+!           etxc = etxc + e2*( ex + ec ) * rhox
+!           vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
+!           !
+!        END IF
+!        !
+!     END DO
      !
   END IF
+  CALL stop_clock( 'spin' )
   !
   CALL mp_sum(  rhoneg , intra_bgrp_comm )
   !
@@ -1755,16 +1807,15 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt
   !
   ! ... add gradient corrections (if any)
   !
-  !rho%of_r_d = rho%of_r
-  !rho%of_g_d = rho%of_g
-  !rho_core_d = rho_core
-  !rhog_core_d = rhog_core
-  !v_d = v
-  CALL gradcorr_gpu( rho%of_r, rho%of_r_d, rho%of_g, rho%of_g_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d )
+  !CALL gradcorr_gpu( rho%of_r, rho%of_r_d, rho%of_g, rho%of_g_d, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vtxc, v, v_d )
+  CALL gradcorr_gpu(rho%of_r_d, rho%of_g_d, rho_core_d, rhog_core_d, etxc, vtxc, v_d )
   !
   ! ... add non local corrections (if any)
   !
-  IF ( dft_is_nonlocc() ) CALL nlc( rho%of_r, rho_core, nspin, etxc, vtxc, v )
+  IF ( dft_is_nonlocc() ) THEN
+    print*, "v_xc_gpu: dft_is_nonlocc not implemented!"; flush(6); stop
+    !CALL nlc( rho%of_r, rho_core, nspin, etxc, vtxc, v )
+  END IF
   !
   CALL mp_sum(  vtxc , intra_bgrp_comm )
   CALL mp_sum(  etxc , intra_bgrp_comm )
@@ -1776,7 +1827,7 @@ SUBROUTINE v_xc_gpu( rho, rho_core, rho_core_d, rhog_core, rhog_core_d, etxc, vt
 END SUBROUTINE v_xc_gpu
 !
 !----------------------------------------------------------------------------
-SUBROUTINE v_h_gpu( rhog, ehart, charge, v )
+SUBROUTINE v_h_gpu( rhog_d, ehart, charge, v_d )
   !----------------------------------------------------------------------------
   !
   ! ... Hartree potential VH(r) from n(G)
@@ -1785,7 +1836,7 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v )
   USE kinds,     ONLY : DP
   USE fft_base,  ONLY : dfftp
   USE fft_interfaces,ONLY : invfft
-  USE gvect,     ONLY : nl, nlm, ngm, gg, gstart
+  USE gvect,     ONLY : nl, nl_d, nlm, ngm, gg, gg_d, gstart
   USE lsda_mod,  ONLY : nspin
   USE cell_base, ONLY : omega, tpiba2
   USE control_flags, ONLY : gamma_only
@@ -1796,27 +1847,33 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v )
   !
   IMPLICIT NONE
   !
-  COMPLEX(DP), INTENT(IN)  :: rhog(ngm,nspin)
-  REAL(DP),  INTENT(INOUT) :: v(dfftp%nnr,nspin)
+  COMPLEX(DP), INTENT(IN), device  :: rhog_d(ngm,nspin)
+  REAL(DP),  INTENT(INOUT), device :: v_d(dfftp%nnr,nspin)
   REAL(DP),    INTENT(OUT) :: ehart, charge
   !
   REAL(DP)              :: fac
-  REAL(DP), ALLOCATABLE :: aux1(:,:)
+  REAL(DP), ALLOCATABLE, device :: aux1_d(:,:)
   REAL(DP)              :: rgtot_re, rgtot_im, eh_corr
   INTEGER               :: is, ig
-  COMPLEX(DP), ALLOCATABLE :: aux(:), rgtot(:), vaux(:)
-  INTEGER               :: nt
+  COMPLEX(DP), ALLOCATABLE :: rgtot(:), vaux(:)
+  COMPLEX(DP), ALLOCATABLE, DEVICE :: aux_d(:)
+  INTEGER               :: nt, i
+  complex(DP) :: rhog11, rhog12
   !
   CALL start_clock( 'v_h' )
   !
-  ALLOCATE( aux( dfftp%nnr ), aux1( 2, ngm ) )
+  ALLOCATE( aux_d( dfftp%nnr ), aux1_d(2, ngm))
   charge = 0.D0
   !
   IF ( gstart == 2 ) THEN
      !
-     charge = omega*REAL( rhog(1,1) )
+     rhog11 = rhog_d(1,1)
+     charge = omega*REAL( rhog11 )
      !
-     IF ( nspin == 2 ) charge = charge + omega*REAL( rhog(1,2) )
+     IF ( nspin == 2 ) THEN
+       rhog12 = rhog_d(1,2)
+       charge = charge + omega*REAL( rhog12 )
+     END IF
      !
   END IF
   !
@@ -1825,44 +1882,48 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v )
   ! ... calculate hartree potential in G-space (NB: V(G=0)=0 )
   !
   IF ( do_comp_esm .and. ( esm_bc .ne. 'pbc' ) ) THEN
+    print*, "vh_gpu: do_comp_esm not implemented!"; flush(6); stop
      !
      ! ... calculate modified Hartree potential for ESM
      !
-     CALL esm_hartree (rhog, ehart, aux)
+     !CALL esm_hartree (rhog, ehart, aux)
      !
   ELSE
      !
      ehart     = 0.D0
-     aux1(:,:) = 0.D0
+     aux1_d(:,:) = 0.D0
      !
-!$omp parallel do private( fac, rgtot_re, rgtot_im ), reduction(+:ehart)
+     !$cuf kernel do(1) <<<*,*>>>
      DO ig = gstart, ngm
         !
-        fac = 1.D0 / gg(ig)
+        fac = 1.D0 / gg_d(ig)
         !
-        rgtot_re = REAL(  rhog(ig,1) )
-        rgtot_im = AIMAG( rhog(ig,1) )
+        rgtot_re = REAL(  rhog_d(ig,1) )
+        rgtot_im = AIMAG( rhog_d(ig,1) )
         !
         IF ( nspin == 2 ) THEN
            !
-           rgtot_re = rgtot_re + REAL(  rhog(ig,2) )
-           rgtot_im = rgtot_im + AIMAG( rhog(ig,2) )
+           rgtot_re = rgtot_re + REAL(  rhog_d(ig,2) )
+           rgtot_im = rgtot_im + AIMAG( rhog_d(ig,2) )
            !
         END IF
         !
         ehart = ehart + ( rgtot_re**2 + rgtot_im**2 ) * fac
         !
-        aux1(1,ig) = rgtot_re * fac
-        aux1(2,ig) = rgtot_im * fac
+        aux1_d(1,ig) = rgtot_re * fac
+        aux1_d(2,ig) = rgtot_im * fac
         !
      ENDDO
-!$omp end parallel do
      !
      fac = e2 * fpi / tpiba2
      !
      ehart = ehart * fac
      !
-     aux1 = aux1 * fac
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(aux1_d,2), ubound(aux1_d,2)
+       aux1_d(1,i) = aux1_d(1,i) * fac
+       aux1_d(2,i) = aux1_d(2,i) * fac
+     end do
      !
      IF ( gamma_only ) THEN
         !
@@ -1875,50 +1936,60 @@ SUBROUTINE v_h_gpu( rhog, ehart, charge, v )
      END IF
      ! 
      if (do_comp_mt) then
-        ALLOCATE( vaux( ngm ), rgtot(ngm) )
-        rgtot(:) = rhog(:,1)
-        if (nspin==2) rgtot(:) = rgtot(:) + rhog(:,2)
-        CALL wg_corr_h (omega, ngm, rgtot, vaux, eh_corr)
-        aux1(1,1:ngm) = aux1(1,1:ngm) + REAL( vaux(1:ngm))
-        aux1(2,1:ngm) = aux1(2,1:ngm) + AIMAG(vaux(1:ngm))
-        ehart = ehart + eh_corr
-        DEALLOCATE( rgtot, vaux )
+        print*, "v_h_gpu: do_comp_mt not implemented!"; flush(6); stop
+        !ALLOCATE( vaux( ngm ), rgtot(ngm) )
+        !rgtot(:) = rhog(:,1)
+        !if (nspin==2) rgtot(:) = rgtot(:) + rhog(:,2)
+        !CALL wg_corr_h (omega, ngm, rgtot, vaux, eh_corr)
+        !aux1(1,1:ngm) = aux1(1,1:ngm) + REAL( vaux(1:ngm))
+        !aux1(2,1:ngm) = aux1(2,1:ngm) + AIMAG(vaux(1:ngm))
+        !ehart = ehart + eh_corr
+        !DEALLOCATE( rgtot, vaux )
      end if
      !
      CALL mp_sum(  ehart , intra_bgrp_comm )
      ! 
-     aux(:) = 0.D0
+     aux_d(:) = 0.D0
      !
-     aux(nl(1:ngm)) = CMPLX ( aux1(1,1:ngm), aux1(2,1:ngm), KIND=dp )
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = 1, ngm
+       aux_d(nl_d(i)) = CMPLX ( aux1_d(1,i), aux1_d(2,i), KIND=dp )
+     end do
      !
      IF ( gamma_only ) THEN
         !
-        aux(nlm(1:ngm)) = CMPLX ( aux1(1,1:ngm), -aux1(2,1:ngm), KIND=dp )
+        print*, "v_h_gpu: gamma only not implemented!"; flush(6); stop
+        !aux(nlm(1:ngm)) = CMPLX ( aux1_d(1,1:ngm), -aux1_d(2,1:ngm), KIND=dp )
         !
      END IF
   END IF
   !
   ! ... transform hartree potential to real space
   !
-  CALL invfft ('Dense', aux, dfftp)
+  CALL invfft ('Dense', aux_d, dfftp)
   !
   ! ... add hartree potential to the xc potential
   !
   IF ( nspin == 4 ) THEN
      !
-     v(:,1) = v(:,1) + DBLE (aux(:))
+     !$cuf kernel do(1) <<<*,*>>>
+     do i = lbound(v_d, 1), ubound(v_d, 1)
+       v_d(i,1) = v_d(i,1) + DBLE (aux_d(i))
+     end do
      !
   ELSE
      !
      DO is = 1, nspin
-        !
-        v(:,is) = v(:,is) + DBLE (aux(:))
-        !
+       !$cuf kernel do(1) <<<*,*>>>
+       do i = lbound(v_d, 1), ubound(v_d, 1)
+         v_d(i,is) = v_d(i,is) + DBLE (aux_d(i))
+       end do
      END DO
      !
   END IF
+
   !
-  DEALLOCATE( aux, aux1 )
+  DEALLOCATE( aux_d, aux1_d)
   !
   CALL stop_clock( 'v_h' )
   !