diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 0975ebcd..6235b3ae 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -22,7 +22,9 @@ jobs:
     - name: Install dependencies
       run: |
         python3 -m pip install --upgrade pip
-        pip3 install flake8 pytest coverage gpu4pyscf-libxc-cuda11x pytest-cov
+        pip3 install flake8 pytest coverage gpu4pyscf-libxc-cuda11x pytest-cov dftd3 dftd4
+        pip3 install pyscf --upgrade
+        git config --global core.compression 9
     - name: Build GPU4PySCF
       run: |
         export CUDA_HOME=/usr/local/cuda
@@ -32,5 +34,5 @@ jobs:
     - name: Test with pytest
       run: |
         echo $GITHUB_WORKSPACE
-        export PYTHONPATH="${PYTHONPATH}:$GITHUB_WORKSPACE"
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
         pytest --cov=$GITHUB_WORKSPACE
diff --git a/examples/01-h2o_dftd3.py b/examples/01-h2o_dftd3.py
index 9081d141..7e7703ef 100644
--- a/examples/01-h2o_dftd3.py
+++ b/examples/01-h2o_dftd3.py
@@ -42,6 +42,7 @@
 mf_GPU.grids.level = grids_level
 mf_GPU.conv_tol = scf_tol
 mf_GPU.max_cycle = max_scf_cycles
+mf_GPU.conv_tol_cpscf = 1e-6
 mf_GPU.screen_tol = screen_tol
 
 # Compute Energy
@@ -50,7 +51,9 @@
 print('DFT energy by GPU4PySCF')
 print(e_dft)
 print('DFT energy by Q-Chem')
-print(-76.4672557846) # reference from q-chem: -76.4672557846
+e_qchem = -76.4672557846
+print(e_qchem) # reference from q-chem: -76.4672557846
+print('Energy diff between Q-Chem and PySCF', e_dft - e_qchem)
 
 # Compute Gradient
 print('------------------ Gradient ----------------------------')
@@ -61,9 +64,11 @@
 print(g_dft)
 print('Gradient by Q-Chem')
 # reference from q-chem
-print(np.array([[0.0000000,   0.0030278,  -0.0030278],
+g_qchem = np.array([[0.0000000,   0.0030278,  -0.0030278],
         [-0.0000000,  -0.0000000,   0.0000000],
-        [-0.0023449,   0.0011724,   0.0011724]]).T)
+        [-0.0023449,   0.0011724,   0.0011724]]).T
+print(g_qchem)
+print('Gradient diff between Q-Chem and PySCF', np.linalg.norm(g_dft - g_qchem))
 
 # Compute Hessian
 print('------------------- Hessian -----------------------------')
@@ -81,3 +86,4 @@
 print('Diagonals entries of Mass-weighted Hessian by Q-Chem')
 hess_qchem = np.loadtxt('hess_qchem.txt')
 print(np.diag(hess_qchem))
+print('Hessian diff between Q-Chem and PySCF', np.linalg.norm(hess_qchem - h_dft))
\ No newline at end of file
diff --git a/examples/06-h2o_hf.py b/examples/06-h2o_hf.py
index 342435dc..ec6e928d 100644
--- a/examples/06-h2o_hf.py
+++ b/examples/06-h2o_hf.py
@@ -13,8 +13,10 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-from pyscf import gto, scf, grad
+from pyscf import gto
+from gpu4pyscf import scf
 import numpy as np
+import cupy
 
 mol = gto.M(atom=
 '''
@@ -22,32 +24,27 @@
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
 ''',
-basis='cc-pvtz',
+basis='def2-tzvpp',
+cart=1,
 verbose=4)
 
-mf = scf.hf.RHF(mol)
-mf.direct_scf_tol = 1e-14
-mf.conv_tol = 1e-12
-e_cpu = mf.kernel()
-
-cpu_gradient = grad.rhf.Gradients(mf)
-cpu_gradient.kernel()
-
-from gpu4pyscf import scf
-import gpu4pyscf
-
+# Calculation on GPU
 mf = scf.hf.RHF(mol)
 mf.direct_scf_tol = 1e-14
 mf.conv_tol = 1e-12
 e_gpu = mf.kernel()
-gpu_gradient = gpu4pyscf.grad.rhf.Gradients(mf)
+
+gpu_gradient = mf.nuc_grad_method()
 gpu_gradient.kernel()
 
+# Move the object to CPU
+mf = mf.to_cpu()
+e_cpu = mf.kernel()
+cpu_gradient = mf.nuc_grad_method()
+cpu_gradient.kernel()
 cpu_de = cpu_gradient.de
-print(cpu_de)
-print(gpu_gradient.de)
-
 cpu_de -= np.sum(cpu_de, axis=0)/mol.natm
+
 print('e diff = ', e_cpu - e_gpu)
 print('g diff = \n', cpu_de - gpu_gradient.de)
 
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index 1d143c0f..f2511af1 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -20,7 +20,7 @@
 import cupy
 import numpy
 from cupy import cublas
-from pyscf import lib, scf, __config__
+from pyscf import lib, __config__
 from pyscf.scf import dhf
 from pyscf.df import df_jk, addons
 from gpu4pyscf.lib import logger
@@ -97,7 +97,7 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
         with_df.verbose = mf.verbose
         with_df.auxbasis = auxbasis
 
-    if isinstance(mf, df_jk._DFHF):
+    if isinstance(mf, _DFHF):
         if mf.with_df is None:
             mf.with_df = with_df
         elif getattr(mf.with_df, 'auxbasis', None) != auxbasis:
@@ -240,7 +240,6 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1):
 
     def to_cpu(self):
         obj = self.undo_df().to_cpu().density_fit()
-        print(type(obj))
         return utils.to_cpu(self, obj)
 
 
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 9ba78132..fda72fc4 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -17,8 +17,7 @@
 import numpy
 import cupy
 from cupyx.scipy.linalg import solve_triangular
-from pyscf.df.grad import rhf
-from pyscf import lib, scf, gto
+from pyscf import scf
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d
 from gpu4pyscf.grad import rhf as rhf_grad
diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
index db105679..afe1ccf0 100644
--- a/gpu4pyscf/df/grad/uhf.py
+++ b/gpu4pyscf/df/grad/uhf.py
@@ -19,9 +19,8 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import scf
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d
+from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library, take_last2d
 from gpu4pyscf.grad import uhf as uhf_grad
-from gpu4pyscf.grad.rhf import grad_elec
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
 
@@ -62,13 +61,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         mo_coeff = cupy.asarray(mf_grad.base.mo_coeff)
     if mo_occ is None:
         mo_occ = cupy.asarray(mf_grad.base.mo_occ)
-    sph_ao_idx = intopt.sph_ao_idx
-    dm = take_last2d(dm0, sph_ao_idx)
+    ao_idx = intopt.ao_idx
+    dm = take_last2d(dm0, ao_idx)
     if dm2 is not None:
-        dm2_tmp = take_last2d(dm2, sph_ao_idx)
+        dm2_tmp = take_last2d(dm2, ao_idx)
     # (L|ij) -> rhoj: (L), rhok: (L|oo)
     orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
-    orbo = orbo[sph_ao_idx, :]
+    orbo = orbo[ao_idx, :]
     nocc = orbo.shape[-1]
 
     # (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -109,8 +108,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     else:
         int2c_e1 = auxmol.intor('int2c2e_ip1')
     int2c_e1 = cupy.asarray(int2c_e1)
-    sph_aux_idx = intopt.sph_aux_idx
-    rev_aux_idx = np.argsort(sph_aux_idx)
+    aux_ao_idx = intopt.aux_ao_idx
+    rev_aux_idx = np.argsort(aux_ao_idx)
     auxslices = auxmol.aoslice_by_atom()
     aux_cart2sph = intopt.aux_cart2sph
     low_t = low.T.copy()
@@ -123,8 +122,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
             rhoj = solve_triangular(low_t, rhoj, lower=False, overwrite_b=True)
             if dm2 is not None:
                 rhoj2 = solve_triangular(low_t, rhoj2, lower=False, overwrite_b=True)
-        rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj)
+        if not auxmol.cart:
+            rhoj_cart = contract('pq,q->p', aux_cart2sph, rhoj)
+        else:
+            rhoj_cart = rhoj
+
         rhoj = rhoj[rev_aux_idx]
+
         if dm2 is not None:
             rhoj2 = rhoj2[rev_aux_idx]
         tmp = contract('xpq,q->xp', int2c_e1, rhoj)
@@ -144,7 +148,10 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
         vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
         vkaux = tmp = None
-        rhok_cart = contract('pq,qkl->pkl', aux_cart2sph, rhok)
+        if not auxmol.cart:
+            rhok_cart = contract('pq,qkl->pkl', aux_cart2sph, rhok)
+        else:
+            rhok_cart = rhok
         rhok = None
     low_t = None
     t0 = log.timer_debug1('rhoj and rhok', *t0)
@@ -156,15 +163,21 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     # rebuild with aosym
     intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
                  group_size_aux=block_size)#, group_size=block_size)
-
-    # sph2cart for ao
-    cart2sph = intopt.cart2sph
-    orbo_cart = cart2sph @ orbo
-    if dm2 is None:
-        dm_cart = cart2sph @ dm @ cart2sph.T
+    if not intopt._mol.cart:
+        # sph2cart for ao
+        cart2sph = intopt.cart2sph
+        orbo_cart = cart2sph @ orbo
+        if dm2 is None:
+            dm_cart = cart2sph @ dm @ cart2sph.T
+        else:
+            dm2_tmp = take_last2d(dm2, ao_idx)
+            dm_cart = cart2sph @ dm2_tmp @ cart2sph.T
     else:
-        dm2_tmp = take_last2d(dm2, sph_ao_idx)
-        dm_cart = cart2sph @ dm2_tmp @ cart2sph.T
+        if dm2 is None:
+            dm_cart = dm
+        else:
+            dm_cart = take_last2d(dm2, ao_idx)
+        orbo_cart = orbo
     dm = orbo = None
 
     vj = vk = rhoj_tmp = rhok_tmp = None
@@ -272,10 +285,12 @@ def get_k(self, mol=None, dm=None, hermi=0, mo_coeff=None, mo_occ=None, dm2 = No
 
 
     def get_veff(self, mol=None, dm=None):
-        vj0, vk0, vjaux0, vkaux0 = self.get_jk(mol, dm[0], mo_coeff=self.base.mo_coeff[0], mo_occ=self.base.mo_occ[0])
-        vj1, vk1, vjaux1, vkaux1 = self.get_jk(mol, dm[1], mo_coeff=self.base.mo_coeff[1], mo_occ=self.base.mo_occ[1])
-        vj0_m1, vjaux0_m1 = self.get_j(mol, dm[0], mo_coeff=self.base.mo_coeff[0], mo_occ=self.base.mo_occ[0], dm2=dm[1])
-        vj1_m0, vjaux1_m0 = self.get_j(mol, dm[1], mo_coeff=self.base.mo_coeff[1], mo_occ=self.base.mo_occ[1], dm2=dm[0])
+        mo_a, mo_b = self.base.mo_coeff
+        mo_occa, mo_occb = self.base.mo_occ
+        vj0, vk0, vjaux0, vkaux0 = self.get_jk(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa)
+        vj1, vk1, vjaux1, vkaux1 = self.get_jk(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb)
+        vj0_m1, vjaux0_m1 = self.get_j(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa, dm2=dm[1])
+        vj1_m0, vjaux1_m0 = self.get_j(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb, dm2=dm[0])
         vhf = vj0 + vj1 + vj0_m1 + vj1_m0 - vk0 - vk1
         if self.auxbasis_response:
             e1_aux = vjaux0 + vjaux1 + vjaux0_m1 + vjaux1_m0 - vkaux0 - vkaux1
diff --git a/gpu4pyscf/df/grad/uks.py b/gpu4pyscf/df/grad/uks.py
index ada2121b..cdd3d918 100644
--- a/gpu4pyscf/df/grad/uks.py
+++ b/gpu4pyscf/df/grad/uks.py
@@ -91,20 +91,23 @@ def get_veff(ks_grad, mol=None, dm=None):
     vxc = cupy.asarray(vxc)
 
     if not ni.libxc.is_hybrid_xc(mf.xc):
-        vj0, vjaux0 = ks_grad.get_j(mol, dm[0], mo_coeff=ks_grad.base.mo_coeff[0], mo_occ=ks_grad.base.mo_occ[0])
-        vj1, vjaux1 = ks_grad.get_j(mol, dm[1], mo_coeff=ks_grad.base.mo_coeff[1], mo_occ=ks_grad.base.mo_occ[1])
-        vj0_m1, vjaux0_m1 = ks_grad.get_j(mol, dm[0], mo_coeff=ks_grad.base.mo_coeff[0], mo_occ=ks_grad.base.mo_occ[0], dm2=dm[1])
-        vj1_m0, vjaux1_m0 = ks_grad.get_j(mol, dm[1], mo_coeff=ks_grad.base.mo_coeff[1], mo_occ=ks_grad.base.mo_occ[1], dm2=dm[0])
+        mo_a, mo_b = ks_grad.base.mo_coeff
+        mo_occa, mo_occb = ks_grad.base.mo_occ
+        vj0, vjaux0 = ks_grad.get_j(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa)
+        vj1, vjaux1 = ks_grad.get_j(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb)
+        vj0_m1, vjaux0_m1 = ks_grad.get_j(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa, dm2=dm[1])
+        vj1_m0, vjaux1_m0 = ks_grad.get_j(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb, dm2=dm[0])
         if ks_grad.auxbasis_response:
             e1_aux = vjaux0 + vjaux1 + vjaux0_m1 + vjaux1_m0
         vxc += vj0 + vj1 + vj0_m1 + vj1_m0
     else:
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
-
-        vj0, vk0, vjaux0, vkaux0 = ks_grad.get_jk(mol, dm[0], mo_coeff=ks_grad.base.mo_coeff[0], mo_occ=ks_grad.base.mo_occ[0])
-        vj1, vk1, vjaux1, vkaux1 = ks_grad.get_jk(mol, dm[1], mo_coeff=ks_grad.base.mo_coeff[1], mo_occ=ks_grad.base.mo_occ[1])
-        vj0_m1, vjaux0_m1 = ks_grad.get_j(mol, dm[0], mo_coeff=ks_grad.base.mo_coeff[0], mo_occ=ks_grad.base.mo_occ[0], dm2=dm[1])
-        vj1_m0, vjaux1_m0 = ks_grad.get_j(mol, dm[1], mo_coeff=ks_grad.base.mo_coeff[1], mo_occ=ks_grad.base.mo_occ[1], dm2=dm[0])
+        mo_a, mo_b = ks_grad.base.mo_coeff
+        mo_occa, mo_occb = ks_grad.base.mo_occ
+        vj0, vk0, vjaux0, vkaux0 = ks_grad.get_jk(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa)
+        vj1, vk1, vjaux1, vkaux1 = ks_grad.get_jk(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb)
+        vj0_m1, vjaux0_m1 = ks_grad.get_j(mol, dm[0], mo_coeff=mo_a, mo_occ=mo_occa, dm2=dm[1])
+        vj1_m0, vjaux1_m0 = ks_grad.get_j(mol, dm[1], mo_coeff=mo_b, mo_occ=mo_occb, dm2=dm[0])
         vj = vj0 + vj1 + vj0_m1 + vj1_m0
         vk = (vk0 + vk1) * hyb
         if ks_grad.auxbasis_response:
diff --git a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
index 4096f1f6..d06ffff5 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
@@ -16,13 +16,11 @@
 import unittest
 import numpy
 import cupy
-from pyscf import gto, scf, lib
-from pyscf import grad, hessian
+from pyscf import gto, scf
 from pyscf.df.hessian import uhf as df_uhf_cpu
 from pyscf.hessian import uhf as uhf_cpu
 from gpu4pyscf.df.hessian import uhf as df_uhf_gpu
 from gpu4pyscf.hessian import uhf as uhf_gpu
-from gpu4pyscf import scf as scf_gpu
 
 def setUpModule():
     global mol
diff --git a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
index 78066b21..490608e6 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_uks_hessian.py
@@ -15,11 +15,7 @@
 
 import unittest
 import numpy
-import cupy
-from pyscf import gto, scf, lib, dft
-from pyscf import grad, hessian
-from pyscf.df.hessian import uhf as df_uks_cpu
-from gpu4pyscf.df.hessian import uks as df_uks_gpu
+from pyscf import gto, dft
 
 def setUpModule():
     global mol
diff --git a/gpu4pyscf/df/tests/test_df_ecp.py b/gpu4pyscf/df/tests/test_df_ecp.py
index f76f1f0e..1b153121 100644
--- a/gpu4pyscf/df/tests/test_df_ecp.py
+++ b/gpu4pyscf/df/tests/test_df_ecp.py
@@ -16,11 +16,8 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
 from gpu4pyscf.dft import rks
 
-lib.num_threads(8)
-
 xc='pbe0'
 bas='def2-tzvpp'
 auxbasis='def2-tzvpp-jkfit'
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index afdb06aa..d777c91d 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -47,9 +47,17 @@ def tearDownModule():
     mol_cart.stdout.close()
     del mol_sph, mol_cart
 
-def _make_rhf(mol):
-    mf = scf.RHF(mol).density_fit(auxbasis='ccpvtz-jkfit')
+def _make_rhf(mol, disp=None):
+    mf = scf.RHF(mol).density_fit()
     mf.conv_tol = 1e-12
+    mf.disp = disp
+    mf.kernel()
+    return mf
+
+def _make_uhf(mol, disp=None):
+    mf = scf.UHF(mol).density_fit()
+    mf.conv_tol = 1e-12
+    mf.disp = disp
     mf.kernel()
     return mf
 
@@ -120,9 +128,10 @@ def _check_dft_hessian(mf, h, ix=0, iy=0, tol=1e-3):
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
 
 class KnownValues(unittest.TestCase):
-    def test_hessian_rhf(self):
+    def test_hessian_rhf(self, disp=None):
         print('-----testing DF RHF Hessian----')
         mf = _make_rhf(mol_sph)
+        mf.disp = disp
         mf.conv_tol_cpscf = 1e-7
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
@@ -130,7 +139,7 @@ def test_hessian_rhf(self):
         _check_rhf_hessian(mf, h, ix=0, iy=0)
         _check_rhf_hessian(mf, h, ix=0, iy=1)
 
-    def test_hessian_lda(self):
+    def test_hessian_lda(self, disp=None):
         print('-----testing DF LDA Hessian----')
         mf = _make_rks(mol_sph, 'LDA')
         mf.conv_tol_cpscf = 1e-7
@@ -190,37 +199,77 @@ def test_hessian_rsh(self):
         _check_dft_hessian(mf, h, ix=0,iy=0)
         _check_dft_hessian(mf, h, ix=0,iy=1)
 
-    def test_hessian_D3(self):
-        pmol = mol_sph.copy()
-        pmol.build()
+    def test_hessian_rhf_D3(self):
+        print('----- testing DFRHF with D3BJ ------')
+        mf = _make_rhf(mol_sph, disp='d3bj')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
-        mf = dft.rks.RKS(pmol, xc='B3LYP', disp='d3bj').density_fit(auxbasis=auxbasis0)
-        mf.conv_tol = 1e-12
+    def test_hessian_rhf_D4(self):
+        print('------ testing DFRKS with D4 --------')
+        mf = _make_rhf(mol_sph, disp='d4')
         mf.conv_tol_cpscf = 1e-7
-        mf.grids.level = grids_level
-        mf.verbose = 1
-        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+
+    def test_hessian_uhf_D3(self):
+        print('-------- testing DFUHF with D3BJ ------')
+        mf = _make_uhf(mol_sph, disp='d3bj')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+
+    def test_hessian_uhf_D4(self):
+        print('--------- testing DFUHF with D4 -------')
+        mf = _make_uhf(mol_sph, disp='d4')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
+    def test_hessian_rks_D3(self):
+        print('--------- testing DFRKS with D3BJ -------')
+        mf = _make_rks(mol_sph, 'b3lyp', disp='d3bj')
+        mf.conv_tol_cpscf = 1e-7
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
-        hobj.verbose=0
-        hobj.kernel()
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
-    def test_hessian_D4(self):
-        pmol = mol_sph.copy()
-        pmol.build()
+    def test_hessian_rks_D4(self):
+        print('----------- testing DFRKS with D4 --------')
+        mf = _make_rks(mol_sph, 'b3lyp', disp='d4')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
-        mf = dft.rks.RKS(pmol, xc='B3LYP', disp='d4').density_fit(auxbasis=auxbasis0)
-        mf.conv_tol = 1e-12
+    def test_hessian_uks_D3(self):
+        print('------------ testing DFUKS with D3BJ -------')
+        mf = _make_uks(mol_sph, 'b3lyp', disp='d3bj')
         mf.conv_tol_cpscf = 1e-7
-        mf.grids.level = grids_level
-        mf.verbose = 1
-        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
+    def test_hessian_uks_D4(self):
+        print('------------- testing DFUKS with D4 ---------')
+        mf = _make_uks(mol_sph, 'b3lyp', disp='d4')
+        mf.conv_tol_cpscf = 1e-7
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
-        hobj.verbose=0
-        hobj.kernel()
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
 
     def test_hessian_cart(self):
         print('-----testing DF Hessian (cartesian)----')
@@ -232,6 +281,16 @@ def test_hessian_cart(self):
         _check_dft_hessian(mf, h, ix=0,iy=0)
         _check_dft_hessian(mf, h, ix=0,iy=1)
 
+    def test_hessian_uks_cart(self):
+        print('-----testing DF Hessian (cartesian)----')
+        mf = _make_uks(mol_cart, 'b3lyp')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
+
 if __name__ == "__main__":
     print("Full Tests for DF Hessian")
     unittest.main()
diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py
index 092445eb..3852c70b 100644
--- a/gpu4pyscf/df/tests/test_df_rhf.py
+++ b/gpu4pyscf/df/tests/test_df_rhf.py
@@ -16,14 +16,11 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
 from pyscf import scf as cpu_scf
 from pyscf.df import df_jk as cpu_df_jk
 from gpu4pyscf.df import df_jk as gpu_df_jk
 from gpu4pyscf import scf as gpu_scf
 
-lib.num_threads(8)
-
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
@@ -33,44 +30,75 @@
 bas='def2tzvpp'
 
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=0)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
 class KnownValues(unittest.TestCase):
     '''
     known values are obtained by Q-Chem
     '''
     def test_rhf(self):
-        print('------- HF -----------------')
-        mf = gpu_scf.RHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit')
+        print('------- RHF -----------------')
+        mf = gpu_scf.RHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
         e_tot = mf.kernel()
         e_qchem = -76.0624582299
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
+
+    def test_cart(self):
+        print('------- RHF Cart -----------------')
+        mf = gpu_scf.RHF(mol_cart).density_fit(auxbasis='def2-tzvpp-jkfit')
+        e_gpu = mf.kernel()
+        e_cpu = mf.to_cpu().kernel()
+        print(f'diff from pyscf {e_gpu - e_cpu}')
+        assert np.abs(e_gpu - e_cpu) < 1e-5
+
+    def test_rhf_d3(self):
+        print('--------- RHF with D3(BJ) ---------')
+        mf = gpu_scf.RHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf.disp = 'd3bj'
+        e_tot = mf.kernel()
+        e_qchem = -76.0669672259
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.abs(e_tot - e_qchem) < 1e-5
+
+    def test_rhf_d4(self):
+        print('-------- RHF with D4 ----------')
+        mf = gpu_scf.RHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf.disp = 'd4'
+        e_tot = mf.kernel()
+        e_ref = -76.06343941431916 #-76.0669672259
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_to_cpu(self):
-        mf = gpu_scf.RHF(mol).density_fit()
+        mf = gpu_scf.RHF(mol_sph).density_fit()
         e_gpu = mf.kernel()
         mf = mf.to_cpu()
         assert isinstance(mf, cpu_df_jk._DFHF)
         e_cpu = mf.kernel()
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_to_gpu(self):
-        mf = cpu_scf.RHF(mol).density_fit()
+        mf = cpu_scf.RHF(mol_sph).density_fit()
         e_cpu = mf.kernel()
         mf = mf.to_gpu()
         assert isinstance(mf, gpu_df_jk._DFHF)
         e_gpu = mf.kernel()
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for restricted Hartree-Fock")
diff --git a/gpu4pyscf/df/tests/test_df_rks.py b/gpu4pyscf/df/tests/test_df_rks.py
index 25cf3f9f..89b33c6e 100644
--- a/gpu4pyscf/df/tests/test_df_rks.py
+++ b/gpu4pyscf/df/tests/test_df_rks.py
@@ -16,14 +16,11 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
 from pyscf.df import df_jk as cpu_df_jk
 from pyscf.dft import rks as cpu_rks
 from gpu4pyscf.dft import rks as gpu_rks
 from gpu4pyscf.df import df_jk as gpu_df_jk
 
-lib.num_threads(8)
-
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
@@ -50,11 +47,12 @@ def tearDownModule():
     mol_cart.stdout.close()
     del mol_sph, mol_cart
 
-def run_dft(xc, mol):
+def run_dft(xc, mol, disp=None):
     mf = gpu_rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-tzvpp-jkfit')
     mf.grids.atom_grid = (99,590)
     mf.nlcgrids.atom_grid = (50,194)
     mf.conv_tol = 1e-10
+    mf.disp = disp
     e_dft = mf.kernel()
     return e_dft
 
@@ -68,42 +66,56 @@ def test_rks_lda(self):
         e_tot = run_dft("LDA,VWN5", mol_sph)
         e_qchem = -75.9046768207
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_rks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE', mol_sph)
         e_qchem = -76.3800605005
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_rks_b3lyp(self):
         print('-------- B3LYP -------------')
         e_tot = run_dft('B3LYP', mol_sph)
         e_qchem = -76.4666819950
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_rks_m06(self):
         print('--------- M06 --------------')
         e_tot = run_dft("M06", mol_sph)
         e_qchem = -76.4266137244
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_rks_wb97(self):
         print('-------- wB97 --------------')
         e_tot = run_dft("HYB_GGA_XC_WB97", mol_sph)
         e_qchem = -76.4486707747
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_rks_wb97m_v(self):
         print('-------- wB97m-v --------------')
         e_tot = run_dft("HYB_MGGA_XC_WB97M_V", mol_sph)
         e_qchem = -76.4334564629
         print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        assert np.abs(e_tot - e_qchem) < 1e-5
+
+    def test_rks_b3lyp_d3(self):
+        print('-------- B3LYP with D3(BJ) -------------')
+        e_tot = run_dft('B3LYP', mol_sph, disp='d3bj')
+        e_qchem = -76.4672558312 # w/o D3(BJ) -76.4666819950
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.abs(e_tot - e_qchem) < 1e-5
+
+    def test_rks_b3lyp_d4(self):
+        print('-------- B3LYP with D4 ---------------')
+        e_tot = run_dft('B3LYP', mol_sph, disp='D4')
+        e_qchem = -76.4669915146 # w/o D3(BJ) -76.4666819950
+        print(f'diff from qchem {e_tot - e_qchem}')
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
     def test_to_cpu(self):
         mf = gpu_rks.RKS(mol_sph).density_fit()
@@ -111,7 +123,7 @@ def test_to_cpu(self):
         mf = mf.to_cpu()
         assert isinstance(mf, cpu_df_jk._DFHF)
         e_cpu = mf.kernel()
-        np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_to_gpu(self):
         mf = cpu_rks.RKS(mol_sph).density_fit()
@@ -119,14 +131,14 @@ def test_to_gpu(self):
         mf = mf.to_gpu()
         assert isinstance(mf, gpu_df_jk._DFHF)
         e_cpu = mf.kernel()
-        np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_rks_cart(self):
         print('-------- B3LYP (CART) -------------')
         e_tot = run_dft('B3LYP', mol_cart)
         e_qchem = -76.46723795965626 # data from PySCF
-        print(f'diff from qchem {e_tot - e_qchem}')
-        assert np.allclose(e_tot, e_qchem)
+        print(f'diff from pyscf {e_tot - e_qchem}')
+        assert np.abs(e_tot - e_qchem) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for restricted Kohn-Sham")
diff --git a/gpu4pyscf/df/tests/test_df_rks_grad.py b/gpu4pyscf/df/tests/test_df_rks_grad.py
index 1bd3b2b3..76fa3740 100644
--- a/gpu4pyscf/df/tests/test_df_rks_grad.py
+++ b/gpu4pyscf/df/tests/test_df_rks_grad.py
@@ -17,11 +17,8 @@
 import cupy
 import numpy as np
 import unittest
-from pyscf import lib
 from gpu4pyscf.dft import rks
 
-lib.num_threads(8)
-
 '''
 test density fitting for dft
 1. energy
@@ -139,6 +136,24 @@ def test_grad_nlc(self):
     def test_grad_cart(self):
         print('------ Cart testing--------')
         _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-6)
+
+    def test_grad_d3(self):
+        print('------ B3LYP with d3bj --------')
+        _check_grad(mol_cart, xc='B3LYP', disp='d3bj', tol=1e-6)
+
+    def test_grad_d4(self):
+        print('------ B3LYP with d4 --------')
+        _check_grad(mol_cart, xc='B3LYP', disp='d4', tol=1e-6)
+
+    def test_to_cpu(self):
+        mf = rks.RKS(mol_sph, xc='b3lyp').density_fit()
+        mf.kernel()
+
+        gobj = mf.nuc_grad_method()
+        g_gpu = gobj.kernel()
+
+        g_cpu = gobj.to_cpu().kernel()
+        assert np.linalg.norm(g_cpu - g_gpu) < 1e-5
 if __name__ == "__main__":
     print("Full Tests for DF Gradient")
     unittest.main()
diff --git a/gpu4pyscf/df/tests/test_df_uhf.py b/gpu4pyscf/df/tests/test_df_uhf.py
index face0354..aa81eba9 100644
--- a/gpu4pyscf/df/tests/test_df_uhf.py
+++ b/gpu4pyscf/df/tests/test_df_uhf.py
@@ -17,14 +17,11 @@
 import numpy as np
 import cupy
 import pyscf
-from pyscf import lib
 from pyscf import scf as cpu_scf
 from pyscf.df import df_jk as cpu_df_jk
 from gpu4pyscf import scf as gpu_scf
 from gpu4pyscf.df import df_jk as gpu_df_jk
 
-lib.num_threads(8)
-
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
@@ -34,21 +31,28 @@
 bas='def2tzvpp'
 
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, cart=1, max_memory=32000)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def _check_grad(tol=1e-5):
-    mf = gpu_scf.uhf.UHF(mol)#.density_fit()
+def _check_grad(mol, tol=1e-5, disp=None):
+    mf = gpu_scf.uhf.UHF(mol).density_fit()
     mf.conv_tol = 1e-10
     mf.verbose = 1
+    mf.disp = disp
     mf.kernel()
 
     g = mf.nuc_grad_method()
@@ -91,30 +95,67 @@ class KnownValues(unittest.TestCase):
     '''
     def test_uhf(self):
         print('------- UHF -----------------')
-        mf = gpu_scf.UHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf = gpu_scf.UHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
         e_tot = mf.kernel()
         e_pyscf = -75.6599919479438
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
+    def test_cart(self):
+        print('------- cart UHF -------------')
+        mf = gpu_scf.UHF(mol_cart).density_fit(auxbasis='def2-tzvpp-jkfit')
+        e_gpu = mf.kernel()
+        e_cpu = mf.to_cpu().kernel()
+        print(f'diff from pyscf {e_gpu - e_cpu}')
+        assert np.abs(e_gpu - e_cpu) < 1e-5
+
+    def test_uhf_d3(self):
+        print('------- UHF with D3(BJ) -----')
+        mf = gpu_scf.UHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf.disp = 'd3bj'
+        e_tot = mf.kernel()
+        e_pyscf = -75.6645005436
+        print(f'diff from pyscf {e_tot - e_pyscf}')
+        assert np.abs(e_tot - e_pyscf) < 1e-5
+    '''
+    def test_uhf_d4(self):
+        print('------- UHF with D4 -------')
+        mf = gpu_scf.UHF(mol).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf.disp = 'd4'
+        e_tot = mf.kernel()
+        e_pyscf = -75.66097302959608
+        print(f'diff from pyscf {e_tot - e_pyscf}')
+        assert np.abs(e_tot - e_pyscf) < 1e-5
+    '''
     def test_grad_uhf(self):
-        _check_grad(tol=1e-5)
+        _check_grad(mol_sph, tol=1e-5)
+
+    def test_grad_cart(self):
+        print('-------- UHF Cart Gradient ------')
+        _check_grad(mol_cart, tol=1e-5)
+
+    def test_grad_uhf_d3(self):
+        _check_grad(mol_sph, tol=1e-5, disp='d3bj')
+
+    def test_grad_uhf_d4(self):
+        _check_grad(mol_sph, tol=1e-5, disp='d4')
+
 
     def test_to_cpu(self):
-        mf = gpu_scf.UHF(mol).density_fit()
+        mf = gpu_scf.UHF(mol_sph).density_fit()
         e_gpu = mf.kernel()
         mf = mf.to_cpu()
         e_cpu = mf.kernel()
         assert isinstance(mf, cpu_df_jk._DFHF)
-        assert np.allclose(e_gpu, e_cpu)
+        assert np.abs(e_gpu - e_cpu) < 1e-5
 
     def test_to_gpu(self):
-        mf = cpu_scf.UHF(mol).density_fit()
+        mf = cpu_scf.UHF(mol_sph).density_fit()
         e_cpu = mf.kernel()
         mf = mf.to_gpu()
         e_gpu = mf.kernel()
         assert isinstance(mf, gpu_df_jk._DFHF)
-        assert np.allclose(e_gpu, e_cpu)
+        assert np.abs(e_gpu - e_cpu) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for unrestricted Hartree-Fock")
diff --git a/gpu4pyscf/df/tests/test_df_uks.py b/gpu4pyscf/df/tests/test_df_uks.py
index 8d1935be..919e25ab 100644
--- a/gpu4pyscf/df/tests/test_df_uks.py
+++ b/gpu4pyscf/df/tests/test_df_uks.py
@@ -16,14 +16,11 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
 from pyscf.dft import uks as cpu_uks
 from pyscf.df import df_jk as cpu_df_jk
 from gpu4pyscf.dft import uks as gpu_uks
 from gpu4pyscf.df import df_jk as gpu_df_jk
 
-lib.num_threads(8)
-
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
@@ -34,22 +31,29 @@
 grids_level = 5
 
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas, charge=1, spin=1, cart=1, max_memory=32000)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def run_dft(xc):
+def run_dft(mol, xc, disp=None):
     mf = gpu_uks.UKS(mol, xc=xc).density_fit(auxbasis='def2-tzvpp-jkfit')
     mf.grids.atom_grid = (99,590)
     mf.nlcgrids.atom_grid = (50,194)
     mf.conv_tol = 1e-10
+    mf.disp = disp
     e_dft = mf.kernel()
     return e_dft
 
@@ -59,61 +63,83 @@ class KnownValues(unittest.TestCase):
     '''
     def test_uks_lda(self):
         print('------- LDA ----------------')
-        e_tot = run_dft("LDA,VWN5")
+        e_tot = run_dft(mol_sph, "LDA,VWN5")
         e_pyscf = -75.42319302444447
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_uks_pbe(self):
         print('------- PBE ----------------')
-        e_tot = run_dft('PBE')
+        e_tot = run_dft(mol_sph, 'PBE')
         e_pyscf = -75.91291185761159
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_uks_b3lyp(self):
         print('-------- B3LYP -------------')
-        e_tot = run_dft('B3LYP')
+        e_tot = run_dft(mol_sph, 'B3LYP')
         e_pyscf = -75.9987750880688
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_uks_m06(self):
         print('--------- M06 --------------')
-        e_tot = run_dft("M06")
+        e_tot = run_dft(mol_sph, "M06")
         e_pyscf = -75.96097588711966
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_uks_wb97(self):
         print('-------- wB97 --------------')
-        e_tot = run_dft("HYB_GGA_XC_WB97")
+        e_tot = run_dft(mol_sph, "HYB_GGA_XC_WB97")
         e_pyscf = -75.98337641724999
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_uks_wb97m_v(self):
         print('-------- wB97m-v --------------')
-        e_tot = run_dft("HYB_MGGA_XC_WB97M_V")
+        e_tot = run_dft(mol_sph, "HYB_MGGA_XC_WB97M_V")
         e_pyscf = -75.96980058343685
         print(f'diff from pyscf {e_tot - e_pyscf}')
-        assert np.allclose(e_tot, e_pyscf)
+        assert np.abs(e_tot - e_pyscf) < 1e-5
+
+    def test_uks_b3lyp_d3(self):
+        print('-------- B3LYP D3(BJ) -------------')
+        e_tot = run_dft(mol_sph, 'B3LYP', disp='d3bj')
+        e_pyscf = -75.9993489428 #-75.9987750880688
+        print(f'diff from pyscf {e_tot - e_pyscf}')
+        assert np.abs(e_tot - e_pyscf) < 1e-5
+
+    def test_uks_b3lyp_d4(self):
+        print('-------- B3LYP D4 -------------')
+        e_tot = run_dft(mol_sph, 'B3LYP', disp='d4')
+        e_pyscf =  -75.9989312099 #-75.9987750880688
+        print(f'diff from pyscf {e_tot - e_pyscf}')
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
     def test_to_cpu(self):
-        mf = gpu_uks.UKS(mol).density_fit()
+        mf = gpu_uks.UKS(mol_sph).density_fit()
         e_gpu = mf.kernel()
         mf = mf.to_cpu()
         assert isinstance(mf, cpu_df_jk._DFHF)
         e_cpu = mf.kernel()
-        np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_to_gpu(self):
-        mf = cpu_uks.UKS(mol).density_fit()
+        mf = cpu_uks.UKS(mol_sph).density_fit()
         e_gpu = mf.kernel()
         mf = mf.to_gpu()
         assert isinstance(mf, gpu_df_jk._DFHF)
         e_cpu = mf.kernel()
-        np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
+
+    def test_uks_cart(self):
+        print('-------- B3LYP cart-------------')
+        mf = gpu_uks.UKS(mol_cart, xc='B3LYP').density_fit()
+        e_tot = mf.kernel()
+        e_pyscf = mf.to_cpu().kernel()
+        print(f'diff from pyscf {e_tot - e_pyscf}')
+        assert np.abs(e_tot - e_pyscf) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for unrestricted Kohn-Sham")
diff --git a/gpu4pyscf/df/tests/test_df_uks_grad.py b/gpu4pyscf/df/tests/test_df_uks_grad.py
index 880c9691..3a998f41 100644
--- a/gpu4pyscf/df/tests/test_df_uks_grad.py
+++ b/gpu4pyscf/df/tests/test_df_uks_grad.py
@@ -17,11 +17,8 @@
 import cupy
 import numpy as np
 import unittest
-from pyscf import lib
 from gpu4pyscf.dft import uks
 
-lib.num_threads(8)
-
 '''
 test density fitting for dft
 1. energy
@@ -42,25 +39,32 @@
 grids_level = 6
 nlcgrids_level = 3
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1)
-    mol.output = '/dev/null'
-    mol.verbose = 1
-    mol.build()
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1)
+    mol_sph.output = '/dev/null'
+    mol_sph.verbose = 1
+    mol_sph.build()
+
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.verbose = 1
+    mol_cart.build()
 
 eps = 1.0/1024
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def _check_grad(grid_response=True, xc=xc0, disp=disp0, tol=1e-5):
+def _check_grad(mol, grid_response=True, xc=xc0, disp=disp0, tol=1e-5):
     mf = uks.UKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
     mf.grids.level = grids_level
     mf.nlcgrids.level = nlcgrids_level
     mf.conv_tol = 1e-10
     mf.verbose = 1
+    mf.disp = disp
     mf.kernel()
 
     g = mf.nuc_grad_method()
@@ -101,36 +105,60 @@ class KnownValues(unittest.TestCase):
 
     def test_grad_with_grids_response(self):
         print("-----testing DF DFT gradient with grids response----")
-        _check_grad(grid_response=True, disp=None)
+        _check_grad(mol_sph, grid_response=True, disp=None)
 
     def test_grad_without_grids_response(self):
         print('-----testing DF DFT gradient without grids response----')
-        _check_grad(grid_response=False, disp=None)
+        _check_grad(mol_sph, grid_response=False, disp=None)
 
     def test_grad_lda(self):
         print("-----LDA testing-------")
-        _check_grad(xc='LDA', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='LDA', disp=None, tol=1e-5)
 
     def test_grad_gga(self):
         print('-----GGA testing-------')
-        _check_grad(xc='PBE', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='PBE', disp=None, tol=1e-5)
 
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
-        _check_grad(xc='B3LYP', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='B3LYP', disp=None, tol=1e-5)
 
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
-        _check_grad(xc='tpss', disp=None, tol=1e-3)
+        _check_grad(mol_sph, xc='tpss', disp=None, tol=1e-3)
 
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
-        _check_grad(xc='wb97', disp=None, tol=1e-3)
+        _check_grad(mol_sph, xc='wb97', disp=None, tol=1e-3)
+
+    '''
+    # Not implemented yet
+    def test_grad_nlc(self):
+        print('--------nlc testing-------------')
+        _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-6)
+    '''
+
+    def test_grad_d3(self):
+        print('------ B3LYP with d3bj --------')
+        _check_grad(mol_sph, xc='B3LYP', disp='d3bj', tol=1e-5)
+
+    def test_grad_d4(self):
+        print('------ B3LYP with d4 --------')
+        _check_grad(mol_sph, xc='B3LYP', disp='d4', tol=1e-5)
+
+    def test_grad_cart(self):
+        print('------ Cart testing--------')
+        _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-5)
+
+    def test_to_cpu(self):
+        mf = uks.UKS(mol_sph, xc='b3lyp').density_fit()
+        mf.kernel()
+
+        gobj = mf.nuc_grad_method()
+        g_gpu = gobj.kernel()
 
-    # Not implemented!
-    # def test_grad_nlc(self):
-    #     print('--------nlc testing-------------')
-    #     _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-6)
+        g_cpu = gobj.to_cpu().kernel()
+        assert np.linalg.norm(g_cpu - g_gpu) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for UKS DF Gradient")
diff --git a/gpu4pyscf/df/tests/test_geomopt.py b/gpu4pyscf/df/tests/test_geomopt.py
index 59c5d86f..f9ea444e 100644
--- a/gpu4pyscf/df/tests/test_geomopt.py
+++ b/gpu4pyscf/df/tests/test_geomopt.py
@@ -16,13 +16,10 @@
 import pyscf
 import numpy as np
 import unittest
-from pyscf import lib
 from gpu4pyscf import scf
 from gpu4pyscf.dft import rks, uks
 from pyscf.geomopt.geometric_solver import optimize
 
-lib.num_threads(8)
-
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
diff --git a/gpu4pyscf/df/tests/test_int3c2e.py b/gpu4pyscf/df/tests/test_int3c2e.py
index a94f3db3..7ce638e2 100644
--- a/gpu4pyscf/df/tests/test_int3c2e.py
+++ b/gpu4pyscf/df/tests/test_int3c2e.py
@@ -14,15 +14,14 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import pyscf
-from pyscf import lib, df
+from pyscf import df
 from pyscf.gto.moleintor import getints, make_cintopt
 from pyscf.df.grad.rhf import _int3c_wrapper
 import numpy as np
-import cupy
 import unittest
-
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib.cupy_helper import load_library
+
 libgint = load_library('libgint')
 
 '''
diff --git a/gpu4pyscf/df/tests/test_jk.py b/gpu4pyscf/df/tests/test_jk.py
index 8f045dc9..8ebf8690 100644
--- a/gpu4pyscf/df/tests/test_jk.py
+++ b/gpu4pyscf/df/tests/test_jk.py
@@ -14,20 +14,12 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import unittest
-import numpy as np
 import cupy
 import pyscf
-from pyscf import lib, df
-from pyscf import scf as cpu_scf
-from pyscf import df as cpu_df
-from pyscf.geomopt.geometric_solver import optimize
+from pyscf import df
 from gpu4pyscf import scf as gpu_scf
-from gpu4pyscf import df as gpu_df
-from gpu4pyscf.dft import rks
 from gpu4pyscf.df import int3c2e, df_jk
 
-lib.num_threads(8)
-
 atom='''
 Ti 0.0 0.0 0.0
 Cl 0.0 0.0 2.0
@@ -71,7 +63,7 @@ def test_vj_incore(self):
         vj_outcore = cupy.einsum('ijL,L->ij', int3c_gpu, rhoj_outcore)
         vj_incore = int3c2e.get_j_int3c2e_pass2(intopt, rhoj_incore)
         assert cupy.linalg.norm(vj_outcore - vj_incore) < 1e-5
-    
+
     def test_j_outcore(self):
         cupy.random.seed(1)
         nao = mol.nao
diff --git a/gpu4pyscf/dft/tests/test_dft.py b/gpu4pyscf/dft/tests/test_dft.py
index ff9ac469..ae27accd 100644
--- a/gpu4pyscf/dft/tests/test_dft.py
+++ b/gpu4pyscf/dft/tests/test_dft.py
@@ -26,8 +26,8 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 bas='def2-tzvpp'
-grids_level = 3
-nlcgrids_level = 1
+grids_level = 5
+nlcgrids_level = 2
 
 def setUpModule():
     global mol_sph, mol_cart
@@ -53,8 +53,9 @@ def tearDownModule():
     mol_cart.stdout.close()
     del mol_sph, mol_cart
 
-def run_dft(xc, mol):
-    mf = rks.RKS(mol_sph, xc=xc)
+def run_dft(xc, mol, disp=None):
+    mf = rks.RKS(mol, xc=xc)
+    mf.disp = disp
     mf.grids.level = grids_level
     mf.nlcgrids.level = nlcgrids_level
     e_dft = mf.kernel()
@@ -62,43 +63,70 @@ def run_dft(xc, mol):
 
 class KnownValues(unittest.TestCase):
     '''
-    known values are obtained by Q-Chem,         # def2-qzvpp
+    known values are obtained by Q-Chem
     '''
     def test_rks_lda(self):
         print('------- LDA ----------------')
         e_tot = run_dft("LDA, vwn5", mol_sph)
-        assert np.allclose(e_tot, -75.9046410402)# -75.9117117360)
+        e_ref = -75.9046410402
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE', mol_sph)
-        assert np.allclose(e_tot, -76.3800182418) #-76.3866453049)
+        e_ref = -76.3800182418
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_b3lyp(self):
         print('-------- B3LYP -------------')
         e_tot = run_dft('B3LYP', mol_sph)
-        assert np.allclose(e_tot, -76.4666495594) #-76.4728129216)
+        e_ref = -76.4666495594
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_m06(self):
         print('--------- M06 --------------')
         e_tot = run_dft("M06", mol_sph)
-        assert np.allclose(e_tot, -76.4265870634) #-76.4321318125)
+        e_ref = -76.4265870634
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_wb97(self):
         print('-------- wB97 --------------')
         e_tot = run_dft("HYB_GGA_XC_WB97", mol_sph)
-        assert np.allclose(e_tot, -76.4486274326) #-76.4543067064)
+        e_ref = -76.4486274326
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_vv10(self):
         print("------- wB97m-v -------------")
         e_tot = run_dft('HYB_MGGA_XC_WB97M_V', mol_sph)
-        assert np.allclose(e_tot, -76.4334218842) #-76.4391208632)
+        e_ref = -76.4334218842
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
     def test_rks_cart(self):
         print("-------- cart ---------------")
-        e_tot = run_dft('HYB_MGGA_XC_WB97M_V', mol_cart)
-        assert np.allclose(e_tot, -76.4334218842) #-76.4391208632)
-    #TODO: add test cases for D3/D4 and gradient
+        e_tot = run_dft('b3lyp', mol_cart)
+        e_ref = -76.4672144985
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
+
+    def test_rks_d3bj(self):
+        print('-------- B3LYP with d3bj -------------')
+        e_tot = run_dft('B3LYP', mol_sph, disp='d3bj')
+        e_ref = -76.4672233969
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-76.4728129216)
+
+    def test_rks_d4(self):
+        print('-------- B3LYP with d4 -------------')
+        e_tot = run_dft('B3LYP', mol_sph, disp='d4')
+        e_ref = -76.4669590803
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-76.4728129216)
 
 if __name__ == "__main__":
     print("Full Tests for dft")
diff --git a/gpu4pyscf/dft/tests/test_uks.py b/gpu4pyscf/dft/tests/test_uks.py
index 5e26db24..0b2139a1 100644
--- a/gpu4pyscf/dft/tests/test_uks.py
+++ b/gpu4pyscf/dft/tests/test_uks.py
@@ -46,8 +46,9 @@ def tearDownModule():
     mol.stdout.close()
     del mol
 
-def run_dft(xc):
+def run_dft(xc, disp=None):
     mf = uks.UKS(mol, xc=xc)
+    mf.disp = disp
     mf.grids.level = grids_level
     mf.nlcgrids.level = nlcgrids_level
     e_dft = mf.kernel()
@@ -60,35 +61,58 @@ class KnownValues(unittest.TestCase):
     def test_uks_lda(self):
         print('------- LDA ----------------')
         e_tot = run_dft("LDA, vwn5")
-        assert np.allclose(e_tot, -75.4231504131) #-75.42821982483972)
+        e_ref = -75.4231504131
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-75.42821982483972)
 
     def test_uks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE')
-        assert np.allclose(e_tot, -75.9128621398)# -75.91732813416843)
+        e_ref = -75.9128621398
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5# -75.91732813416843)
 
     def test_uks_b3lyp(self):
         print('-------- B3LYP -------------')
         e_tot = run_dft('B3LYP')
-        assert np.allclose(e_tot, -75.9987351592) #-76.00306439862237)
+        e_ref = -75.9987351592
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-76.00306439862237)
 
     def test_uks_m06(self):
         print('--------- M06 --------------')
         e_tot = run_dft("M06")
-        assert np.allclose(e_tot, -75.9609384616) #-75.96551006522827)
+        e_ref = -75.9609384616
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-75.96551006522827)
 
     def test_uks_wb97(self):
         print('-------- wB97 --------------')
         e_tot = run_dft("HYB_GGA_XC_WB97")
-        assert np.allclose(e_tot, -75.9833214499) #-75.987601337562)
+        e_ref = -75.9833214499
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-75.987601337562)
 
     def test_uks_vv10(self):
         print("------- wB97m-v -------------")
         e_tot = run_dft('HYB_MGGA_XC_WB97M_V')
-        assert np.allclose(e_tot, -75.9697577968)# -75.97363094678428)
+        e_ref = -75.9697577968
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5# -75.97363094678428)
 
-    #TODO: add test cases for D3/D4 and gradient
+    def test_uks_d3bj(self):
+        print('-------- B3LYP with D3BJ-------------')
+        e_tot = run_dft('B3LYP', disp='d3bj')
+        e_ref = -75.9993089249
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5#-76.00306439862237)
 
+    def test_uks_d4(self):
+        print('-------- B3LYP with D4 ------')
+        e_tot = run_dft('B3LYP', disp='d4')
+        e_ref = -75.9988910961
+        print('diff:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5#-76.00306439862237)
 if __name__ == "__main__":
     print("Full Tests for dft")
     unittest.main()
diff --git a/gpu4pyscf/grad/dispersion.py b/gpu4pyscf/grad/dispersion.py
index 72a24831..2b8799ea 100644
--- a/gpu4pyscf/grad/dispersion.py
+++ b/gpu4pyscf/grad/dispersion.py
@@ -19,7 +19,7 @@
 '''
 
 import numpy
-from gpu4pyscf.scf.hf import KohnShamDFT
+from gpu4pyscf import dft
 
 def get_dispersion(mf_grad, disp_version=None):
     '''gradient of dispersion correction for RHF/RKS'''
@@ -30,7 +30,7 @@ def get_dispersion(mf_grad, disp_version=None):
     if disp_version is None:
         return numpy.zeros([mol.natm,3])
 
-    if isinstance(mf_grad.base, KohnShamDFT):
+    if isinstance(mf_grad.base, dft.rks.KohnShamDFT):
         method = mf_grad.base.xc
     else:
         method = 'hf'
@@ -46,6 +46,8 @@ def get_dispersion(mf_grad, disp_version=None):
         from gpu4pyscf.lib import dftd4
         dftd4_model = dftd4.DFTD4Dispersion(mol, xc=method)
         res = dftd4_model.get_dispersion(grad=True)
+        print(method, disp_version)
+        print(res.get("gradient"))
         return res.get("gradient")
     else:
         raise RuntimeError(f'dispersion correction: {disp_version} is not supported.')
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 30eccbfb..ed5565e5 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -521,7 +521,7 @@ def calculate_h1e(h1_gpu, s1_gpu):
         t3 = log.timer_debug1('gradients of h1e', *t3)
 
         dvhf = mf_grad.get_veff(mol, dm0)
-        t4 = log.timer_debug1('gradients of veff', *t3)
+        log.timer_debug1('gradients of veff', *t3)
         log.debug('Computing Gradients of NR-HF Coulomb repulsion')
 
         dm0 = tag_array(dm0, mo_coeff=mo_coeff, mo_occ=mo_occ)
@@ -529,7 +529,7 @@ def calculate_h1e(h1_gpu, s1_gpu):
         for k, ia in enumerate(atmlst):
             extra_force[k] += mf_grad.extra_force(ia, locals())
 
-        t2 = log.timer_debug1('gradients of 2e part', *t3)
+        log.timer_debug1('gradients of 2e part', *t3)
 
     dh = contract('xij,ij->xi', h1, dm0)
     ds = contract('xij,ij->xi', s1, dme0)
diff --git a/gpu4pyscf/grad/tests/test_rhf_grad.py b/gpu4pyscf/grad/tests/test_rhf_grad.py
index c9c08747..e0a45e07 100644
--- a/gpu4pyscf/grad/tests/test_rhf_grad.py
+++ b/gpu4pyscf/grad/tests/test_rhf_grad.py
@@ -16,8 +16,8 @@
 import pyscf
 import numpy as np
 import unittest
-import gpu4pyscf
-from pyscf import scf
+from pyscf import scf as cpu_scf
+from gpu4pyscf import scf as gpu_scf
 
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -28,33 +28,62 @@
 bas0='cc-pvtz'
 
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def _check_grad(tol=1e-6):
-    mf = scf.hf.RHF(mol)
+def _check_grad(mol, tol=1e-6, disp=None):
+    mf = cpu_scf.hf.RHF(mol)
     mf.direct_scf_tol = 1e-10
+    mf.disp = disp
     mf.kernel()
 
-    cpu_gradient = pyscf.grad.RHF(mf)
+    cpu_gradient = mf.nuc_grad_method()
     g_cpu = cpu_gradient.kernel()
 
     gpu_gradient = cpu_gradient.to_gpu()
     g_gpu = gpu_gradient.kernel()
+    print('|| CPU - GPU ||:', np.linalg.norm(g_cpu - g_gpu))
     assert(np.linalg.norm(g_cpu - g_gpu) < tol)
 
 class KnownValues(unittest.TestCase):
     def test_grad_rhf(self):
-        _check_grad(tol=1e-6)
+        _check_grad(mol_sph, tol=1e-6)
+
+    def test_grad_cart(self):
+        _check_grad(mol_cart, tol=1e-6)
+
+    def test_grad_d3bj(self):
+        _check_grad(mol_sph, tol=1e-6, disp='d3bj')
+
+    def test_grad_d4(self):
+        _check_grad(mol_sph, tol=1e-6, disp='d4')
+
+    def test_to_cpu(self):
+        mf = gpu_scf.hf.RHF(mol_sph)
+        mf.direct_scf_tol = 1e-10
+        mf.disp = 'd3bj'
+        mf.kernel()
+
+        gpu_gradient = mf.nuc_grad_method()
+        g_gpu = gpu_gradient.kernel()
+        cpu_gradient = gpu_gradient.to_cpu()
+        g_cpu = cpu_gradient.kernel()
+        assert np.linalg.norm(g_gpu - g_cpu) < 1e-5
 
 if __name__ == "__main__":
-    print("Full Tests for Gradient")
+    print("Full Tests for RHF Gradient")
     unittest.main()
diff --git a/gpu4pyscf/grad/tests/test_rks_grad.py b/gpu4pyscf/grad/tests/test_rks_grad.py
index ad99c206..d7f6bffd 100644
--- a/gpu4pyscf/grad/tests/test_rks_grad.py
+++ b/gpu4pyscf/grad/tests/test_rks_grad.py
@@ -16,9 +16,10 @@
 import pyscf
 import cupy
 import unittest
-from pyscf.dft import rks
-import gpu4pyscf
-from gpu4pyscf.dft import numint
+import pytest
+from pyscf.dft import rks as cpu_rks
+from gpu4pyscf.dft import rks as gpu_rks
+from packaging import version
 
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -26,23 +27,32 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 
+pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
+
 bas0='def2-tzvpp'
 grids_level = 5
 nlcgrids_level = 3
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
 
-def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
-def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
-    mf = rks.RKS(mol, xc=xc)
+def tearDownModule():
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
+
+def _check_grad(mol, grid_response=False, xc='B3LYP', disp=None, tol=1e-6):
+    mf = cpu_rks.RKS(mol, xc=xc)
+    mf.disp = disp
     mf.direct_scf_tol = 1e-14
     mf.grids.level = grids_level
     mf.grids.prune = None
@@ -54,58 +64,72 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
     cpu_gradient.grid_response = grid_response
     g_cpu = cpu_gradient.kernel()
 
-    # TODO: use to_gpu functionality
-    mf.__class__ = gpu4pyscf.dft.rks.RKS
-    mf._numint = numint.NumInt()
-    mf.grids = gpu4pyscf.dft.gen_grid.Grids(mol)
-    mf.grids.level = grids_level
-    mf.grids.prune = None
-    mf.grids.small_rho_cutoff = 1e-30
-    mf.grids.build()
-    if mf._numint.libxc.is_nlc(mf.xc):
-        mf.nlcgrids = gpu4pyscf.dft.gen_grid.Grids(mol)
-        mf.nlcgrids.level = nlcgrids_level
-        mf.nlcgrids.build()
-
-    gpu_gradient = gpu4pyscf.grad.RKS(mf)
-    gpu_gradient.grid_response = grid_response
+    gpu_gradient = cpu_gradient.to_gpu()
     g_gpu = gpu_gradient.kernel()
+    print('|| CPU - GPU ||:', cupy.linalg.norm(g_cpu - g_gpu))
     assert(cupy.linalg.norm(g_cpu - g_gpu) < tol)
 
 class KnownValues(unittest.TestCase):
 
     def test_grad_with_grids_response(self):
         print("-----testing DFT gradient with grids response----")
-        _check_grad(grid_response=True, tol=1e-5)
+        _check_grad(mol_sph, grid_response=True, tol=1e-5)
 
     def test_grad_without_grids_response(self):
         print('-----testing DFT gradient without grids response----')
-        _check_grad(grid_response=False, tol=1e-5)
+        _check_grad(mol_sph, grid_response=False, tol=1e-5)
 
     def test_grad_lda(self):
         print("-----LDA testing-------")
-        _check_grad(xc='LDA', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='LDA', disp=None, tol=1e-5)
 
     def test_grad_gga(self):
         print('-----GGA testing-------')
-        _check_grad(xc='PBE', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='PBE', disp=None, tol=1e-5)
 
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
-        _check_grad(xc='B3LYP', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='B3LYP', disp=None, tol=1e-5)
 
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
-        _check_grad(xc='tpss', disp=None, tol=1e-4)
+        _check_grad(mol_sph, xc='tpss', disp=None, tol=1e-4)
 
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
-        _check_grad(xc='wb97', disp=None, tol=1e-4)
+        _check_grad(mol_sph, xc='wb97', disp=None, tol=1e-4)
 
     def test_grad_nlc(self):
         print('--------nlc testing-------------')
-        _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_grad_d3bj(self):
+        print('--------- testing RKS with D3BJ ------')
+        _check_grad(mol_sph, xc='b3lyp', disp='d3bj', tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_grad_d4(self):
+        print('--------- testing RKS with D4 ------')
+        _check_grad(mol_sph, xc='b3lyp', disp='d4', tol=1e-5)
+
+    def test_grad_cart(self):
+        print('------hybrid GGA Cart testing--------')
+        _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_to_cpu(self):
+        mf = gpu_rks.RKS(mol_sph, xc='b3lyp')
+        mf.direct_scf_tol = 1e-10
+        mf.disp = 'd3bj'
+        mf.kernel()
+
+        gpu_gradient = mf.nuc_grad_method()
+        g_gpu = gpu_gradient.kernel()
+        cpu_gradient = gpu_gradient.to_cpu()
+        g_cpu = cpu_gradient.kernel()
+        assert cupy.linalg.norm(g_gpu - g_cpu) < 1e-5
 
 if __name__ == "__main__":
-    print("Full Tests for Gradient")
+    print("Full Tests for RKS Gradient")
     unittest.main()
diff --git a/gpu4pyscf/grad/tests/test_uhf_grad.py b/gpu4pyscf/grad/tests/test_uhf_grad.py
index 9e462c05..66ff1a93 100644
--- a/gpu4pyscf/grad/tests/test_uhf_grad.py
+++ b/gpu4pyscf/grad/tests/test_uhf_grad.py
@@ -16,8 +16,7 @@
 import pyscf
 import numpy as np
 import unittest
-import gpu4pyscf
-from pyscf import scf
+from gpu4pyscf import scf
 
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -28,34 +27,65 @@
 bas0='cc-pvtz'
 
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def _check_grad(tol=1e-6):
+def _check_grad(mol, tol=1e-6, disp=None):
     mf = scf.uhf.UHF(mol)
     mf.direct_scf_tol = 1e-10
+    mf.disp = disp
     mf.kernel()
 
-    cpu_gradient = pyscf.grad.UHF(mf)
-    g_cpu = cpu_gradient.kernel()
-
-    # TODO: use to_gpu function
-    mf.__class__ = gpu4pyscf.scf.uhf.UHF
-    gpu_gradient = gpu4pyscf.grad.UHF(mf)
+    gpu_gradient = mf.nuc_grad_method()
     g_gpu = gpu_gradient.kernel()
+
+    cpu_gradient = gpu_gradient.to_cpu()
+    g_cpu = cpu_gradient.kernel()
+    print('|| CPU - GPU ||:', np.linalg.norm(g_cpu - g_gpu))
     assert(np.linalg.norm(g_cpu - g_gpu) < tol)
 
 class KnownValues(unittest.TestCase):
-    def test_grad_uks(self):
-        _check_grad(tol=1e-6)
+    def test_grad_uhf(self):
+        print('---- testing UHF -------')
+        _check_grad(mol_sph, tol=1e-6)
+
+    def test_grad_cart(self):
+        print('---- testing UHF Cart -------')
+        _check_grad(mol_cart, tol=1e-6)
+
+    def test_grad_d3bj(self):
+        print('---- testing UHF with D3(BJ) ----')
+        _check_grad(mol_sph, tol=1e-6, disp='d3bj')
+
+    def test_grad_d4(self):
+        print('------- UHF with D4 -----')
+        _check_grad(mol_sph, tol=1e-6, disp='d4')
+
+    def test_to_cpu(self):
+        mf = scf.uhf.UHF(mol_sph)
+        mf.direct_scf_tol = 1e-10
+        mf.disp = 'd3bj'
+        mf.kernel()
+
+        gpu_gradient = mf.nuc_grad_method()
+        g_gpu = gpu_gradient.kernel()
+        cpu_gradient = gpu_gradient.to_cpu()
+        g_cpu = cpu_gradient.kernel()
+        assert np.linalg.norm(g_gpu - g_cpu) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for UHF Gradient")
diff --git a/gpu4pyscf/grad/tests/test_uks_grad.py b/gpu4pyscf/grad/tests/test_uks_grad.py
index d6dc5b7a..0ed0c111 100644
--- a/gpu4pyscf/grad/tests/test_uks_grad.py
+++ b/gpu4pyscf/grad/tests/test_uks_grad.py
@@ -14,11 +14,11 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import pyscf
-import cupy
+import numpy as np
 import unittest
-from pyscf.dft import uks
-import gpu4pyscf
-from gpu4pyscf.dft import numint
+import pytest
+from gpu4pyscf.dft import uks
+from packaging import version
 
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -26,23 +26,32 @@
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
 
+pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
+
 bas0='def2-tzvpp'
 grids_level = 5
 nlcgrids_level = 3
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1)
+    mol_sph.output = '/dev/null'
+    mol_sph.build()
+    mol_sph.verbose = 1
+
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, charge=1, spin=1, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.build()
+    mol_cart.verbose = 1
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
-def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
+def _check_grad(mol, grid_response=False, xc='B3LYP', disp=None, tol=1e-6):
     mf = uks.UKS(mol, xc=xc)
+    mf.disp = disp
     mf.direct_scf_tol = 1e-14
     mf.grids.level = grids_level
     mf.grids.prune = None
@@ -50,64 +59,75 @@ def _check_grad(grid_response=False, xc='B3LYP', disp='d3bj', tol=1e-6):
     if mf._numint.libxc.is_nlc(mf.xc):
         mf.nlcgrids.level = nlcgrids_level
     mf.kernel()
-    cpu_gradient = pyscf.grad.UKS(mf)
-    cpu_gradient.grid_response = grid_response
-    g_cpu = cpu_gradient.kernel()
-
-
-    # TODO: use to_gpu functionality
-    mf.__class__ = gpu4pyscf.dft.uks.UKS
-    mf._numint = numint.NumInt()
-    mf.grids = gpu4pyscf.dft.gen_grid.Grids(mol)
-    mf.grids.level = grids_level
-    mf.grids.prune = None
-    mf.grids.small_rho_cutoff = 1e-30
-    mf.grids.build()
-    if mf._numint.libxc.is_nlc(mf.xc):
-        mf.nlcgrids = gpu4pyscf.dft.gen_grid.Grids(mol)
-        mf.nlcgrids.level = nlcgrids_level
-        mf.nlcgrids.build()
-
-    gpu_gradient = gpu4pyscf.grad.UKS(mf)
+    gpu_gradient = mf.nuc_grad_method()
     gpu_gradient.grid_response = grid_response
     g_gpu = gpu_gradient.kernel()
-    print(g_cpu - g_gpu)
-    assert(cupy.linalg.norm(g_cpu - g_gpu) < tol)
+
+    cpu_gradient = gpu_gradient.to_cpu()
+    g_cpu = cpu_gradient.kernel()
+    print('|| CPU - GPU ||:', np.linalg.norm(g_cpu - g_gpu))
+    assert(np.linalg.norm(g_cpu - g_gpu) < tol)
 
 class KnownValues(unittest.TestCase):
 
     def test_grad_with_grids_response(self):
         print("-----testing unrestricted DFT gradient with grids response----")
-        _check_grad(grid_response=True, tol=1e-5)
+        _check_grad(mol_sph, grid_response=True, tol=1e-5)
 
     def test_grad_without_grids_response(self):
         print('-----testing unrestricted DFT gradient without grids response----')
-        _check_grad(grid_response=False, tol=1e-5)
+        _check_grad(mol_sph, grid_response=False, tol=1e-5)
 
     def test_grad_lda(self):
         print("-----LDA testing-------")
-        _check_grad(xc='LDA', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='LDA', disp=None, tol=1e-5)
 
     def test_grad_gga(self):
         print('-----GGA testing-------')
-        _check_grad(xc='PBE', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='PBE', disp=None, tol=1e-5)
 
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
-        _check_grad(xc='B3LYP', disp=None, tol=1e-5)
+        _check_grad(mol_sph, xc='B3LYP', disp=None, tol=1e-5)
 
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
-        _check_grad(xc='tpss', disp=None, tol=1e-4)
+        _check_grad(mol_sph, xc='tpss', disp=None, tol=1e-4)
 
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
-        _check_grad(xc='wb97', disp=None, tol=1e-4)
+        _check_grad(mol_sph, xc='wb97', disp=None, tol=1e-4)
 
     def test_grad_nlc(self):
         print('--------nlc testing-------------')
-        _check_grad(xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5)
-
+        _check_grad(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None, tol=1e-5)
+
+    def test_grad_cart(self):
+        print('------hybrid GGA Cart testing--------')
+        _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_grad_d3bj(self):
+        print('------hybrid GGA with D3(BJ) testing--------')
+        _check_grad(mol_sph, xc='B3LYP', disp='d3bj', tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_grad_d4(self):
+        print('------hybrid GGA with D4 testing--------')
+        _check_grad(mol_sph, xc='B3LYP', disp='d4', tol=1e-5)
+
+    @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
+    def test_to_cpu(self):
+        mf = uks.UKS(mol_sph, xc='b3lyp')
+        mf.direct_scf_tol = 1e-10
+        mf.disp = 'd3bj'
+        mf.kernel()
+
+        gpu_gradient = mf.nuc_grad_method()
+        g_gpu = gpu_gradient.kernel()
+        cpu_gradient = gpu_gradient.to_cpu()
+        g_cpu = cpu_gradient.kernel()
+        assert np.linalg.norm(g_gpu - g_cpu) < 1e-5
 if __name__ == "__main__":
     print("Full Tests for UKS Gradient")
     unittest.main()
diff --git a/gpu4pyscf/grad/uhf.py b/gpu4pyscf/grad/uhf.py
index 3e95ad21..02e358b8 100644
--- a/gpu4pyscf/grad/uhf.py
+++ b/gpu4pyscf/grad/uhf.py
@@ -296,7 +296,7 @@ def calculate_h1e(h1_gpu, s1_gpu):
         t3 = log.init_timer()
         dh1e = int3c2e.get_dh1e(mol, dm0_sf)
 
-        t4 = log.timer_debug1("get_dh1e", *t3)
+        log.timer_debug1("get_dh1e", *t3)
         if mol.has_ecp():
             dh1e += rhf_grad.get_dh1e_ecp(mol, dm0_sf)
         t1 = log.timer_debug1('gradients of h1e', *t0)
@@ -306,14 +306,24 @@ def calculate_h1e(h1_gpu, s1_gpu):
         extra_force = cupy.zeros((len(atmlst),3))
         for k, ia in enumerate(atmlst):
             extra_force[k] += mf_grad.extra_force(ia, locals())
-
-        t2 = log.timer_debug1('gradients of 2e part', *t1)
+        log.timer_debug1('gradients of 2e part', *t1)
 
     dh = contract('xij,ij->xi', h1, dm0_sf)
     ds = contract('xij,ij->xi', s1, dme0_sf)
     delec = 2.0*(dh - ds)
     delec = cupy.asarray([cupy.sum(delec[:, p0:p1], axis=1) for p0, p1 in aoslices[:,2:]])
 
+
+    '''
+    print('dvhf')
+    print(dvhf)
+    print('dh1e')
+    print(dh1e)
+    print('delec')
+    print(delec)
+    print('extra')
+    print(extra_force)
+    '''
     de = 2.0 * dvhf + dh1e + delec + extra_force
 
     # for backward compatiability
diff --git a/gpu4pyscf/hessian/dispersion.py b/gpu4pyscf/hessian/dispersion.py
index 0dd68049..1567844d 100644
--- a/gpu4pyscf/hessian/dispersion.py
+++ b/gpu4pyscf/hessian/dispersion.py
@@ -19,7 +19,7 @@
 '''
 
 import numpy
-from gpu4pyscf.scf.hf import KohnShamDFT
+from gpu4pyscf import dft
 
 def get_dispersion(hessobj, disp_version=None):
     if disp_version is None:
@@ -30,7 +30,7 @@ def get_dispersion(hessobj, disp_version=None):
     h_disp = numpy.zeros([natm,natm,3,3])
     if disp_version is None:
         return h_disp
-    if isinstance(hessobj.base, KohnShamDFT):
+    if isinstance(hessobj.base, dft.rks.KohnShamDFT):
         method = hessobj.base.xc
     else:
         method = 'hf'
diff --git a/gpu4pyscf/lib/dftd4.py b/gpu4pyscf/lib/dftd4.py
index f361c604..b4a58938 100644
--- a/gpu4pyscf/lib/dftd4.py
+++ b/gpu4pyscf/lib/dftd4.py
@@ -33,7 +33,7 @@ class _d4_restype(ctypes.Structure):
 class DFTD4Dispersion(lib.StreamObject):
     def __init__(self, mol, xc, atm=False):
         coords = np.asarray(mol.atom_coords(), dtype=np.double, order='C')
-        charges = np.asarray(mol.atom_charges(), dtype=np.int32)
+        charge = np.array([mol.charge], dtype=np.double)
         nuc_types = [gto.charge(mol.atom_symbol(ia))
                      for ia in range(mol.natm)]
         nuc_types = np.asarray(nuc_types, dtype=np.int32)
@@ -47,7 +47,7 @@ def __init__(self, mol, xc, atm=False):
             ctypes.c_int(mol.natm),
             nuc_types.ctypes.data_as(ctypes.c_void_p),
             coords.ctypes.data_as(ctypes.c_void_p),
-            charges.ctypes.data_as(ctypes.c_void_p),
+            charge.ctypes.data_as(ctypes.c_void_p),
             self._lattice,
             self._periodic,
         )
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index 5ca2ec13..77ad2a46 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -92,6 +92,7 @@ def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
 
 info = lib.logger.info
 note = lib.logger.note
+warn = lib.logger.warn
 debug = lib.logger.debug
 debug1 = lib.logger.debug1
 debug2 = lib.logger.debug2
diff --git a/gpu4pyscf/lib/tests/test_to_gpu.py b/gpu4pyscf/lib/tests/test_to_gpu.py
index 7e485b89..f825be51 100644
--- a/gpu4pyscf/lib/tests/test_to_gpu.py
+++ b/gpu4pyscf/lib/tests/test_to_gpu.py
@@ -128,7 +128,7 @@ def test_df_RKS(self):
         h = hobj.kernel()
         assert numpy.abs(lib.fp(h) - 2.187025544697092) < 1e-4
 
-    # TODO: solvent
+
 
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/scf/dispersion.py b/gpu4pyscf/scf/dispersion.py
index 9443d5d7..0397d163 100644
--- a/gpu4pyscf/scf/dispersion.py
+++ b/gpu4pyscf/scf/dispersion.py
@@ -20,8 +20,8 @@
 '''
 
 
-import numpy
-from gpu4pyscf.scf.hf import KohnShamDFT
+from gpu4pyscf.scf import hf, uhf
+from gpu4pyscf.dft import rks, uks
 
 def get_dispersion(mf, disp_version=None):
     if disp_version is None:
@@ -29,7 +29,7 @@ def get_dispersion(mf, disp_version=None):
     mol = mf.mol
     if disp_version is None:
         return 0.0
-    if isinstance(mf, KohnShamDFT):
+    if isinstance(mf, rks.KohnShamDFT):
         method = mf.xc
     else:
         method = 'hf'
@@ -51,8 +51,6 @@ def get_dispersion(mf, disp_version=None):
         raise RuntimeError(f'dipersion correction: {disp_version} is not supported.')
 
 # Inject to SCF class
-from gpu4pyscf.scf import hf, uhf
-from gpu4pyscf.dft import rks, uks
 hf.RHF.get_dispersion = get_dispersion
 uhf.UHF.get_dispersion = get_dispersion
 rks.RKS.get_dispersion = get_dispersion
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index f05c3890..9f009f1d 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -87,7 +87,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
     dm_ctr_cond = np.max(
         [pyscf_lib.condense('absmax', x, l_ctr_ao_locs) for x in dms.get()], axis=0)
 
-    dm_shl = cupy.zeros([l_ctr_shell_locs[-1], l_ctr_shell_locs[-1]])
+    dm_shl = cupy.zeros([n_dm, l_ctr_shell_locs[-1], l_ctr_shell_locs[-1]])
     assert dms.flags.c_contiguous
     size_l = np.array([1,3,6,10,15,21,28])
     l_ctr = vhfopt.uniq_l_ctr[:,0]
@@ -101,13 +101,14 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
             j0 = l_ctr_ao_locs[j]
             j1 = l_ctr_ao_locs[j+1]
             nj_shls = (j1-j0)//size_l[lj]
-            sub_dm = dms[0][i0:i1,j0:j1].reshape([ni_shls, size_l[li], nj_shls, size_l[lj]])
-            dm_shl[r:r+ni_shls, c:c+nj_shls] = cupy.max(sub_dm, axis=[1,3])
+            for idm in range(n_dm):
+                sub_dm = dms[idm][i0:i1,j0:j1].reshape([ni_shls, size_l[li], nj_shls, size_l[lj]])
+                dm_shl[idm, r:r+ni_shls, c:c+nj_shls] = cupy.max(cupy.abs(sub_dm), axis=[1,3])
             c += nj_shls
         r += ni_shls
-
+    dm_shl = cupy.max(dm_shl, axis=0)
     dm_shl = cupy.log(dm_shl)
-    nshls = dm_shl.shape[0]
+    nshls = dm_shl.shape[1]
     if hermi != 1:
         dm_ctr_cond = (dm_ctr_cond + dm_ctr_cond.T) * .5
     fn = libgvhf.GINTbuild_jk
@@ -177,6 +178,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None,
             #           time.perf_counter() - t0)
             #print(li, lj, lk, ll, time.perf_counter() - t0)
             #exit()
+
     if with_j:
         vj_ao = []
         #vj = [cupy.einsum('pi,pq,qj->ij', coeff, x, coeff) for x in vj]
@@ -280,7 +282,6 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
                                 getattr(mf.opt, '_dmcondname', 'CVHFsetnr_direct_scf_dm'))
                 vhfopt.build(mf.direct_scf_tol)
                 mf._opt_gpu_omega = vhfopt
-
     vj, vk = get_jk(mol, dm, hermi, vhfopt, with_j, with_k, omega, verbose=log)
     log.timer('vj and vk on gpu', *cput0)
     return vj, vk
@@ -598,7 +599,7 @@ def check_sanity(self):
         logger.debug(self, 'cond(S) = %s', c)
         if cupy.max(c)*1e-17 > self.conv_tol:
             logger.warn(self, 'Singularity detected in overlap matrix (condition number = %4.3g). '
-                        'SCF may be inaccurate and hard to converge.', cupy.max(cond))
+                        'SCF may be inaccurate and hard to converge.', cupy.max(c))
         return super().check_sanity()
 
     build                    = hf.SCF.build
@@ -654,6 +655,7 @@ def reset(self, mol=None):
             self.mol = mol
         self._opt_gpu = None
         self._opt_gpu_omega = None
+        self.scf_summary = {}
         return self
 
 class KohnShamDFT:
diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py
index e64f5c68..365d0257 100644
--- a/gpu4pyscf/scf/tests/test_rhf.py
+++ b/gpu4pyscf/scf/tests/test_rhf.py
@@ -243,13 +243,28 @@ def test_get_k1_hermi0(self):
         mf1 = mf.to_cpu()
         refk = mf1.get_k(mol1, dm, hermi=0)
         self.assertAlmostEqual(abs(vk - refk).max(), 0, 7)
-    '''
+
     # end to end test
     def test_rhf_scf(self):
         e_tot = scf.RHF(mol).kernel()
-        self.assertAlmostEqual(e_tot, -151.08447712520285)
-    '''
+        e_ref = -151.08447712520285
+        assert np.abs(e_tot - e_ref) < 1e-5
+
+    def test_rhf_d3(self):
+        mf = scf.RHF(mol)
+        mf.disp = 'd3bj'
+        e_tot = mf.kernel()
+        e_ref = -151.1150439066
+        assert np.abs(e_tot - e_ref) < 1e-5
 
+    '''
+    def test_rhf_d4(self):
+        mf = scf.RHF(mol)
+        mf.disp = 'd4'
+        e_tot = mf.kernel()
+        e_ref = -151.08447712520285
+        assert np.abs(e_tot - e_ref) < 1e-5
+    '''
     # TODO:
     #test analyze
     #test mulliken_pop
diff --git a/gpu4pyscf/scf/tests/test_scf.py b/gpu4pyscf/scf/tests/test_scf.py
index 59882544..49d1c827 100644
--- a/gpu4pyscf/scf/tests/test_scf.py
+++ b/gpu4pyscf/scf/tests/test_scf.py
@@ -32,59 +32,95 @@
 '''
 bas='def2-qzvpp'
 def setUpModule():
-    global mol
-    mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
-    mol.output = '/dev/null'
-    mol.verbose = 0
-    mol.build()
+    global mol_sph, mol_cart
+    mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000)
+    mol_sph.output = '/dev/null'
+    mol_sph.verbose = 0
+    mol_sph.build()
+
+    mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=1)
+    mol_cart.output = '/dev/null'
+    mol_cart.verbose = 0
+    mol_cart.build()
 
 def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
+    global mol_sph, mol_cart
+    mol_sph.stdout.close()
+    mol_cart.stdout.close()
+    del mol_sph, mol_cart
 
 class KnownValues(unittest.TestCase):
     '''
     known values are obtained by Q-Chem
     '''
     def test_rhf(self):
-        mf = gpu_scf.RHF(mol)
-        mf.max_cycle = 10
+        mf = gpu_scf.RHF(mol_sph)
+        mf.max_cycle = 50
+        mf.conv_tol = 1e-9
+        e_tot = mf.kernel()
+        assert np.abs(e_tot - -76.0667232412) < 1e-5
+
+    def test_rhf_cart(self):
+        mf = gpu_scf.RHF(mol_cart)
+        mf.max_cycle = 50
+        mf.verbose = 5
         mf.conv_tol = 1e-9
         e_tot = mf.kernel()
-        assert np.allclose(e_tot, -76.0667232412)
+        assert np.abs(e_tot - -76.0668120924) < 1e-5
+
+    def test_uhf(self):
+        mf = gpu_scf.UHF(mol_cart)
+        mf.max_cycle = 50
+        mf.verbose = 5
+        mf.conv_tol = 1e-9
+        e_gpu = mf.kernel()
+
+        mf = mf.to_cpu()
+        e_cpu = mf.kernel()
+        assert np.abs(e_cpu - e_gpu) < 1e-5
+
+    def test_uhf_cart(self):
+        mf = gpu_scf.UHF(mol_cart)
+        mf.max_cycle = 50
+        mf.verbose = 5
+        mf.conv_tol = 1e-9
+        e_gpu = mf.kernel()
+
+        mf = mf.to_cpu()
+        e_cpu = mf.kernel()
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_to_cpu(self):
-        mf = gpu_scf.RHF(mol)
+        mf = gpu_scf.RHF(mol_sph)
         e_gpu = mf.kernel()
         mf = mf.to_cpu()
         e_cpu = mf.kernel()
         assert isinstance(mf, cpu_scf.hf.RHF)
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
-        mf = gpu_dft.rks.RKS(mol)
+        mf = gpu_dft.rks.RKS(mol_sph)
         e_gpu = mf.kernel()
         mf = mf.to_cpu()
         e_cpu = mf.kernel()
         assert isinstance(mf, cpu_dft.rks.RKS)
         assert 'gpu' not in mf.grids.__module__
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
     def test_to_gpu(self):
-        mf = cpu_scf.RHF(mol)
+        mf = cpu_scf.RHF(mol_sph)
         e_gpu = mf.kernel()
         mf = mf.to_gpu()
         e_cpu = mf.kernel()
         assert isinstance(mf, gpu_scf.hf.RHF)
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
-        mf = cpu_dft.rks.RKS(mol)
+        mf = cpu_dft.rks.RKS(mol_sph)
         e_gpu = mf.kernel()
         mf = mf.to_gpu()
         e_cpu = mf.kernel()
         assert isinstance(mf, gpu_dft.rks.RKS)
         assert 'gpu' in mf.grids.__module__
-        assert np.allclose(e_cpu, e_gpu)
+        assert np.abs(e_cpu - e_gpu) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for SCF")
diff --git a/gpu4pyscf/scf/tests/test_scf_ecp.py b/gpu4pyscf/scf/tests/test_scf_ecp.py
index 04765164..a1ece36e 100644
--- a/gpu4pyscf/scf/tests/test_scf_ecp.py
+++ b/gpu4pyscf/scf/tests/test_scf_ecp.py
@@ -41,7 +41,7 @@ def test_rhf(self):
         mf.max_cycle = 10
         mf.conv_tol = 1e-9
         e_tot = mf.kernel()
-        assert np.allclose(e_tot, -578.9674228876)
+        assert np.abs(e_tot - -578.9674228876) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for SCF")
diff --git a/gpu4pyscf/scf/tests/test_uhf.py b/gpu4pyscf/scf/tests/test_uhf.py
index 8a749115..3e2fd310 100644
--- a/gpu4pyscf/scf/tests/test_uhf.py
+++ b/gpu4pyscf/scf/tests/test_uhf.py
@@ -248,8 +248,30 @@ def test_get_k1_hermi0(self):
     # end to end test
     def test_uhf_scf(self):
         e_tot = scf.UHF(mol).kernel()
-        self.assertAlmostEqual(e_tot, -150.76441654065087)
+        e_ref = -150.76441654065087
+        print('--------- testing UHF -----------')
+        print('pyscf - qchem ', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
 
+    def test_uhf_d3bj(self):
+        mf = scf.UHF(mol)
+        mf.disp = 'd3bj'
+        e_tot = mf.kernel()
+        e_ref = -150.7949833081
+        print('--------- testing UHF with D3BJ ---')
+        print('pyscf - qchem ', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
+
+    '''
+    def test_uhf_d4(self):
+        mf = scf.UHF(mol)
+        mf.disp = 'd4'
+        e_tot = mf.kernel()
+        e_ref = -150.7604264160
+        print('--------- testing UHF with D4 ----')
+        print('pyscf - qchem ', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5
+    '''
     # TODO:
     #test analyze
     #test mulliken_pop
diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py
index 1566ab76..7928a601 100644
--- a/gpu4pyscf/scf/uhf.py
+++ b/gpu4pyscf/scf/uhf.py
@@ -210,8 +210,9 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
     det_ovlp           = NotImplemented
     make_asym_dm       = NotImplemented
     _finalize          = uhf.UHF._finalize
-    #screen_tol = 1e-14
-    #conv_tol_cpscf = 1e-3
+
+    screen_tol = 1e-14
+    conv_tol_cpscf = 1e-3
     DIIS = diis.SCF_DIIS
     #get_jk = _get_jk
 
@@ -221,7 +222,7 @@ def get_grad(self, mo_coeff, mo_occ, fock=None):
     density_fit = hf.RHF.density_fit
     energy_tot = hf.RHF.energy_tot
     energy_elec = energy_elec
-    
+
     make_rdm2 = NotImplemented
     dump_chk = NotImplemented
     newton = NotImplemented
diff --git a/gpu4pyscf/solvent/tests/test_pcm.py b/gpu4pyscf/solvent/tests/test_pcm.py
index f472f742..f548930e 100644
--- a/gpu4pyscf/solvent/tests/test_pcm.py
+++ b/gpu4pyscf/solvent/tests/test_pcm.py
@@ -15,8 +15,8 @@
 
 import unittest
 import numpy
-from pyscf import gto, df
-from gpu4pyscf import scf
+from pyscf import gto
+from gpu4pyscf import scf, dft
 from gpu4pyscf.solvent import pcm
 
 def setUpModule():
@@ -38,45 +38,64 @@ def tearDownModule():
     mol.stdout.close()
     del mol
 
-def _energy_with_solvent(method, unrestricted=False):
+def _energy_with_solvent(mf, method):
     cm = pcm.PCM(mol)
     cm.eps = epsilon
     cm.verbose = 0
     cm.lebedev_order = 29
     cm.method = method
-    if unrestricted:
-        mf = scf.RHF(mol).PCM(cm)
-    else:
-        mf = scf.RHF(mol).PCM(cm)
+    mf = mf.PCM(cm)
     e_tot = mf.kernel()
     return e_tot
 
 class KnownValues(unittest.TestCase):
     def test_CPCM(self):
-        e_tot = _energy_with_solvent('C-PCM')
+        e_tot = _energy_with_solvent(scf.RHF(mol), 'C-PCM')
         print(f"Energy error in RHF with C-PCM: {numpy.abs(e_tot - -74.9690902442)}")
         assert numpy.abs(e_tot - -74.9690902442) < 1e-9
 
     def test_COSMO(self):
-        e_tot = _energy_with_solvent('COSMO')
+        e_tot = _energy_with_solvent(scf.RHF(mol), 'COSMO')
         print(f"Energy error in RHF with COSMO: {numpy.abs(e_tot - -74.96900351922464)}")
         assert numpy.abs(e_tot - -74.96900351922464) < 1e-9
 
     def test_IEFPCM(self):
-        e_tot = _energy_with_solvent('IEF-PCM')
+        e_tot = _energy_with_solvent(scf.RHF(mol), 'IEF-PCM')
         print(f"Energy error in RHF with IEF-PCM: {numpy.abs(e_tot - -74.9690111344)}")
         assert numpy.abs(e_tot - -74.9690111344) < 1e-9
 
     def test_SSVPE(self):
-        e_tot = _energy_with_solvent('SS(V)PE')
+        e_tot = _energy_with_solvent(scf.RHF(mol), 'SS(V)PE')
         print(f"Energy error in RHF with SS(V)PE: {numpy.abs(e_tot - -74.9689577454)}")
         assert numpy.abs(e_tot - -74.9689577454) < 1e-9
 
     def test_uhf(self):
-        e_tot = _energy_with_solvent('IEF-PCM', unrestricted=True)
+        e_tot = _energy_with_solvent(scf.UHF(mol), 'IEF-PCM')
         print(f"Energy error in UHF with IEF-PCM: {numpy.abs(e_tot - -74.96901113434953)}")
         assert numpy.abs(e_tot - -74.96901113434953) < 1e-9
 
+    def test_rks(self):
+        e_tot = _energy_with_solvent(dft.RKS(mol, xc='b3lyp'), 'IEF-PCM')
+        print(f"Energy error in RKS with IEF-PCM: {numpy.abs(e_tot - -75.3182692148)}")
+        assert numpy.abs(e_tot - -75.3182692148) < 1e-6
+
+    def test_uks(self):
+        e_tot = _energy_with_solvent(dft.UKS(mol, xc='b3lyp'), 'IEF-PCM')
+        print(f"Energy error in UKS with IEF-PCM: {numpy.abs(e_tot - -75.3182692148)}")
+        assert numpy.abs(e_tot - -75.3182692148) < 1e-6
+
+    def test_dfrks(self):
+        e_tot = _energy_with_solvent(dft.RKS(mol, xc='b3lyp').density_fit(), 'IEF-PCM')
+        print(e_tot)
+        print(f"Energy error in DFRKS with IEF-PCM: {numpy.abs(e_tot - -75.31863727142068)}")
+        assert numpy.abs(e_tot -  -75.31863727142068) < 1e-9
+
+    def test_dfuks(self):
+        e_tot = _energy_with_solvent(dft.UKS(mol, xc='b3lyp').density_fit(), 'IEF-PCM')
+        print(e_tot)
+        print(f"Energy error in DFUKS with IEF-PCM: {numpy.abs(e_tot - -75.31863727142068)}")
+        assert numpy.abs(e_tot - -75.31863727142068) < 1e-9
+
 if __name__ == "__main__":
     print("Full Tests for PCMs")
     unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/tests/test_smd.py b/gpu4pyscf/solvent/tests/test_smd.py
index c42b4ff1..32bb62e1 100644
--- a/gpu4pyscf/solvent/tests/test_smd.py
+++ b/gpu4pyscf/solvent/tests/test_smd.py
@@ -15,8 +15,8 @@
 
 import unittest
 import numpy
-from pyscf import gto, df
-from gpu4pyscf import scf
+from pyscf import gto
+from gpu4pyscf import scf, dft
 from gpu4pyscf.solvent import smd
 
 def setUpModule():
@@ -75,7 +75,38 @@ def test_uhf(self):
         mf.with_solvent.sasa_ng = 590
         e_tot = mf.kernel()
         assert numpy.abs(e_tot - -76.07550951172617) < 2e-4
-    # TODO: add more test for other molecules
+
+    def test_rks(self):
+        mf = dft.RKS(mol, xc='b3lyp')
+        mf = mf.SMD()
+        mf.with_solvent.solvent = 'water'
+        mf.with_solvent.sasa_ng = 590
+        e_tot = mf.kernel()
+        assert numpy.abs(e_tot - -76.478626548) < 2e-4
+
+    def test_uks(self):
+        mf = dft.UKS(mol, xc='b3lyp')
+        mf = mf.SMD()
+        mf.with_solvent.solvent = 'water'
+        mf.with_solvent.sasa_ng = 590
+        e_tot = mf.kernel()
+        assert numpy.abs(e_tot - -76.478626548) < 2e-4
+
+    def test_dfrks(self):
+        mf = dft.RKS(mol, xc='b3lyp').density_fit()
+        mf = mf.SMD()
+        mf.with_solvent.solvent = 'water'
+        mf.with_solvent.sasa_ng = 590
+        e_tot = mf.kernel()
+        assert numpy.abs(e_tot - -76.47848839552529) < 1e-4
+
+    def test_dfuks(self):
+        mf = dft.UKS(mol, xc='b3lyp').density_fit()
+        mf = mf.SMD()
+        mf.with_solvent.solvent = 'water'
+        mf.with_solvent.sasa_ng = 590
+        e_tot = mf.kernel()
+        assert numpy.abs(e_tot - -76.47848839552529) < 1e-4
 
 if __name__ == "__main__":
     print("Full Tests for SMDs")