diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 7f2b816e..29ec300f 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux, X64, v100]
 
     steps:
     - uses: actions/checkout@v3
@@ -23,6 +23,7 @@ jobs:
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pytest-benchmark
         pip3 install pyscf --upgrade
         pip3 install numpy --upgrade
         pip3 install scipy --upgrade
@@ -35,8 +36,13 @@ jobs:
         export PATH=${CUDA_HOME}/bin:${PATH}
         export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
         sh build.sh
-    - name: Smoke Test
+    - name: Test RKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest --durations=0
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test UKS
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 4eb534e3..12464ab5 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -21,6 +21,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -38,7 +39,7 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
@@ -48,6 +49,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -65,4 +67,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
diff --git a/.gitignore b/.gitignore
index 427ffd8a..b8dd78e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 **/build
 **/launch_logs
 **/deps
+**/.benchmarks
 core
 **tmp*
 *.egg-info/
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 00000000..7f747686
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,98 @@
+v1.3.0 (2025-01-07)
+-------------------
+* New Features
+  - PBC analytical Fourier transform on GPU
+* Improvements
+  - Optimized computation efficiency and memory footprint for density fitting Hessian
+  - Support pickle serialization for most classes (SCF, DF, PCM, etc.)
+  - Efficiency of moving CuPy arrays between GPU cards
+
+
+v1.2.1 (2024-12-20)
+-------------------
+* New Features
+  - Change the license from GPL v3.0 to Apache 2.0
+  - Multi-GPU support for SCF, Gradients, and Hessian computation using AO-direct algorithm
+  - Add PBC HF and DFT with k-points, UHF/UKS, and density fitting
+* Improvements
+  - Change the default conv_tol_cpscf = 1e-3 / batch of atoms to conv_tol_cpscf = 1e-6 / atom
+  - Fix numerical instability in complex-valued TDHF diagonalization
+  - Improve PCM and QMMM with int1e_grids kernel
+  - Support non-symmetric int3c2e integral
+  - Optimize Hessian calculation with direct SCF
+  - Improve the numerical stability of int3c2e for point charge
+  - Add CI workflow for multi-GPU
+* Fixes
+  - Fix non-contiguous array error in p2p transfer between GPUs.
+  - Fix bugs in NMR calculations
+
+
+v1.2.0 (2024-12-09)
+-------------------
+* New Features
+  - Spin-conserved TDA and TDDFT methods
+  - Spin-flip TDA method.
+  - J-engine using McMurchie-Davidson integral algorithm
+  - Support multi-GPU density fitting energy, gradients and Hessian computation.
+  - Second order SCF solver
+* Improvements
+  - Support non-hermitian density matrix in J/K builder
+  - Secondary grids for CPHF solver
+  - 3-center integral computation efficiency for gradients and hessian
+  - One-electron Coulomb integrals against point charges and Gaussian charge distributions on grids.
+  - Automatically apply SCF initial guess from existing wavefunction
+
+
+v1.1.0 (2024-10-29)
+-------------------
+* New Features
+  - Add esp charge and resp charge by @wxj6000 in #208
+  - New Rys kernel by @sunqm in #221
+  - Optimize nuclear gradients using new Rys kernel by @sunqm in #224
+  - GPU kernel for analytical hessian by @sunqm in #227
+  - Add QM/MM by @MoleOrbitalHybridAnalyst in #218
+* Improvements
+  - Improved compatibility with pyscf 2.7.0 by @wxj6000 in #216
+  - Add skipping SCF cycles by @kvkarandashev in #229
+  - Skip building gint, gvhf, ... when building libxc by @wxj6000 in #210
+* Bugfix
+  - Typo in build_wheels.sh by @wxj6000 in #209
+  - Typo in dft_driver.py by @wxj6000 in #220
+  - Bugfix: cusolver error when specifying gpu by @wxj6000 in #213
+  - Bugfix: error in int2c2e by @wxj6000 in #212
+  - Bugfix: inconsistent gradient with CPU. Improved to_cpu, uks gradient, and grid_response by @wxj6000 in #230
+  - Bugfix: recompute int3c2e in DF UHF by @wxj6000 in #226
+  - New Contributors
+  - @MoleOrbitalHybridAnalyst made their first contribution in #218
+  - @kvkarandashev made their first contribution in #229
+
+
+v1.0.2 (2024-09-03)
+-------------------
+* Bugfix: append data in h5 file by @wxj6000 in #200
+* Support customized CHELPG radii by @wxj6000 in #202
+* Add cupy installation guide for developer installation instructions by @henryw7 in #204
+* Bugfix: save density when spin unrestricted by @wxj6000 in #205
+* Add chkfile support for pysisyphus by @henryw7 in #203
+
+
+v1.0.1 (2024-08-24)
+-------------------
+* Bugfix in rks.reset by @wxj6000 in #191. The bug leads to the failure of geometry optimization with direct SCF (#190)
+* Bugfix when CUDA unified memory is disabled. Removed CUDA unified memory in libxc, and reduced the overhead in calling libxc @wxj6000 in #180, #189
+* Bugfix and Improvement in opt_driver by @wxj6000 in #187 #197
+* Support SMD in opt_driver and dft driver @liuyu-chem1996 in #196
+* Support thermo calculation in dft_driver @liuyu-chem1996 in #192
+
+
+v1.0.0 (2024-07-23)
+-------------------
+Released features:
+* Density fitting scheme and direct SCF scheme
+* SCF, analytical gradient, and analytical Hessian calculations for Hartree-Fock and DFT
+* Spin-conserved and spin-flip TDA and TDDFT for excited states
+* Nonlocal functional correction (vv10) for SCF and gradient
+* PCM models, SMD model, their analytical gradients, and semi-analytical Hessian matrix
+* Unrestricted Hartree-Fock and unrestricted DFT, gradient, and Hessian
+* MP2/DF-MP2 and CCSD (experimental)
+* Polarizability, IR, and NMR shielding (experimental)
diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
new file mode 100644
index 00000000..8455f3f0
--- /dev/null
+++ b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -0,0 +1,141 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import cupy as cp
+from cupyx import profiler
+from gpu4pyscf.lib.cupy_helper import copy_array
+
+'''
+Benchmark different ways of transferring data from pinned memory to device
+'''
+
+# Host array backed by pinned memory; 8 bytes per float64 element
+host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8)
+big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array)
+big_host_data = big_host_data.reshape(512,512,512)
+big_host_data[:] = np.random.rand(512,512,512)  # assign; do not accumulate onto the buffer's prior contents
+
+# Device array with the same shape and dtype as the host array
+big_device_data = cp.empty_like(big_host_data)
+
+# Create views on both arrays
+host_view = big_host_data[:, 128:]  # Non-contiguous view on the host
+device_view = big_device_data[:, 128:]  # Non-contiguous view on the device
+
+print("Host View Shape:", host_view.shape)
+print("Device View Shape:", device_view.shape)
+
+print("------ Benchmark host to device transfer ----------")
+size = host_view.nbytes  # bytes moved by one copy of the view
+perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    out[:] = cp.asarray(c)  # upload the host array, then assign into the device view
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("------- Benchmark device to host transfer ---------")
+size = host_view.nbytes  # host and device views have the same shape and dtype
+perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    out[:] = c.get()  # download the device array, then assign into the host view
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (non-contiguous) ---------")
+# This section allocates on devices 0 and 1, so two GPUs must be available.
+with cp.cuda.Device(0):
+    a = cp.random.rand(512,512,512)
+    device0_view = a[:,128:]
+with cp.cuda.Device(1):
+    b = cp.random.rand(512,512,512)
+    device1_view = b[:,128:]
+perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+# Sanity check: after the copy both views must hold the same values.
+assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10
+# Baseline for comparison: stage through host memory with get(), then upload on out's device.
+def cupy_copy(c, out):
+    with cp.cuda.Device(out.device):
+        out[:] = cp.asarray(c.get())
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (contiguous) ---------")
+perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9  # the full array is copied here, not the strided view
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy_contiguous(a, b):
+    b[:] = a
+perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9
+print('Cupy copy contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_asarray_contiguous(a, b):
+    with cp.cuda.Device(b.device):
+        b = cp.asarray(a)  # rebinds the local name only; the caller's b is not modified
+perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9
+print('Cupy set contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+assert np.linalg.norm(a.get() - b.get()) < 1e-10
+
+
+print('----------- Benchmark reduction across devices ------ ')
+from gpu4pyscf.lib.cupy_helper import reduce_to_device
+_num_devices = cp.cuda.runtime.getDeviceCount()
+a_dist = []  # one array per visible device
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a = cp.random.rand(512,512,512)
+        a_dist.append(a)
+
+perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9  # total bytes held across all devices
+print('Using reduce_to_device', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
diff --git a/examples/00-h2o.py b/examples/00-h2o.py
index 62518557..58c9076e 100644
--- a/examples/00-h2o.py
+++ b/examples/00-h2o.py
@@ -36,12 +36,12 @@
     atom=atom,                         # water molecule
     basis='def2-tzvpp',                # basis set
     output='./pyscf.log',              # save log file
-    verbose=6                         # control the level of print info
+    verbose=6                          # control the level of print info
     )
 
 mf_GPU = rks.RKS(                      # restricted Kohn-Sham DFT
     mol,                               # pyscf.gto.object
-    xc='b3lyp'                        # xc funtionals, such as pbe0, wb97m-v, tpss,
+    xc='b3lyp'                         # xc functionals, such as pbe0, wb97m-v, tpss,
     ).density_fit()                    # density fitting
 
 mf_GPU.grids.atom_grid = (99,590)      # (99,590) lebedev grids, (75,302) is often enough
@@ -51,7 +51,7 @@
 
 # Compute Energy
 e_dft = mf_GPU.kernel()
-print(f"total energy = {e_dft}") # -76.26736519501688
+print(f"total energy = {e_dft}")       # -76.46668196729536
 
 # Compute Gradient
 g = mf_GPU.nuc_grad_method()
diff --git a/examples/02-h2o_geomopt.py b/examples/02-h2o_geomopt.py
index 1ca982a9..eaadbc26 100644
--- a/examples/02-h2o_geomopt.py
+++ b/examples/02-h2o_geomopt.py
@@ -43,4 +43,4 @@ def callback(envs):
 mol_eq = optimize(mf_GPU, maxsteps=20, callback=callback)
 print("Optimized coordinate:")
 print(mol_eq.atom_coords())
-print('geometry optimization took', time.time() - start_time, 's')
+print('Geometry optimization took', time.time() - start_time, 's')
diff --git a/examples/04-h2o_esp.py b/examples/04-h2o_esp.py
index 9b04c485..264b3685 100644
--- a/examples/04-h2o_esp.py
+++ b/examples/04-h2o_esp.py
@@ -21,6 +21,7 @@
 import numpy as np
 from pyscf import gto
 from gpu4pyscf.dft import rks
+from gpu4pyscf.gto.int3c1e import int1e_grids
 
 atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -33,10 +34,8 @@
 mf.kernel()
 dm = mf.make_rdm1()  # compute one-electron density matrix
 
-# Use default mesh grids
-coords = mf.grids.coords.get()
+# Use default Lebedev grids
+coords = mf.grids.coords
 
-# The efficiency can be improved if needed
-from pyscf import df
-fakemol = gto.fakemol_for_charges(coords)
-v = np.einsum('ijp,ij->p', df.incore.aux_e2(mol, fakemol), dm)
+# Calculate electrostatic potential
+v = int1e_grids(mol, coords, dm=dm) # performing 'ijp,ij->p' efficiently
diff --git a/examples/05-h2o_multipole_moment.py b/examples/05-h2o_multipole_moment.py
index e360d859..1ea7c677 100644
--- a/examples/05-h2o_multipole_moment.py
+++ b/examples/05-h2o_multipole_moment.py
@@ -32,10 +32,10 @@
 mf.kernel()
 dm = mf.make_rdm1()
 
-dip = mf.dip_moment(unit='DEBYE', dm=dm.get())
+dip = mf.dip_moment(unit='DEBYE', dm=dm)
 print('dipole moment:')
 print(dip)
 
-quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm.get())
+quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm)
 print('quadrupole moment:')
 print(quad)
diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py
index 3fb05d4e..00ea6054 100644
--- a/examples/14-pcm_solvent.py
+++ b/examples/14-pcm_solvent.py
@@ -31,9 +31,9 @@
 mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
 mf = mf.PCM()
 mf.grids.atom_grid = (99,590)
-mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
-mf.with_solvent.method = 'IEF-PCM'
-mf.with_solvent.eps = 78.3553
+mf.with_solvent.lebedev_order = 29  # 302 Lebedev grids
+mf.with_solvent.method = 'IEF-PCM'   # Can be C-PCM, SS(V)PE, COSMO
+mf.with_solvent.eps = 78.3553        # Dielectric constant
 mf.kernel()
 
 gradobj = mf.nuc_grad_method()
diff --git a/examples/15-chelpg.py b/examples/15-chelpg.py
index 75161162..8e94d92a 100644
--- a/examples/15-chelpg.py
+++ b/examples/15-chelpg.py
@@ -32,18 +32,18 @@
 mol.basis = '631g'
 mol.unit = 'B'
 mol.build()
-mol.verbose = 6
+mol.verbose = 4
 
 xc = 'b3lyp'
 mf = rks.RKS(mol, xc=xc)
 mf.grids.level = 5
 mf.kernel()
 q = chelpg.eval_chelpg_layer_gpu(mf)
-print('partial charge with CHELPG, using modified Bondi radii')
+print('Partial charge with CHELPG, using modified Bondi radii')
 print(q) # [ 0.04402311  0.11333945 -0.25767919  0.10031663]
 
 # Customize the radii used for calculating CHELPG charges
 from pyscf.data import radii
 q = chelpg.eval_chelpg_layer_gpu(mf, Rvdw=radii.UFF)
-print('partial charge with CHELPG, using UFF radii')
+print('Partial charge with CHELPG, using UFF radii')
 print(q)
diff --git a/examples/16-smd_solvent.py b/examples/16-smd_solvent.py
index e606d74a..446fe38c 100644
--- a/examples/16-smd_solvent.py
+++ b/examples/16-smd_solvent.py
@@ -28,16 +28,14 @@
 
 mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=1)
 mf = dft.rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
-mf = mf.SMD()
 mf.grids.atom_grid = (99,590)
-mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
-mf.with_solvent.method = 'SMD'
-mf.with_solvent.solvent = 'water'
-e_tot = mf.kernel()
-print('total energy with SMD:', e_tot)
+e_gas = mf.kernel()
+print('total energy in gas phase:', e_gas)
 
-gradobj = mf.nuc_grad_method()
-f = gradobj.kernel()
+mf = mf.SMD()   # Add SMD model to the mean-field object
+mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids,
+mf.with_solvent.solvent = 'water' # Has to be a string, lookup the solvent name from https://comp.chem.umn.edu/solvation/mnsddb.pdf
+e_smd = mf.kernel()
+print('total energy in water:', e_smd)
 
-hessobj = mf.Hessian()
-h = hessobj.kernel()
+print('Solvation free energy:', e_smd - e_gas)
diff --git a/examples/19-unrestricted_dft.py b/examples/19-unrestricted_dft.py
index 86e59402..0ebaec5b 100644
--- a/examples/19-unrestricted_dft.py
+++ b/examples/19-unrestricted_dft.py
@@ -49,14 +49,3 @@
 
 hobj_with_pcm = mf_with_pcm.Hessian()
 h = hobj_with_pcm.kernel()
-
-# SCF, gradient, and Hessian for DF-UKS with IEF-PCM
-mf_with_smd = mf.SMD()
-mf_with_smd.with_solvent.solvent = 'water'
-mf_with_smd.kernel()
-
-gobj_with_smd = mf_with_smd.nuc_grad_method()
-g = gobj_with_smd.kernel()
-
-hobj_with_smd = mf_with_smd.Hessian()
-h = hobj_with_smd.kernel()
diff --git a/examples/20-dfmp2.py b/examples/20-dfmp2.py
index e00c9b78..6edfc100 100644
--- a/examples/20-dfmp2.py
+++ b/examples/20-dfmp2.py
@@ -35,7 +35,18 @@
 e_corr, t2 = ptobj.kernel()
 e_mp2 = e_hf + e_corr
 
+# The kernel prints out the MP2 energies; they are also accessible as attributes of the PT object.
+print('MP2 correlation energy:', ptobj.emp2)
+print('SCS MP2 correlation energy:', ptobj.emp2_scs)
+print('Total energy with SCS MP2:', ptobj.e_tot_scs)
+
+print('----- frozen core --------')
+
 # frozen core
 ptobj.frozen = [0]
 e_corr, t2 = ptobj.kernel()
 e_mp2 = e_hf + e_corr
+
+print('MP2 correlation energy:', ptobj.emp2)
+print('SCS MP2 correlation energy:', ptobj.emp2_scs)
+print('Total energy with SCS MP2:', ptobj.e_tot_scs)
diff --git a/examples/22-resp_charge.py b/examples/22-resp_charge.py
index 208adc27..7e83d290 100644
--- a/examples/22-resp_charge.py
+++ b/examples/22-resp_charge.py
@@ -42,11 +42,11 @@
 print(q0)
 
 # RESP charge // first stage fitting
-q1 = esp.resp_solve(mol, dm)    
+q1 = esp.resp_solve(mol, dm)
 
-# Add constraint: fix those charges in the second stage 
+# Add constraint: fix those charges in the second stage
 # q2[4] = q1[4]
-# q2[5] = q1[5] 
+# q2[5] = q1[5]
 # q2[6] = q1[6]
 # q2[7] = q1[7]
 sum_constraints = []
@@ -58,7 +58,7 @@
 equal_constraints = [[1,2,3]]
 
 # RESP charge // second stage fitting
-q2 = esp.resp_solve(mol, dm, resp_a=1e-3, 
+q2 = esp.resp_solve(mol, dm, resp_a=1e-3,
                     sum_constraints=sum_constraints,
                     equal_constraints=equal_constraints)
 print('Fitted RESP charge')
diff --git a/examples/23-qmmm_pbc.py b/examples/24-qmmm_pbc.py
similarity index 100%
rename from examples/23-qmmm_pbc.py
rename to examples/24-qmmm_pbc.py
diff --git a/examples/24-cp_bsse.py b/examples/25-cp_bsse.py
similarity index 86%
rename from examples/24-cp_bsse.py
rename to examples/25-cp_bsse.py
index 45a2c845..697cf8bc 100644
--- a/examples/24-cp_bsse.py
+++ b/examples/25-cp_bsse.py
@@ -21,15 +21,15 @@
 from gpu4pyscf.dft import rks
 
 atom_A = [
-('O', (0.000000, 0.000000, 0.000000)),
-('H', (0.000000, 0.757160, 0.586260)),
-('H', (0.000000, -0.757160, 0.586260))
+    ('O', (0.000000, 0.000000, 0.000000)),
+    ('H', (0.000000, 0.757160, 0.586260)),
+    ('H', (0.000000, -0.757160, 0.586260))
 ]
 
 atom_B = [
-('O', (0.000000, 0.000000, 2.913530)),
-('H', (0.000000, 0.757160, 3.499790)),
-('H', (0.000000, -0.757160, 3.499790))
+    ('O', (0.000000, 0.000000, 2.913530)),
+    ('H', (0.000000, 0.757160, 3.499790)),
+    ('H', (0.000000, -0.757160, 3.499790))
 ]
 
 atom_AB = atom_A + atom_B
@@ -51,7 +51,7 @@
 mol_B_ghost.build()
 
 def solve_dft(mol, xc='b3lyp'):
-    mf = rks.RKS(mol, xc='b3lyp').density_fit()
+    mf = rks.RKS(mol, xc=xc).density_fit()
     mf.grids.atom_grid = (99,590)
     return mf.kernel()
 
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 13aaa0ce..0be7f410 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -27,7 +27,6 @@
 parser.add_argument("--solvent",      type=str,  default='')
 args = parser.parse_args()
 
-lib.num_threads(16)
 start_time = time.time()
 bas = args.basis
 mol = pyscf.M(
@@ -52,7 +51,7 @@
 mf_df.direct_scf_tol = 1e-14
 mf_df.conv_tol = 1e-10
 mf_df.chkfile = None
-mf_df.conv_tol_cpscf = 1e-3
+mf_df.conv_tol_cpscf = 1e-6
 e_tot = mf_df.kernel()
 scf_time = time.time() - start_time
 print(f'compute time for energy: {scf_time:.3f} s')
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 4526d79d..b823b43b 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = '1.2.1'
+__version__ = '1.3.0'
 
 from . import lib, grad, hessian, solvent, scf, dft
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 52b0ecf8..da61804c 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -20,7 +20,8 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
+from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, 
+                                       cart2sph, p2p_transfer, copy_array)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -36,7 +37,7 @@
 class DF(lib.StreamObject):
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    _keys = {'intopt', 'mol', 'auxmol', 'use_gpu_memory'}
+    _keys = {'intopt', 'nao', 'naux', 'cd_low', 'mol', 'auxmol', 'use_gpu_memory'}
 
     def __init__(self, mol, auxbasis=None):
         self.mol = mol
@@ -52,8 +53,12 @@ def __init__(self, mol, auxbasis=None):
         self.naux = None
         self.cd_low = None
         self._cderi = None
+        self._vjopt = None
         self._rsh_df = {}
 
+    __getstate__, __setstate__ = lib.generate_pickle_methods(
+        excludes=('cd_low', 'intopt', '_cderi', '_vjopt'))
+
     @property
     def auxbasis(self):
         return self._auxbasis
@@ -138,8 +143,7 @@ def get_blksize(self, extra=0, nao=None):
         log = logger.new_logger(self.mol, self.mol.verbose)
         device_id = cupy.cuda.Device().id
         log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}")
-        if blksize < ALIGNED:
-            raise RuntimeError("Not enough GPU memory")
+        assert blksize > 0
         return blksize
 
     def loop(self, blksize=None, unpack=True):
@@ -222,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         log.debug("Saving CDERI on CPU")
 
     _cderi = {}
-    blksize = (naux + _num_devices - 1) // _num_devices
-    for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
+    aux_blksize = (naux + _num_devices - 1) // _num_devices
+    aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
+    for device_id in range(_num_devices):
+        p0 = min(aux_blksize*device_id, naux)
+        p1 = min(aux_blksize*(device_id+1), naux)
+        #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
         if use_gpu_memory:
             with cupy.cuda.Device(device_id), _streams[device_id]:
                 _cderi[device_id] = cupy.empty([p1-p0, npairs])
-            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}")
+            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
         else:
             mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8)
             cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem)
@@ -249,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             task_list = task_list_per_device[device_id]
-            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
+            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
             futures.append(future)
 
@@ -261,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     return _cderi
 
-def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
+def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, 
+                omega=None, sr_only=False, device_id=0):
     ''' Execute CDERI tasks on one device
     '''
     nq = len(intopt.log_qs)
@@ -270,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
     naoaux = cd_low.shape[0]
     npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
     pairs_loc = np.append(0, np.cumsum(npairs))
-    blksize = (naux + _num_devices - 1) // _num_devices
     with cupy.cuda.Device(device_id), _streams[device_id]:
         assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
@@ -341,13 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             ij0 = pairs_loc[cp_ij_id]
             ij1 = pairs_loc[cp_ij_id+1]
             if isinstance(_cderi[0], np.ndarray):
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    for i in range(p0,p1):
-                        cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
+            elif _num_devices > 1:
+                # Multi-GPU case, copy data to other Devices
+                for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
+                    # Making a copy for contiguous data transfer
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    with cupy.cuda.Device(dev_id):
+                        tmp = copy_array(tmp)
+                        _cderi[dev_id][:,ij0:ij1] = tmp
             else:
-                # Copy data to other Devices
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
-            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
+                _cderi[0][:,ij0:ij1] = cderi_block
+            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index d2083f41..5561cf9c 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -122,7 +122,7 @@ class _DFHF:
     to_gpu = utils.to_gpu
     device = utils.device
     __name_mixin__ = 'DF'
-    _keys = {'rhoj', 'rhok', 'disp', 'screen_tol'}
+    _keys = {'rhoj', 'rhok', 'disp', 'screen_tol', 'with_df', 'only_dfj'}
 
     def __init__(self, mf, dfobj, only_dfj):
         self.__dict__.update(mf.__dict__)
@@ -132,7 +132,6 @@ def __init__(self, mf, dfobj, only_dfj):
         self.direct_scf = False
         self.with_df = dfobj
         self.only_dfj = only_dfj
-        self._keys = mf._keys.union(['with_df', 'only_dfj'])
 
     def undo_df(self):
         '''Remove the DFHF Mixin'''
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
index 4139726e..2bbf9d9e 100644
--- a/gpu4pyscf/df/grad/jk.py
+++ b/gpu4pyscf/df/grad/jk.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 from concurrent.futures import ThreadPoolExecutor
+import numpy as np
 import cupy
-from gpu4pyscf.lib.cupy_helper import contract, concatenate
+from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
+from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 
@@ -54,7 +56,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
         t0 = log.timer_debug1(f'rhoj and rhok on Device {device_id}', *t0)
     return rhoj, rhok
 
-def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
+def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True):
     ''' Calculate rhoj and rhok on Multi-GPU system
     '''
     futures = []
@@ -80,3 +82,112 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
         rhok = concatenate(rhok_total)
 
     return rhoj, rhok
+
+def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list,
+                with_j=True, with_k=True, device_id=0, omega=None):
+    mol = intopt.mol
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(mol, mol.verbose)
+        t0 = (logger.process_clock(), logger.perf_counter())
+        # Worker for one device: builds vj, vk, vjaux, vkaux from the aux blocks listed in task_list.
+        orbo_cart = cupy.asarray(orbo_cart)
+        cart_aux_loc = intopt.cart_aux_loc
+        nao_cart = dm_cart.shape[0]
+        naux_cart = intopt._sorted_auxmol.nao
+        vj = vk = vjaux = vkaux = None  # outputs of a disabled J or K path stay None
+        if with_j:
+            rhoj_cart = cupy.asarray(rhoj_cart)
+            dm_cart = cupy.asarray(dm_cart)  # NOTE(review): only converted when with_j is set - confirm the K-only path needs no copy
+            vj = cupy.zeros((3,nao_cart), order='C')
+            vjaux = cupy.zeros((3,naux_cart))
+        if with_k:
+            rhok_cart = cupy.asarray(rhok_cart)
+            vk = cupy.zeros((3,nao_cart), order='C')
+            vkaux = cupy.zeros((3,naux_cart))
+        # Loop over the auxiliary blocks assigned to this worker.
+        for cp_kl_id in task_list:
+            k0, k1 = cart_aux_loc[cp_kl_id], cart_aux_loc[cp_kl_id+1]  # aux index range of this block
+            rhoj_tmp = rhok_tmp = None
+            if with_j:
+                rhoj_tmp = rhoj_cart[k0:k1]
+            if with_k:
+                rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
+                rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
+            '''
+            if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
+                rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
+
+            if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
+                rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
+            '''
+            '''
+            # outcore implementation
+            buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1)
+            size = 3*(k1-k0)*nao_cart*nao_cart
+            int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
+            rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart)
+            vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1])
+            vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip)
+
+            buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2)
+            int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
+            rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart)
+            vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1])
+            vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
+            '''
+            vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
+            if with_j: vj += vj_tmp  # 'ip1' results are summed over all blocks
+            if with_k: vk += vk_tmp
+            vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
+            if with_j: vjaux[:, k0:k1] = vj_tmp  # 'ip2' results are stored in the block's own aux range
+            if with_k: vkaux[:, k0:k1] = vk_tmp
+
+            rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None  # drop references to the per-block temporaries
+            t0 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t0)
+    return vj, vk, vjaux, vkaux
+
+def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
+                 with_j=True, with_k=True, omega=None):
+    ''' Evaluate the 3-center gradient intermediates, one worker thread per device.
+    vj    = (i'j|L)(L|kl)(ij)(kl), vk    = (i'j|L)(L|kl)(ik)(jl)
+    vjaux = (ij|L')(L|kl)(ij)(kl), vkaux = (ij|L')(L|kl)(ik)(jl)
+    Returns (vj, vk, vjaux, vkaux); a J or K pair is None when its flag is off.
+    '''
+
+    # Build the integral environment; the aux grouping follows the DF block size.
+    blksize = with_df.get_blksize(nao=dm_cart.shape[0])
+    intopt = VHFOpt(mol, auxmol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False,
+                 group_size_aux=blksize, verbose=0)
+
+    # Distribute the aux blocks over the devices, weighted by block size.
+    aux_offsets = np.array(intopt.aux_ao_loc)
+    task_list = _split_tasks(np.diff(aux_offsets), _num_devices)
+
+    # Wait for pending work on the current stream before the workers start.
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        futures = [
+            executor.submit(
+                _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[dev_id],
+                with_j=with_j, with_k=with_k, device_id=dev_id, omega=omega)
+            for dev_id in range(_num_devices)]
+
+    # Collect the partial results returned by each worker.
+    vj_parts, vk_parts, vjaux_parts, vkaux_parts = [], [], [], []
+    for future in futures:
+        vj_dev, vk_dev, vjaux_dev, vkaux_dev = future.result()
+        vj_parts.append(vj_dev)
+        vk_parts.append(vk_dev)
+        vjaux_parts.append(vjaux_dev)
+        vkaux_parts.append(vkaux_dev)
+
+    # Combine the partial results with reduce_to_device, enabled paths only.
+    vj = vk = vjaux = vkaux = None
+    if with_j:
+        vj = reduce_to_device(vj_parts)
+        vjaux = reduce_to_device(vjaux_parts)
+    if with_k:
+        vk = reduce_to_device(vk_parts)
+        vkaux = reduce_to_device(vkaux_parts)
+    return vj, vk, vjaux, vkaux
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 681e18be..17816bc8 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -22,7 +22,7 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
-from gpu4pyscf.df.grad.jk import get_rhoj_rhok
+from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk
 
 LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
@@ -44,6 +44,7 @@ def j2c_solver(v):
     mask = w > lindep
     v1 = v[:,mask]
     j2c = cupy.dot(v1/w[mask], v1.conj().T)
+    w = v = v1 = mask = None
     def j2c_solver(b): # noqa: F811
         return j2c.dot(b.reshape(j2c.shape[0],-1)).reshape(b.shape)
     return j2c_solver
@@ -61,7 +62,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     # extended to any 1-particle density matrix
 
     if(dm0 is None): dm0 = mf_grad.base.make_rdm1()
-    mf = mf_grad.base
     if omega is None:
         with_df = mf_grad.base.with_df
     else:
@@ -91,7 +91,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     mo_coeff = None
     orbo = intopt.sort_orbitals(orbo, axis=[0])
 
-    rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k)
+    rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k)
     
     # (d/dX P|Q) contributions
     if omega and omega > 1e-10:
@@ -101,6 +101,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         int2c_e1 = auxmol.intor('int2c2e_ip1')
     int2c_e1 = cupy.asarray(int2c_e1)
 
+    rhoj_cart = rhok_cart = None
     auxslices = auxmol.aoslice_by_atom()
     aux_cart2sph = intopt.aux_cart2sph
     low = with_df.cd_low
@@ -128,6 +129,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         elif low.tag == 'cd':
             #rhok = solve_triangular(low_t, rhok, lower=False)
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
+            rhok = rhok.copy(order='C')
         tmp = contract('pij,qij->pq', rhok, rhok)
         tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
@@ -142,12 +144,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     t0 = log.timer_debug1('rhoj and rhok', *t0)
     int2c_e1 = None
 
-    nao_cart = intopt._sorted_mol.nao
-    block_size = with_df.get_blksize(nao=nao_cart)
-
-    intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
-                 group_size_aux=block_size)#, group_size=block_size)
     dm_cart = dm
     orbo_cart = orbo
     if not mol.cart:
@@ -155,63 +151,14 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         cart2sph = intopt.cart2sph
         orbo_cart = cart2sph @ orbo
         dm_cart = cart2sph @ dm @ cart2sph.T
-
-    dm = orbo = None
-    vj = vk = rhoj_tmp = rhok_tmp = None
-    vjaux = vkaux = None
-
-    naux_cart = intopt._sorted_auxmol.nao
-    if with_j:
-        vj = cupy.zeros((3,nao_cart), order='C')
-        vjaux = cupy.zeros((3,naux_cart))
-    if with_k:
-        vk = cupy.zeros((3,nao_cart), order='C')
-        vkaux = cupy.zeros((3,naux_cart))
-    cupy.get_default_memory_pool().free_all_blocks()
-    t1 = log.init_timer()
-    for cp_kl_id in range(len(intopt.aux_log_qs)):
-        k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
-        assert k1-k0 <= block_size
-        if with_j:
-            rhoj_tmp = rhoj_cart[k0:k1]
-        if with_k:
-            rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
-            rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
-        '''
-        if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
-            rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
-
-        if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
-            rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
-        '''
-        '''
-        # outcore implementation
-        buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1)
-        size = 3*(k1-k0)*nao_cart*nao_cart
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart)
-        vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1])
-        vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip)
-
-        buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2)
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart)
-        vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1])
-        vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
-        '''
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vj += vj_tmp
-        if with_k: vk += vk_tmp
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vjaux[:, k0:k1] = vj_tmp
-        if with_k: vkaux[:, k0:k1] = vk_tmp
-
-        rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
-        t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
-    
+        
+    with_df._cderi = None # release GPU memory
+    vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
+                                        with_j=with_j, with_k=with_k, omega=omega)
     # NOTE: vj and vk are still in cartesian
     _sorted_mol = intopt._sorted_mol
     natm = _sorted_mol.natm
+    nao_cart = _sorted_mol.nao
     ao2atom = numpy.zeros([nao_cart, natm])
     ao_loc = _sorted_mol.ao_loc
     for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]):
@@ -225,6 +172,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
 
     _sorted_auxmol = intopt._sorted_auxmol
     natm = _sorted_auxmol.natm
+    naux_cart = _sorted_auxmol.nao
     aux2atom = numpy.zeros([naux_cart, natm])
     ao_loc = _sorted_auxmol.ao_loc
     for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
@@ -237,7 +185,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     if with_k:
         vkaux_3c = aux2atom.T @ vkaux.T
         vkaux = vkaux_2c - vkaux_3c
-    
     return vj, vk, vjaux, vkaux
 
 
diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
index fc8de3be..53acd7e0 100644
--- a/gpu4pyscf/df/grad/uhf.py
+++ b/gpu4pyscf/df/grad/uhf.py
@@ -18,11 +18,11 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import scf, gto
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library
+from gpu4pyscf.lib.cupy_helper import tag_array, contract
 from gpu4pyscf.grad import uhf as uhf_grad
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
-from gpu4pyscf.df.grad.jk import get_rhoj_rhok
+from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk
 
 FREE_CUPY_CACHE = True
 BINSIZE = 128
@@ -80,39 +80,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
     # (L|ij) -> rhoj: (L), rhok: (L|oo)
     low = with_df.cd_low
-    rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k)
+    rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k)
     if dm2 is not None:
-        rhoj2, _   = get_rhoj_rhok(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False)
-    '''
-    rows = with_df.intopt.cderi_row
-    cols = with_df.intopt.cderi_col
-    dm_sparse = dm[rows, cols]
-    dm_sparse[with_df.intopt.cderi_diag] *= .5
-    if dm2 is not None:
-        dm2_sparse = dm2_tmp[rows, cols]
-        dm2_sparse[with_df.intopt.cderi_diag] *= .5
-
-    blksize = with_df.get_blksize()
-    if with_j:
-        rhoj = cupy.empty([naux])
-        if dm2 is not None:
-            rhoj2 = cupy.empty([naux])
-    if with_k:
-        rhok = cupy.empty([naux, nocc, nocc], order='C')
-    p0 = p1 = 0
-
-    for cderi, cderi_sparse in with_df.loop(blksize=blksize):
-        p1 = p0 + cderi.shape[0]
-        if with_j:
-            rhoj[p0:p1] = 2.0*dm_sparse.dot(cderi_sparse)
-            if dm2 is not None:
-                rhoj2[p0:p1] = 2.0*dm2_sparse.dot(cderi_sparse)
-        if with_k:
-            tmp = contract('Lij,jk->Lki', cderi, orbo)
-            contract('Lki,il->Lkl', tmp, orbo, out=rhok[p0:p1])
-        p0 = p1
-    tmp = dm_sparse = cderi_sparse = cderi = None
-    '''
+        rhoj2, _   = get_rhojk(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False)
 
     # (d/dX P|Q) contributions
     if omega and omega > 1e-10:
@@ -120,7 +90,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             int2c_e1 = auxmol.intor('int2c2e_ip1')
     else:
         int2c_e1 = auxmol.intor('int2c2e_ip1')
+
     int2c_e1 = cupy.asarray(int2c_e1)
+    rhoj_cart = rhok_cart = None
     auxslices = auxmol.aoslice_by_atom()
     aux_cart2sph = intopt.aux_cart2sph
     low_t = low.T.copy()
@@ -154,6 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             rhok = contract('pq,qij->pij', low_t.T, rhok)
         elif low.tag == 'cd':
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
+            rhok = rhok.copy(order='C')
         tmp = contract('pij,qij->pq', rhok, rhok)
         tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
@@ -192,58 +165,10 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
         orbo_cart = orbo
     dm = orbo = None
 
-    vj = vk = rhoj_tmp = rhok_tmp = None
-    vjaux = vkaux = None
-
-    naux_cart = intopt._sorted_auxmol.nao
-    if with_j:
-        vj = cupy.zeros((3,nao_cart), order='C')
-        vjaux = cupy.zeros((3,naux_cart))
-    if with_k:
-        vk = cupy.zeros((3,nao_cart), order='C')
-        vkaux = cupy.zeros((3,naux_cart))
-    cupy.get_default_memory_pool().free_all_blocks()
-    t1 = log.init_timer()
-    for cp_kl_id in range(len(intopt.aux_log_qs)):
-        k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
-        assert k1-k0 <= block_size
-        if with_j:
-            rhoj_tmp = rhoj_cart[k0:k1]
-        if with_k:
-            rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
-            rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
-        '''
-        if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
-            rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
-
-        if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
-            rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
-        '''
-        '''
-        # outcore implementation
-        int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf)
-        size = 3*(k1-k0)*nao_cart*nao_cart
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart)
-        vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1])
-        vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip)
-
-        int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf)
-        rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart)
-        vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1])
-        vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
-        '''
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vj += vj_tmp
-        if with_k: vk += vk_tmp
-
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vjaux[:, k0:k1] = vj_tmp
-        if with_k: vkaux[:, k0:k1] = vk_tmp
-
-        rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
-        t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
-
+    with_df._cderi = None  # release GPU memory
+    vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
+                                        with_j=with_j, with_k=with_k, omega=omega)
+    
     # NOTE: vj and vk are still in cartesian
     _sorted_mol = intopt._sorted_mol
     natm = _sorted_mol.natm
@@ -260,6 +185,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
     _sorted_auxmol = intopt._sorted_auxmol
     natm = _sorted_auxmol.natm
+    naux_cart = _sorted_auxmol.nao
     aux2atom = np.zeros([naux_cart, natm])
     ao_loc = _sorted_auxmol.ao_loc
     for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
new file mode 100644
index 00000000..40ab3bfd
--- /dev/null
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -0,0 +1,443 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import itertools
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+import cupy
+from gpu4pyscf.df import int3c2e
+from gpu4pyscf.scf.int4c2e import libgint
+from gpu4pyscf.hessian.jk import _ao2mo
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
+from gpu4pyscf.__config__ import _streams, _num_devices
+
+NROOT_ON_GPU = 7
+
+def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
+                      with_j=True, with_k=True, hermi=0, device_id=0):
+    ''' Per-device worker: J and K contributions in the MO basis with the orbital
+        response mo1s (for CP-HF). Only hermi == 1 is accepted. Returns (vj_mo, vk_mo).
+    '''
+    assert hermi == 1
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
+        log = logger.new_logger(dfobj.mol, dfobj.verbose)
+        t0 = log.init_timer()
+        dms = cupy.asarray(dms)
+        n_dm = dms.shape[0]
+        mo1s = [cupy.asarray(mo1) for mo1 in mo1s]
+        occ_coeffs = [cupy.asarray(occ_coeff) for occ_coeff in occ_coeffs]
+        mo_coeff = [cupy.asarray(mo) for mo in mo_coeff]
+        nao = dms.shape[-1]
+        intopt = dfobj.intopt
+        rows = intopt.cderi_row
+        cols = intopt.cderi_col
+        dms_shape = dms.shape
+        if with_j:
+            dm_sparse = dms[:,rows,cols]
+            if hermi == 0:
+                dm_sparse += dms[:,cols,rows]
+            else:
+                dm_sparse *= 2
+            dm_sparse[:, intopt.cderi_diag] *= .5
+        dms = None  # only dms_shape and dm_sparse are used from here on
+
+        if with_k:
+            vks = [cupy.zeros_like(mo1) for mo1 in mo1s]
+
+        if with_j:
+            vj_sparse = cupy.zeros_like(dm_sparse)
+
+        nocc = max([mo1.shape[2] for mo1 in mo1s])
+        blksize = dfobj.get_blksize(extra=2*nao*nocc)
+        for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
+            if with_j:
+                rhoj = dm_sparse.dot(cderi_sparse)
+                vj_sparse += cupy.dot(rhoj, cderi_sparse.T)
+                rhoj = None
+            cderi_sparse = None
+            if with_k:
+                for occ_coeff, mo1, vk in zip(occ_coeffs, mo1s, vks):
+                    nocc = occ_coeff.shape[1]
+                    rhok = contract('Lij,jo->Loi', cderi, occ_coeff)
+                    rhok_oo = contract('Loi,ip->Lop', rhok, occ_coeff).reshape([-1,nocc])
+                    rhok = rhok.reshape([-1,nao])
+                    for i in range(mo1.shape[0]):
+                        rhok1 = contract('Lij,jo->Loi', cderi, mo1[i])
+                        rhok1 = rhok1.reshape([-1,nao])
+                        vk[i] += cupy.dot(rhok1.T, rhok_oo)
+
+                        rhok1 = rhok1.reshape([-1,nocc,nao])
+                        rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff)
+                        rhok1 = rhok1.reshape([-1,nocc])
+                        vk[i] += cupy.dot(rhok.T, rhok1)
+                mo1 = rhok1 = rhok = rhok_oo = None
+            cderi = None
+        mo1s = None
+        if with_j:
+            vj = cupy.zeros(dms_shape)
+            vj[:,rows,cols] = vj_sparse  # scatter the sparse result into both (row, col) orderings
+            vj[:,cols,rows] = vj_sparse
+
+        vj_mo = vk_mo = None
+        if len(occ_coeffs) == 1:
+            # Restricted case
+            mo = mo_coeff[0]
+            if with_j:
+                vj_mo = _ao2mo(vj, occ_coeffs[0], mo).reshape(n_dm,-1)
+                vj = None
+            mo = mo * 2.0 # Due to double occupancy; out-of-place so the array shared with the caller is not modified
+            if with_k:
+                vk_mo = contract('nio,ip->npo', vks[0], mo).reshape(n_dm,-1)
+        elif len(occ_coeffs) == 2:
+            # Unrestricted case
+            n_dm_2 = n_dm // 2
+            mocca, moccb = occ_coeffs
+            moa, mob = mo_coeff
+            nmoa, nmob = moa.shape[1], mob.shape[1]
+            nocca, noccb = mocca.shape[1], moccb.shape[1]
+
+            if with_j:
+                vjab = vj[:n_dm_2] + vj[n_dm_2:]
+                vj = None
+                vj_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vj_mo[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj_mo[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+                vjab = None
+
+            if with_k:
+                vka, vkb = vks
+                vk_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vk_mo[:,:nmoa*nocca] = contract('nio,ip->npo', vka, moa).reshape(n_dm_2,-1)
+                vk_mo[:,nmoa*nocca:] = contract('nio,ip->npo', vkb, mob).reshape(n_dm_2,-1)
+
+        t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0)
+    return vj_mo, vk_mo
+
+def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0,
+           with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
+    ''' Compute J/K in MO with density fitting, one worker thread per device.
+    dms_tag must carry the attributes occ_coeff and mo1; mocc is not used here.
+    Returns (vj, vk); an entry is None when with_j / with_k is False.
+    '''
+    assert(with_j or with_k)
+    if dms_tag is None:
+        raise ValueError("dm is not given")
+    log = logger.new_logger(dfobj.mol, dfobj.verbose)
+    # Read the tags before any conversion; a freshly converted array may not carry them.
+    occ_coeffs = dms_tag.occ_coeff
+    mo1s = dms_tag.mo1
+    if not isinstance(dms_tag, cupy.ndarray):
+        dms_tag = cupy.asarray(dms_tag)
+
+    nao = dms_tag.shape[-1]
+    t1 = t0 = log.init_timer()
+    if dfobj._cderi is None:
+        log.debug('Build CDERI ...')
+        dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
+        t1 = log.timer_debug1('init jk', *t0)
+
+    assert nao == dfobj.nao
+    intopt = dfobj.intopt
+    # Flatten to a stack of (nao, nao) matrices and reorder them with intopt.sort_orbitals.
+    dms = dms_tag.reshape([-1,nao,nao])
+    dms = intopt.sort_orbitals(dms, axis=[1,2])
+    cupy.cuda.get_current_stream().synchronize()
+    # A single (non-list) occ_coeff is wrapped so both cases share the code below.
+    if not isinstance(occ_coeffs, (tuple, list)):
+        occ_coeffs = [occ_coeffs]
+        mo1s = [mo1s]
+        mo_coeff = [mo_coeff]
+    else:
+        assert isinstance(mo1s, (tuple, list))
+        mo_coeff = [mo_coeff[0], mo_coeff[1]]
+
+    occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs]
+    mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
+    mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff]
+
+    futures = []
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _jk_task_with_mo1,
+                dfobj, dms, mo_coeff, mo1s, occ_coeffs,
+                hermi=hermi, device_id=device_id,
+                with_j=with_j, with_k=with_k)
+            futures.append(future)
+
+    vj = vk = None
+    if with_j:
+        vj = [future.result()[0] for future in futures]
+        vj = reduce_to_device(vj, inplace=True)
+
+    if with_k:
+        vk = [future.result()[1] for future in futures]
+        vk = reduce_to_device(vk, inplace=True)
+    t1 = log.timer_debug1('vj and vk', *t1)
+    return vj, vk
+
+
+def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, stream=None):
+    '''Return the int3c2e_<ip_type> block for AO-pair block cp_ij_id and aux block aux_id as a cupy array.'''
+    if omega is None: omega = 0.0
+    if stream is None: stream = cupy.cuda.get_current_stream()
+
+    fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type)
+    nao = intopt._sorted_mol.nao
+    naux = intopt._sorted_auxmol.nao
+    norb = nao + naux + 1  # presumably one extra slot at index nao between AO and aux functions (see ao_offsets) - confirm
+    comp = 9  # leading dimension of the output block allocated below
+    order = 2  # enters the root-count test that selects the GPU or host path
+    nbins = 1
+
+    cp_kl_id = aux_id + len(intopt.log_qs)  # aux block index shifted by the number of AO-pair blocks
+    lk = intopt.aux_angular[aux_id]
+
+    cpi = intopt.cp_idx[cp_ij_id]
+    cpj = intopt.cp_jdx[cp_ij_id]
+    li = intopt.angular[cpi]
+    lj = intopt.angular[cpj]
+
+    i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+    j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+    k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1]
+    ni = i1 - i0
+    nj = j1 - j0
+    nk = k1 - k0
+
+    log_q_ij = intopt.log_qs[cp_ij_id]
+    log_q_kl = intopt.aux_log_qs[aux_id]
+
+    bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)  # a single bin covering all entries
+    bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32)
+
+    ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32)
+    strides = np.array([1, ni, ni*nj, ni*nj*nk], dtype=np.int32)  # i runs fastest, matching the [comp, nk, nj, ni] C-order block
+
+    # Use GPU kernels for low-angular momentum
+    if (li + lj + lk + order)//2 + 1 < NROOT_ON_GPU:
+        int3c_blk = cupy.zeros([comp, nk, nj, ni], order='C', dtype=np.float64)
+        err = fn(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(norb),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_kl.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.c_int(cp_kl_id),
+            ctypes.c_double(omega))
+        if err != 0:
+            raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}')
+    else:
+        from pyscf.gto.moleintor import getints, make_cintopt
+        pmol = intopt._tot_mol
+        intor = pmol._add_suffix('int3c2e_' + ip_type)
+        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+        # NOTE(review): omega is not referenced in this host branch - confirm range separation is carried by pmol
+        # TODO: sph2cart in CPU?
+        ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
+        jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
+        kshl0, kshl1 = intopt.l_ctr_offsets[aux_id+1+intopt.nctr], intopt.l_ctr_offsets[aux_id+1+intopt.nctr+1]
+        shls_slice = np.array([ishl0, ishl1, jshl0, jshl1, kshl0, kshl1], dtype=np.int64)
+        int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1])
+        int3c_blk = cupy.asarray(int3c_cpu)
+    # Apply cart2sph along the aux axis (1) and the AO axes (2, 3) for non-Cartesian basis sets.
+    if not intopt.auxmol.cart:
+        int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
+    if not intopt.mol.cart:
+        int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
+        int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
+
+    return int3c_blk
+
+
+def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
+                        device_id=0, with_j=True, with_k=True, omega=None,
+                        auxbasis_response=1):
+    natm = intopt.mol.natm
+    nao = dm0.shape[0]
+    assert with_j or with_k
+    ao_loc = intopt.ao_loc
+    aux_ao_loc = intopt.aux_ao_loc
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
+        orbo = cupy.asarray(orbo)
+        dm0 = cupy.asarray(dm0)
+        nao = dm0.shape[0]
+        if with_j:
+            naux = rhoj.shape[0]
+            rhoj = cupy.asarray(rhoj)
+            hj_ipip1 = cupy.zeros([9,nao])
+            hj_ipip2 = cupy.zeros([9,naux])
+            hj_ip1ip2 = cupy.zeros([9,nao,naux])
+            hj_ipvip1 = cupy.zeros([9,nao,nao])
+        if with_k:
+            naux = rhok.shape[0]
+            rhok = cupy.asarray(rhok)
+            hk_ipip1 = cupy.zeros([9,nao])
+            hk_ipip2 = cupy.zeros([9,naux])
+            hk_ip1ip2 = cupy.zeros([9,nao,naux])
+            hk_ipvip1 = cupy.zeros([9,nao,nao])
+
+        cupy.get_default_memory_pool().free_all_blocks()
+        for aux_id, cp_ij_id in task_list:
+            cpi = intopt.cp_idx[cp_ij_id]
+            cpj = intopt.cp_jdx[cp_ij_id]
+            i0, i1 = ao_loc[cpi], ao_loc[cpi+1]
+            j0, j1 = ao_loc[cpj], ao_loc[cpj+1]
+            k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
+
+            if with_k:
+                rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1])
+                rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1])
+            
+            # (20|0), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
+            if with_j:
+                tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
+                hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1])
+            if with_k:
+                hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
+            int3c_blk = tmp = None
+
+            # (11|0), (0|0)(0|00) without response of RI basis
+            int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
+            if with_j:
+                tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
+                hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1])
+            if with_k:
+                hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
+            int3c_blk = tmp = None
+
+            if auxbasis_response < 1:
+                continue
+
+            # (10|1), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega)
+            if with_j:
+                tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp)
+            int3c_blk = tmp = None
+
+            if auxbasis_response < 2:
+                continue
+
+            # (00|2), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega)
+            if with_j:
+                tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp)
+            int3c_blk = tmp = None
+        auxslices = intopt.auxmol.aoslice_by_atom()
+        aoslices = intopt.mol.aoslice_by_atom()
+        ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
+        aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
+
+        hj = None
+        if with_j:
+            hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao])
+            tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1)
+            hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp)
+
+            hj_ipip1 = hj_ipip1.reshape([3,3,nao])
+            tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1)
+            hj[range(natm), range(natm)] += 2.0 * tmp
+
+        hk = None
+        if with_k:
+            hk_ipvip1 = hk_ipvip1.reshape([3,3,nao,nao])
+            tmp = contract('ia,xyij->ajxy', ao2atom, hk_ipvip1)
+            hk = contract('jb,ajxy->abxy', ao2atom, tmp)
+
+            hk_ipip1 = hk_ipip1.reshape([3,3,nao])
+            tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1)
+            hk[range(natm), range(natm)] += tmp
+
+        if auxbasis_response > 0:
+            if with_j:
+                hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux])
+                tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2)
+                tmp = contract('jb,ajxy->abxy',aux2atom, tmp)
+                tmp = tmp + tmp.transpose([1,0,3,2])
+                hj += tmp
+                if auxbasis_response > 1:
+                    hj += tmp
+            if with_k:
+                hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux])
+                tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2)
+                tmp = contract('jb,ajxy->abxy', aux2atom, tmp)
+                tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2]))
+                hk += tmp
+                if auxbasis_response > 1:
+                    hk += tmp
+
+        if auxbasis_response > 1:
+            if with_j:
+                hj_ipip2 = hj_ipip2.reshape([3,3,naux])
+                tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2)
+                hj[range(natm), range(natm)] += tmp
+            if with_k:
+                hk_ipip2 = hk_ipip2.reshape([3,3,naux])
+                tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2)
+                hk[range(natm), range(natm)] += .5 * tmp
+        t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0)
+    return hj, hk
+
+def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True,
+                    omega=None, auxbasis_response=1):
+    '''Evaluate the int3c2e_ipip terms (hj, hk) using every device.
+
+    All (aux block, AO-pair block) index pairs are dealt round-robin into
+    _num_devices task lists; one worker thread per list runs
+    _int3c2e_ipip_tasks, and the per-worker outputs are handed to
+    reduce_to_device.  Returns (hj, hk); an entry is None when the
+    matching with_j / with_k flag is False.
+    '''
+    orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
+    n_aux_blocks = len(intopt.aux_log_qs)
+    n_pair_blocks = len(intopt.log_qs)
+    all_tasks = np.array(
+        list(itertools.product(range(n_aux_blocks), range(n_pair_blocks))))
+    # Worker d takes tasks d, d + _num_devices, d + 2*_num_devices, ...
+    per_device = [all_tasks[d::_num_devices] for d in range(_num_devices)]
+
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as pool:
+        pending = [
+            pool.submit(
+                _int3c2e_ipip_tasks, intopt, per_device[d], rhoj, rhok,
+                dm0_tag, orbo, with_j=with_j, with_k=with_k, device_id=d,
+                omega=omega, auxbasis_response=auxbasis_response)
+            for d in range(_num_devices)]
+
+    # Collect in submission order; result() re-raises a worker's exception.
+    partial = [job.result() for job in pending]
+    hj_parts = [hj_d for hj_d, _ in partial]
+    hk_parts = [hk_d for _, hk_d in partial]
+    hj = reduce_to_device(hj_parts, inplace=True) if with_j else None
+    hk = reduce_to_device(hk_parts, inplace=True) if with_k else None
+    return hj, hk
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index aaf1c16e..2eab8ef5 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -30,15 +30,15 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.lib.cupy_helper import (
-    contract, tag_array, get_avail_mem, release_gpu_stack, pinv)
+    contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array)
 from gpu4pyscf.df import int3c2e, df
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
 from gpu4pyscf.df.grad.rhf import _gen_metric_solver
-from gpu4pyscf.gto.mole import sort_atoms
+from gpu4pyscf.df.hessian import jk
 
 LINEAR_DEP_THR = df.LINEAR_DEP_THR
-BLKSIZE = 128
+BLKSIZE = 256
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
 GB = 1024*1024*1024
 
@@ -53,11 +53,13 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
     '''
     nnz = rhok1_Pko.shape[0]
     nao = dm0.shape[0]
+    hk_ao_ao = cupy.zeros([nao,nao,3,3])
+    cupy.get_default_memory_pool().free_all_blocks()
     mem_avail = get_avail_mem()
     blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED
-    hk_ao_ao = cupy.zeros([nao,nao,3,3])
     for k0, k1 in lib.prange(0,nnz,blksize):
-        rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
+        #rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
+        rhok1_Pko_kslice = copy_array(rhok1_Pko[k0:k1])
 
         # (10|0)(0|10) without response of RI basis
         vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice)
@@ -67,12 +69,11 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
         # (10|0)(0|01) without response of RI basis
         rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2)
         hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice)
-        rhok1_Pkl_kslice = None
+        rhok1_Pkl_kslice = rhok1_Pko_kslice = None
     return hk_ao_ao
 
-
-def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
-                      atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
+def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None,
+                      max_memory=None, verbose=None, with_j=True, with_k=True, omega=None):
     '''Partial derivative
     '''
     log = logger.new_logger(hessobj, verbose)
@@ -110,7 +111,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
     naux = auxmol.nao
     mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0])
     dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
@@ -119,55 +121,66 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     int2c = cupy.asarray(int2c, order='C')
     int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
     solve_j2c = _gen_metric_solver(int2c)
-
-    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
-    int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
-
-    hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    hk_ao_ao = cupy.zeros([nao,nao,3,3])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        hk_ao_aux = cupy.zeros([nao,naux,3,3])
-
+    
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
+    rhoj0_P = rhok0_P__ = None
+
+    if with_j:
+        rhoj0_P = solve_j2c(wj)
+        wj = None
+    if with_k:
+        rhok0_P__ = solve_j2c(wk_P__)
+        wk_P__ = None
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
-    rhoj0_P = solve_j2c(wj)
-    rhok0_P__ = solve_j2c(wk_P__)
-    wj = wk_P__ = None
+
+    hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
+                                          with_j=with_j, with_k=with_k, omega=omega,
+                                          auxbasis_response=hessobj.auxbasis_response)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
     # int3c_ip2 contributions
     wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1)
 
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
+    
+    cupy.get_default_memory_pool().free_all_blocks()
+    release_gpu_stack()
+
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
-    rhoj1_P = solve_j2c(wj1_P)
+    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+    int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-    hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
-    wj1_P = None
-    if hessobj.auxbasis_response:
-        wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
-        wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-        hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
-        hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
-        hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
-        wj1_01 = None
-    rhoj1_P = None
+    if with_j:
+        rhoj1_P = solve_j2c(wj1_P)
+        hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        wj1_P = None
+        if hessobj.auxbasis_response:
+            wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
+            wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
+            hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
+            hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
+            wj1_01 = None
+        rhoj1_P = None
 
     if with_k:
         cupy.get_default_memory_pool().free_all_blocks()
         mem_avail = get_avail_mem()
         nocc = mocc.shape[1]
         slice_size = naux*nocc*9   # largest slice of intermediate variables
-        blksize = int(mem_avail*0.2/8/slice_size/ALIGNED) * ALIGNED
+        blksize = int(mem_avail*0.2/8/slice_size)
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
-        if blksize < ALIGNED:
-            raise RuntimeError('Not enough memory for intermediate variables')
-
+        assert blksize > 0
+        if hessobj.auxbasis_response:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1])
+            #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1])
+            wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1])
+
             #rhok1_Pko = contract('pq,qiox->piox', int2c_inv, wk1_Pko_islice)
             rhok1_Pko = solve_j2c(wk1_Pko_islice)
             wk1_Pko_islice = None
@@ -188,6 +201,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                 hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I)
                 wk1_I = rhok0_P_I = None
         rhok1_Pko = None
+        t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1)
+        
+        rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
+        rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
+        rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
+        rhok0_P__ = wk_ip2_P__ = None
 
         w, v = cupy.linalg.eigh(int2c)
         idx = w > LINEAR_DEP_THR
@@ -197,55 +216,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
         rhok1_Pko = wk1_Pko[:nnz]  # Reuse the same memory
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1])
+            #wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1])
+            wk1_tmp = copy_array(wk1_Pko[:,i0:i1])
             if isinstance(rhok1_Pko, cupy.ndarray):
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp)
             else:
-                rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
+                #rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
+                wk1_tmp = contract('qp,qiox->piox', cd_low, wk1_tmp)
+                copy_array(wk1_tmp, rhok1_Pko[:,i0:i1])
             wk1_tmp = None
         cd_low = None
-        
-        hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
+        hk_ao_ao = _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
-
-    cupy.get_default_memory_pool().free_all_blocks()
-    #  int3c_ipip1 contributions
-    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_hjk(intopt, 'ipip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-    hj_ao_diag *= 2.0
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
-
-    #  int3c_ipvip1 contributions
-    # (11|0), (0|00) without response of RI basis
-    hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipvip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-    hj_ao_ao += 2.0*hj
-    if with_k:
-        hk_ao_ao += hk
-    hj = hk = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
+    solve_j2c = None
+    t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1)
 
-    #  int3c_ip1ip2 contributions
-    # (10|1), (0|0)(0|00)
-    if hessobj.auxbasis_response:
-        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ip1ip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-        hj_ao_aux += hj
-        if with_k:
-            hk_ao_aux += hk
-        hj = hk = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
-
-    #  int3c_ipip2 contributions
-    if hessobj.auxbasis_response > 1:
-        # (00|2), (0|0)(0|00)
-        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-        hj_aux_diag = hj
-        if with_k:
-            hk_aux_diag = .5*hk
-        hj = hk = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
-    
     # int2c contributions
     if hessobj.auxbasis_response > 1:
+        cupy.get_default_memory_pool().free_all_blocks()
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
@@ -253,13 +241,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
         int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
-        rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+
         # (00|0)(2|0)(0|00)
         # p,xp->px
-        hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        if with_j:
+            rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+            hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
-            rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
-            hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
+            hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
         if omega and omega > 1e-10:
@@ -269,41 +258,34 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
         int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
-        hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
+        if with_j:
+            hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
         int2c_ip1ip2 = None
 
-    cupy.get_default_memory_pool().free_all_blocks()
-    release_gpu_stack()
-    # aux-aux pair
-    if hessobj.auxbasis_response > 1:
+        # aux-aux pair
         int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR)
-        wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
         int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
-
-        rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
-        hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
-        rhoj0_10 = rhoj0_P = None
-
-        rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
-        hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
-        hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
-        wj0_10 = rhoj1 = wj_ip2 = None
-
-        rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
-        hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
-        wj0_01 = rhoj0_01 = None
+        if with_j:
+            wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
+            rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
+            hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
+            rhoj0_10 = rhoj0_P = None
+
+            rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
+            hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
+            hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
+            wj0_10 = rhoj1 = wj_ip2 = None
+
+            rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
+            hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
+            wj0_01 = rhoj0_01 = None
 
         if with_k:
-            rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
-            rhok0_P__ = None
-
-            rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
-            wk_ip2_P__ = None
             hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_11 = None
 
@@ -327,26 +309,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     t1 = log.timer_debug1('contract int2c_*', *t1)
 
     dm0 = intopt.unsort_orbitals(dm0, axis=[0,1])
-    hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
-    hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
-    if hessobj.auxbasis_response > 1:
-        hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
-        hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
+    if with_j:
+        hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
+        if hessobj.auxbasis_response:
+            hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
+        if hessobj.auxbasis_response > 1:
+            hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
-        hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
             hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
         if hessobj.auxbasis_response > 1:
-            hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0])
             hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1])
     #======================================== sort AO end ===========================================
     # Energy weighted density matrix
     # pi,qi,i->pq
     dme0 = cupy.dot(mocc, (mocc * mo_energy[mo_occ>0] * 2).T)
     de_hcore = rhf_hess._e_hcore_generator(hessobj, dm0)
+    t1 = log.timer_debug1('hcore generate', *t1)
 
     # ------------------------------------
     #      overlap matrix contributions
@@ -360,19 +340,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # -----------------------------------------
     #        collecting all
     # -----------------------------------------
-    e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ej = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ek = cupy.zeros([len(atmlst),len(atmlst),3,3])
+    natm = len(atmlst)
+    e1 = cupy.zeros([natm,natm,3,3])
+    ej = hj_ipip
+    ek = hk_ipip
+
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
-        ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0)
-        if with_k:
-            ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
-            ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1])
+            if with_j:
+                ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             if with_k:
                 ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] += de_hcore(ia, ja)
@@ -381,13 +361,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response:
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
-                if hessobj.auxbasis_response > 1:
-                    ej[i0,j0] += _ej * 2
-                    ej[j0,i0] += _ej.T * 2
-                else:
-                    ej[i0,j0] += _ej
-                    ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
+                    if hessobj.auxbasis_response > 1:
+                        ej[i0,j0] += _ej * 2
+                        ej[j0,i0] += _ej.T * 2
+                    else:
+                        ej[i0,j0] += _ej
+                        ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1])
                     if hessobj.auxbasis_response > 1:
@@ -401,13 +382,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response > 1:
             shl0, shl1, p0, p1 = auxslices[ia]
-            ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
-            if with_k:
-                ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0)
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
-                ej[i0,j0] += _ej
-                ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
+                    ej[i0,j0] += _ej
+                    ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek * .5
@@ -415,9 +394,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
-            ej[j0,i0] = ej[i0,j0].T
-            ek[j0,i0] = ek[i0,j0].T
+            if with_j:
+                ej[j0,i0] = ej[i0,j0].T
+            if with_k:
+                ek[j0,i0] = ek[i0,j0].T
     t1 = log.timer_debug1('hcore contribution', *t1)
+
+    aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
+
+    natm = mol.natm
+    idx = range(natm)
+    # Diagonal contributions
+    if hessobj.auxbasis_response > 1:
+        if with_j:
+            ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag)
+        if with_k:
+            ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag)
+
     log.timer('RHF partial hessian', *time0)
     return e1, ej, ek
 
@@ -425,19 +418,26 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+    '''Build h1mo = grad_hcore + vj - 0.5 * vk from the J/K derivatives.
+
+    vj and vk come from _get_jk_ip; the sum is accumulated with in-place
+    operators on the array returned as vk, which is also the return value.
+    Only the full atom list is supported: atmlst must be None or hold
+    0..natm-1 in order (any sequence type, not just a range object).
+    '''
     mol = hessobj.mol
     natm = mol.natm
-    nocc = int(cupy.count_nonzero(mo_occ > 0))
-    nmo = len(mo_occ)
-    h1ao = cupy.empty((natm, 3, nmo, nocc))
-    for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                    atmlst, verbose, True):
-        h1 += vj1 - vk1 * .5
-        h1ao[ia] = h1
-    return h1ao
-
-def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
-            verbose=None, with_k=True, omega=None):
+    assert atmlst is None or list(atmlst) == list(range(natm))
+    vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True)
+    # h1mo = h1 + vj - 0.5 * vk
+    h1mo = vk
+    h1mo *= -.5
+    h1mo += vj
+    h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
+    return h1mo
+
+def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
+            verbose=None, with_j=True, with_k=True, omega=None):
     '''
-    A generator to produce the derivatives of Hcore, J, K matrices in MO bases
+    Derivatives of J, K matrices in MO bases
     '''
     log = logger.new_logger(hessobj, verbose)
     t0 = log.init_timer()
@@ -447,8 +440,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mol = hessobj.mol
     if atmlst is None:
         atmlst = range(mol.natm)
-    # FIXME
-    with_k = True
+
     mo_coeff = cupy.asarray(mo_coeff, order='C')
     mo_occ = cupy.asarray(mo_occ, order='C')
 
@@ -475,7 +467,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     intopt.build(mf.direct_scf_tol,
                  diag_block_with_triu=True,
-                 aosym=False,
+                 aosym=False, verbose=0,
                  group_size_aux=BLKSIZE,
                  group_size=BLKSIZE)
     naux = auxmol.nao
@@ -484,26 +476,31 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0])
     dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
     dm0_tag = tag_array(dm0, occ_coeff=mocc)
-    
+
     int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
     solve_j2c = _gen_metric_solver(int2c)
-    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
-    rhoj0 = solve_j2c(wj)
+    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag,
+                                         with_j=with_j, with_k=True, omega=omega)
+    rhoj0 = None
+    if with_j:
+        rhoj0 = solve_j2c(wj)
+        wj = None
 
-    wj = None
     if isinstance(wk_Pl_, cupy.ndarray):
         rhok0_Pl_ = solve_j2c(wk_Pl_)
     else:
-        #rhok0_Pl_ = np.empty_like(wk_Pl_)
-        #mem = cupy.cuda.alloc_pinned_memory(wk_Pl_.nbytes)
-        #rhok0_Pl_ = np.ndarray(wk_Pl_.shape, dtype=np.float64, order='C', buffer=mem)
         rhok0_Pl_ = wk_Pl_ # reuse the memory
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
-            rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            #wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
+            #rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wk_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0_Pl_[:,p0:p1])
         wk_tmp = None
-    wk_Pl_ = solve_j2c = None
+    wk_Pl_ = None
+    solve_j2c = None
     t0 = log.timer_debug1('Fock matrix due to int3c2e', *t0)
+    vj1_int3c = vk1_int3c = None
 
     # --------------------------
     #  int3c_ip2 contribution
@@ -511,8 +508,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     cupy.get_default_memory_pool().free_all_blocks()
     if hessobj.auxbasis_response:
         fn = int3c2e.get_int3c2e_ip2_vjk
-        vj1_int3c_ip2, vk1_int3c_ip2 = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, omega=omega)
-        vk1_int3c_ip2 *= 2.0
+        vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices,
+                                  with_j=with_j, with_k=with_k, omega=omega)
+        t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
+
         # Responses due to int2c2e_ip1
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
@@ -522,64 +521,77 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
         int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-        # Generate rhok0_P__
-        if isinstance(rhok0_Pl_, cupy.ndarray):
-            rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc)
-        else:
-            rhok0_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
-                rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc)
-            rhok0_Pl_tmp = None
-
-        wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
-        wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
+        if with_j:
+            wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
+        if with_k:
+            # Generate rhok0_P__
+            if isinstance(rhok0_Pl_, cupy.ndarray):
+                rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc)
+            else:
+                rhok0_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0_Pl_[p0:p1])
+                    rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc)
+                rhok0_Pl_tmp = None
+            wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
 
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
         mem_avail = get_avail_mem()
         blksize = int(0.2*mem_avail/(3*naux*nocc*8)/ALIGNED) * ALIGNED
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} AOs per block')
         if blksize < ALIGNED:
-            raise RuntimeError('Not enough memory to compute int3c2e_ip2')
+            raise RuntimeError('Not enough memory to compute int2c2e_ip2')
 
         for p0, p1 in lib.prange(0,nao,blksize):
-            rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
-            vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
-
+            #rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
+            rhok_tmp = copy_array(rhok0_Pl_[:,p0:p1])
             wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp)
-            vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
-            vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
-            vj1_tmp = None
+            if with_j:
+                vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
+                vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
+                vj1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
+                vj1_tmp = None
             if with_k:
                 vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
                 vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
-                # 2.0 due to spin
-                vk1_int3c_ip2[:,:,p0:p1] += 2.0*contract('xpio,pa->axio', vk1_tmp, aux2atom)
+                vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom)
                 vk1_tmp = None
             wk0_10_Pl_ = rhok_tmp = None
         wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
         aux2atom = None
-
-        vj1_int3c_ip2 = contract('nxiq,ip->nxpq', vj1_int3c_ip2, mo_coeff)
-        vk1_int3c_ip2 = contract('nxiq,ip->nxpq', vk1_int3c_ip2, mo_coeff)
-        t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
+        t0 = log.timer_debug1('Fock matrix due to int2c2e_ip1', *t0)
 
     # -----------------------------
     # int3c_ip1 contributions
     # ------------------------------
     cupy.get_default_memory_pool().free_all_blocks()
     fn = int3c2e.get_int3c2e_ip1_vjk
-    vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega)
-    rhoj0 = rhok0_Pl_ = None
-    vk1_ao *= 2.0
-    vk1_buf *= 2.0
-    
-    vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
-    vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2])
+    vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices,
+                                          omega=omega, with_j=with_j, with_k=with_k)
+    rhoj0 = rhok0_Pl_ = dm0_tag = None
+    if with_j:
+        vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+        if vj1_int3c is None:
+            vj1_int3c = -vj1_ao
+        else:
+            vj1_int3c -= vj1_ao
+        vj1_ao = None
+        # NOTE: vj1_int3c and vk1_int3c are in [natm,3,nao,nocc]
+        #       axis=2 in AO, axis=3 in MO
+        #       convert axis=2 into MO now
+        vj1_int3c = contract('nxiq,ip->nxpq', vj1_int3c, mo_coeff)
 
-    vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
-    vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
-    vj1_ao = vk1_ao = None
+    if with_k:
+        vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2])
+        if vk1_int3c is None:
+            vk1_int3c = -vk1_ao
+        else:
+            vk1_int3c -= vk1_ao
+        vk1_ao = None
+        # * 2.0 due to the contraction with mocc
+        vk1_buf *= 2.0
+        vk1_int3c = 2.0 * contract('nxiq,ip->nxpq', vk1_int3c, mo_coeff)
     t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
 
     mocc = intopt.unsort_orbitals(mocc, axis=[0])
@@ -591,40 +603,48 @@ def _ao2mo(mat):
         tmp = contract('xij,jo->xio', mat, mocc)
         return contract('xik,ip->xpk', tmp, mo_coeff)
 
-    vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2
-    vj1_int3c_ip1 = vj1_int3c_ip2 = None
-    if with_k:
-        vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
-        vk1_int3c_ip1 = vk1_int3c_ip2 = None
-
-    grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
     cupy.get_default_memory_pool().free_all_blocks()
-    vk1 = None
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
-        vj1_ao = cupy.zeros([3,nao,nao])
-        vk1_ao = cupy.zeros([3,nao,nao])
-
-        vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
-        vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+        if with_j:
+            vj1_ao = cupy.zeros([3,nao,nao])
+            vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
+            vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+            vj1_int3c[ia] += _ao2mo(vj1_ao)
         if with_k:
+            vk1_ao = cupy.zeros([3,nao,nao])
             vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
+            vk1_int3c[ia] += _ao2mo(vk1_ao)
+    return vj1_int3c, vk1_int3c
+
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc,
+               hermi=1, with_j=True, with_k=True, omega=None):
+    '''Call jk.get_jk with the DF object of hessobj.base (omega=None) or with
+    a reset copy of it that is cached per omega in its _rsh_df dict and used
+    inside that copy's mol.with_range_coulomb(omega) context.
+    '''
+    with_df = hessobj.base.with_df
+    flags = dict(hermi=hermi, with_j=with_j, with_k=with_k)
+    if omega is None:
+        return jk.get_jk(with_df, dms, mo_coeff, mocc, **flags)
+
+    # A temporary treatment for RSH-DF integrals
+    tag, rsh_cache = '%.6f' % omega, with_df._rsh_df
+    if tag not in rsh_cache:
+        rsh_cache[tag] = with_df.copy().reset()
+        logger.info(with_df, 'Create RSH-DF object %s for omega=%s', rsh_cache[tag], omega)
+    rsh_df = rsh_cache[tag]
+    with rsh_df.mol.with_range_coulomb(omega):
+        return jk.get_jk(rsh_df, dms, mo_coeff, mocc, omega=omega, **flags)
 
-        h1 =  grad_hcore[i0]
-        vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
-        if with_k:
-            vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
-        yield ia, h1, vj1, vk1
 
 class Hessian(rhf_hess.Hessian):
     '''Non-relativistic restricted Hartree-Fock hessian'''
 
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = rhf_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
+    get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 3643d8ad..e0d5cd90 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -23,9 +23,11 @@
 import numpy
 import cupy
 from pyscf import lib
+from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import rks as rks_hess
 from gpu4pyscf.df.hessian import rhf as df_rhf_hess
+from gpu4pyscf.df.hessian.rhf import _get_jk_ip, _partial_hess_ejk
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract
 
@@ -49,17 +51,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
-    de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                                atmlst, max_memory, verbose,
-                                                with_k=with_k)
+    de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                    atmlst, max_memory, verbose,
+                                    with_j=True, with_k=with_k)
     de2 += ej  # (A,B,dR_A,dR_B)
     if with_k:
         de2 -= hyb * ek
 
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        ek_lr = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                            atmlst, max_memory, verbose,
-                                            True, omega=omega)[2]
+        ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                  atmlst, max_memory, verbose,
+                                  with_j=False, with_k=True, omega=omega)[2]
         de2 -= (alpha - hyb) * ek_lr
 
     max_memory = None
@@ -84,33 +86,49 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+    '''Assemble the first-order Hamiltonian h1 for DF-RKS.
+
+    Sums the J/K derivative terms returned by _get_jk_ip (K weighted by
+    .5*hyb, and the omega-dependent K by .5*(alpha-hyb) when both omega and
+    alpha-hyb are non-negligible), the core-Hamiltonian gradient and the
+    XC derivative.
+
+    atmlst must be None or enumerate every atom 0..natm-1 in order; any
+    iterable (list, tuple, range) is accepted. chkfile and verbose are
+    forwarded to _get_jk_ip.
+    '''
     mol = hessobj.mol
+    natm = mol.natm
+    # Compare element-wise: a list such as [0, 1, 2] never equals range(3).
+    assert atmlst is None or list(atmlst) == list(range(natm))
     mf = hessobj.base
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                          atmlst, verbose, with_j=True, with_k=with_k)
+    h1mo = vj1
+    if with_k:
+        h1mo -= .5 * hyb * vk1
+    vj1 = vk1 = None
 
-    for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, with_k):
-        h1mo[ia] += h1 + vj1
-        if with_k:
-            h1mo[ia] -= .5 * hyb * vk1
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, True, omega=omega):
-            h1mo[ia] -= .5 * (alpha - hyb) * vk1_lr
+        _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst,
+                               verbose, with_j=False, with_k=True, omega=omega)
+        h1mo -= .5 * (alpha - hyb) * vk1_lr
+        vk1_lr = None
+
+    h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
+    h1mo += rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
     return h1mo
 
 class Hessian(rks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = rks_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
+    get_jk_mo = df_rhf_hess._get_jk_mo
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
new file mode 100644
index 00000000..a3e13260
--- /dev/null
+++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
@@ -0,0 +1,155 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+import numpy
+import cupy
+from pyscf import gto, scf
+from pyscf.df.hessian import rhf as df_rhf_cpu
+from pyscf.hessian import rhf as rhf_cpu
+from gpu4pyscf.df.hessian import rhf as df_rhf_gpu
+from gpu4pyscf.hessian import rhf as rhf_gpu
+
+def setUpModule():
+    '''Create the shared H2O/STO-3G molecule; its log output goes to /dev/null.'''
+    global mol
+    mol = gto.Mole()
+    mol.verbose = 1
+    mol.output = '/dev/null'
+    mol.atom.extend([
+        ["O" , (0. , 0.     , 0.)],
+        [1   , (0. , -0.757 , 0.587)],
+        [1   , (0. , 0.757  , 0.587)] ])
+    mol.basis = 'sto3g'
+    mol.build()
+
+def tearDownModule():
+    '''Close the molecule's output stream and drop the module-level global.'''
+    global mol
+    mol.stdout.close()
+    del mol
+
+class KnownValues(unittest.TestCase):
+    '''Compare DF-RHF Hessian quantities from the CPU path with mf.to_gpu().'''
+
+    def test_gen_vind(self):
+        '''hessobj.gen_vind on the GPU matches rhf_cpu.gen_vind on a trial mo1.'''
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        mo_coeff = mf.mo_coeff
+        mo_occ = mf.mo_occ
+
+        nmo = mo_coeff.shape[1]
+        mocc = mo_coeff[:,mo_occ>0]
+        nocc = mocc.shape[1]
+
+        fx_cpu = rhf_cpu.gen_vind(mf, mo_coeff, mo_occ)
+        # Fixed seed so that a failing comparison can be reproduced.
+        mo1 = numpy.random.default_rng(0).random((100, nmo*nocc))
+        v1vo_cpu = fx_cpu(mo1).reshape(-1,nmo*nocc)
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ)
+        mo1 = cupy.asarray(mo1)
+        v1vo_gpu = fx_gpu(mo1)
+        assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8
+
+    def test_partial_hess_elec(self):
+        '''_partial_hess_ejk (e1, ej, ek) agrees for auxbasis_response = 2.'''
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        e1_cpu, ej_cpu, ek_cpu = df_rhf_cpu._partial_hess_ejk(hobj)
+
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        e1_gpu, ej_gpu, ek_gpu = df_rhf_gpu._partial_hess_ejk(hobj)
+        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-5
+        assert numpy.linalg.norm(ej_cpu - ej_gpu.get()) < 1e-5
+        assert numpy.linalg.norm(ek_cpu - ek_gpu.get()) < 1e-5
+
+    def test_make_h1(self):
+        '''make_h1 and the mo_e1 returned by solve_mo1 agree (auxbasis_response = 1).'''
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        mo_energy = mf.mo_energy
+        mo_coeff = mf.mo_coeff
+        mo_occ = mf.mo_occ
+        mocc = mo_coeff[:,mo_occ>0]
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 1
+        h1_cpu = df_rhf_cpu.make_h1(hobj, mo_coeff, mo_occ)
+        mo1_cpu, mo_e1_cpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_cpu, verbose=1)
+        h1_cpu = numpy.asarray(h1_cpu)
+        h1_cpu = numpy.einsum('xypq,pi,qj->xyij', h1_cpu, mo_coeff, mocc)
+
+        mf = mf.to_gpu()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 1
+        mo_occ = cupy.asarray(mo_occ)
+        h1_gpu = df_rhf_gpu.make_h1(hobj, mo_coeff, mo_occ)
+        h1_gpu = cupy.asarray(h1_gpu)
+        mo_energy = cupy.asarray(mo_energy)
+        mo_coeff = cupy.asarray(mo_coeff)
+        fx = hobj.gen_vind(mo_coeff, mo_occ)
+        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_gpu, fx, verbose=1)
+        assert numpy.linalg.norm(h1_cpu - h1_gpu.get()) < 1e-5
+        assert numpy.linalg.norm((mo_e1_cpu - mo_e1_gpu)) < 1e-4
+
+    def test_df_rhf_hess_elec(self):
+        '''hess_elec agrees between CPU and GPU for auxbasis_response = 2.'''
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.hess_elec()
+
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.hess_elec()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5
+
+    def test_df_rhf_hessian(self):
+        '''The Hessian returned by kernel() agrees for auxbasis_response = 2.'''
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.kernel()
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+if __name__ == "__main__":
+    print("Full Tests for DF RHF Hessian")
+    unittest.main()
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
new file mode 100644
index 00000000..f737e92a
--- /dev/null
+++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
@@ -0,0 +1,113 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+import numpy
+from pyscf import gto, dft
+
+def setUpModule():
+    '''Create the shared H2O/STO-3G molecule; its log output goes to /dev/null.'''
+    global mol
+    mol = gto.Mole()
+    mol.verbose = 1
+    mol.output = '/dev/null'
+    mol.atom.extend([
+        ["O" , (0. , 0.     , 0.)],
+        [1   , (0. , -0.757 , 0.587)],
+        [1   , (0. , 0.757  , 0.587)] ])
+    mol.basis = 'sto3g'
+    mol.build()
+
+def tearDownModule():
+    '''Close the molecule's output stream and drop the module-level global.'''
+    global mol
+    mol.stdout.close()
+    del mol
+
+class KnownValues(unittest.TestCase):
+    '''Compare DF-RKS Hessians from the CPU path with those from mf.to_gpu().'''
+
+    def test_df_rks_hess_elec(self):
+        '''partial_hess_elec agrees for b3lyp with auxbasis_response = 2.'''
+        mf = dft.RKS(mol, xc='b3lyp').density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.grids.level = 1
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.partial_hess_elec()
+
+        mf = mf.to_gpu()
+        mf.grids.level = 1
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.partial_hess_elec()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5
+
+    def test_df_lda(self):
+        '''kernel() Hessians agree for dft.RKS(mol) with its default functional.'''
+        mf = dft.RKS(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+    def test_df_gga(self):
+        '''kernel() Hessians agree for b3lyp; the GPU run reuses the SCF grids as cphf_grids.'''
+        mf = dft.RKS(mol, xc='b3lyp').density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hessobj.base.cphf_grids = hessobj.base.grids
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+    def test_df_mgga(self):
+        '''kernel() Hessians agree for tpss; the GPU run reuses the SCF grids as cphf_grids.'''
+        mf = dft.RKS(mol, xc='tpss').density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hessobj.base.cphf_grids = hessobj.base.grids
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+if __name__ == "__main__":
+    print("Full Tests for DF RKS Hessian")
+    unittest.main()
diff --git a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
index 5a4bbb74..f3094095 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
@@ -61,7 +61,8 @@ def test_gen_vind(self):
         v1vo_cpu = fx_cpu(mo1)
 
         mf = mf.to_gpu()
-        fx_gpu = uhf_gpu.gen_vind(mf, mo_coeff, mo_occ)
+        hessobj = mf.Hessian()
+        fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1 = cupy.asarray(mo1)
         v1vo_gpu = fx_gpu(mo1)
         assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8
@@ -113,7 +114,8 @@ def test_make_h1(self):
         mo_energy = cupy.asarray(mo_energy)
         mo_coeff = cupy.asarray(mo_coeff)
         mo_occ = cupy.asarray(mo_occ)
-        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), verbose=1)
+        fx = hobj.gen_vind(mo_coeff, mo_occ)
+        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), fx, verbose=1)
         assert numpy.linalg.norm(h1a_cpu - h1a_gpu.get()) < 1e-5
         assert numpy.linalg.norm(h1b_cpu - h1b_gpu.get()) < 1e-5
         mo1_cpu = (numpy.asarray(mo1_cpu[0]), numpy.asarray(mo1_cpu[1]))
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 29d016a4..5e94a248 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -34,26 +34,29 @@
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.lib.cupy_helper import (
-    contract, tag_array, get_avail_mem, release_gpu_stack, pinv)
+    contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array)
 from gpu4pyscf.df import int3c2e, df
+from gpu4pyscf.df.hessian import rhf as df_rhf_hess
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
 from gpu4pyscf.df.grad.rhf import _gen_metric_solver
-from gpu4pyscf.gto.mole import sort_atoms
+from gpu4pyscf.df.hessian import jk
 
 LINEAR_DEP_THR = df.LINEAR_DEP_THR
-BLKSIZE = 256
+BLKSIZE = 128
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
 GB = 1024*1024*1024
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
     e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                   atmlst, max_memory, verbose, True)
+                                   atmlst, max_memory, verbose,
+                                   with_j=True, with_k=True)
     return e1 + ej - ek
 
 def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
-                      atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
+                      atmlst=None, max_memory=4000, verbose=None,
+                      with_j=True, with_k=True, omega=None):
     '''Partial derivative
     '''
     log = logger.new_logger(hessobj, verbose)
@@ -93,7 +96,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
     mocca = intopt.sort_orbitals(mocca, axis=[0])
     moccb = intopt.sort_orbitals(moccb, axis=[0])
@@ -112,43 +116,43 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
     int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-    hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    hk_ao_ao = cupy.zeros([nao,nao,3,3])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        hk_ao_aux = cupy.zeros([nao,naux,3,3])
-
     #  int3c contributions
     wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega)
     wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega)
-    rhoj0_P = solve_j2c(wja + wjb)
-    rhok0a_P__ = solve_j2c(wka_P__)
-    rhok0b_P__ = solve_j2c(wkb_P__)
+    rhoj0_P = rhok0a_P__ = rhok0b_P__ = None
+    if with_j:
+        rhoj0_P = solve_j2c(wja + wjb)
+    if with_k:
+        rhok0a_P__ = solve_j2c(wka_P__)
+        rhok0b_P__ = solve_j2c(wkb_P__)
     wja = wjb = wka_P__ = wkb_P__ = None
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
     # int3c_ip2 contributions
     wja_ip2, wka_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0a_tag, omega=omega)
     wjb_ip2, wkb_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0b_tag, omega=omega)
-    wj_ip2 = wja_ip2 + wjb_ip2
+    wj_ip2 = None
+    if with_j:
+        wj_ip2 = wja_ip2 + wjb_ip2
     t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1)
 
     #  int3c_ip1 contributions
     wj1a_P, wk1a_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0a_tag, omega=omega)
     wj1b_P, wk1b_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0b_tag, omega=omega)
-    wj1_P = wj1a_P + wj1b_P
-    rhoj1_P = solve_j2c(wj1_P)
-
-    hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
     wj1_P = None
-    if hessobj.auxbasis_response:
-        wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
-        wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-        hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
-        hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
-        hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
-        wj1_01 = None
-    rhoj1_P = None
+    if with_j:
+        wj1_P = wj1a_P + wj1b_P
+        rhoj1_P = solve_j2c(wj1_P)
+        hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        wj1_P = None
+        if hessobj.auxbasis_response:
+            wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
+            wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
+            hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
+            hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
+            wj1_01 = None
+        rhoj1_P = None
 
     if with_k:
         mem_avail = get_avail_mem()
@@ -159,17 +163,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory for intermediate variables')
-    
+        hk_ao_ao = cupy.zeros([nao,nao,3,3])
+        if hessobj.auxbasis_response:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
-            wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
+            #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
+            #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
+            wk1a_Pko_islice = copy_array(wk1a_Pko[:,i0:i1])
+            wk1b_Pko_islice = copy_array(wk1b_Pko[:,i0:i1])
             rhok1a_Pko = solve_j2c(wk1a_Pko_islice)
             rhok1b_Pko = solve_j2c(wk1b_Pko_islice)
             wk1a_Pko_islice = wk1b_Pko_islice = None
             for k0, k1 in lib.prange(0,nao,blksize):
-                wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1])
-                wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1])
-
+                #wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1])
+                #wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1])
+                wk1a_Pko_kslice = copy_array(wk1a_Pko[:,k0:k1])
+                wk1b_Pko_kslice = copy_array(wk1b_Pko[:,k0:k1])
+                
                 # (10|0)(0|10) without response of RI basis
                 vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice)
                 hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0a[i0:i1,k0:k1])
@@ -214,49 +224,30 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     cupy.get_default_memory_pool().free_all_blocks()
-    #  int3c_ipip1 contributions
-    fn = int3c2e.get_int3c2e_hjk
-    hja_ao_diag, hka_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb_ao_diag, hkb_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-    hj_ao_diag = 2.0 * (hja_ao_diag + hjb_ao_diag)
-    if with_k:
-        hk_ao_diag = 2.0 * (hka_ao_diag + hkb_ao_diag)
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
-
-    #  int3c_ipvip1 contributions
-    # (11|0), (0|00) without response of RI basis
-    fn = int3c2e.get_int3c2e_hjk
-    hja, hka = fn(intopt, 'ipvip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb, hkb = fn(intopt, 'ipvip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-    hj_ao_ao += 2.0*(hja + hjb)
+    hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag,
+                                          with_j=with_j, with_k=with_k, omega=omega,
+                                          auxbasis_response=hessobj.auxbasis_response)
+    hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag,
+                                          with_j=with_j, with_k=with_k, omega=omega,
+                                          auxbasis_response=hessobj.auxbasis_response)
+    if with_j:
+        hj_ipip = hja_ipip + hjb_ipip
     if with_k:
-        hk_ao_ao += (hka + hkb)
-    hja = hjb = hka = hkb = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
-
-    #  int3c_ip1ip2 contributions
-    # (10|1), (0|0)(0|00)
-    if hessobj.auxbasis_response:
-        fn = int3c2e.get_int3c2e_hjk
-        hja, hka = fn(intopt, 'ip1ip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, 'ip1ip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-        hj_ao_aux += hja + hjb
-        if with_k:
-            hk_ao_aux += hka + hkb
-        hja = hjb = hka = hkb = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
+        hk_ipip = 2.0*(hka_ipip + hkb_ipip)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
-    #  int3c_ipip2 contributions
     if hessobj.auxbasis_response > 1:
-        # (00|2), (0|0)(0|00)
-        fn = int3c2e.get_int3c2e_hjk
-        hja, hka = fn(intopt, 'ipip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, 'ipip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-        hj_aux_diag = hja + hjb
         if with_k:
-            hk_aux_diag = (hka + hkb)
-        hja = hjb = hka = hkb = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
+            rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
+            rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
+        
+            rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__)
+            rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__)
+            rhok0a_P__ = rhok0b_P__ = None
+
+            rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__)
+            rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__)
+            wka_ip2_P__ = wkb_ip2_P__ = None
 
     # int2c contributions
     if hessobj.auxbasis_response > 1:
@@ -267,14 +258,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
         int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
-        rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
-        # p,xp->px
-        hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        if with_j:
+            rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+            # p,xp->px
+            hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
-            rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
-            rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
-            hk_aux_diag -= contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
+            hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
         if omega and omega > 1e-10:
@@ -284,7 +274,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
         int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
-        hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
+        if with_j:
+            hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
@@ -294,33 +285,25 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     release_gpu_stack()
     # aux-aux pair
     if hessobj.auxbasis_response > 1:
-        wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
         int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
-
-        rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
-        hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
-        rhoj0_10 = rhoj0_P = None
-
-        rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
-        hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
-        hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
-        wj0_10 = rhoj1 = wj_ip2 = None
-
-        rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
-        hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
-        wj0_01 = rhoj0_01 = None
+        if with_j:
+            wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
+            rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
+            hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
+            rhoj0_10 = rhoj0_P = None
+
+            rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
+            hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
+            hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
+            wj0_10 = rhoj1 = wj_ip2 = None
+
+            rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
+            hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
+            wj0_01 = rhoj0_01 = None
 
         if with_k:
-            rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__)
-            rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__)
-            rhok0a_P__ = rhok0b_P__ = None
-
-
-            rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__)
-            rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__)
-            wka_ip2_P__ = wkb_ip2_P__ = None
             hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_11 = None
 
@@ -342,16 +325,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             hk_aux_aux -=      contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv)  # (00|1)(0|1)(0|00)
             rho2c_10= int2c_ip1_inv = None
     t1 = log.timer_debug1('contract int2c_*', *t1)
-
-    hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
-    hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
-    if hessobj.auxbasis_response > 1:
-        hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
-        hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
+    if with_j:
+        hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
+        if hessobj.auxbasis_response:
+            hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
+        if hessobj.auxbasis_response > 1:
+            hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
+            hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
-        hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
             hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
@@ -380,19 +361,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # -----------------------------------------
     #        collecting all
     # -----------------------------------------
-    hk_ao_ao *= 2.0
     e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ej = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ek = cupy.zeros([len(atmlst),len(atmlst),3,3])
+    ej = ek = None
+    if with_j:
+        ej = hj_ipip
+    if with_k:
+        hk_ao_ao *= 2.0
+        ek = hk_ipip
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
-        ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0)
-        if with_k:
-            ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
-            ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
+            if with_j:
+                ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1])
             if with_k:
                 ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1])
@@ -403,13 +385,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response:
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
-                if hessobj.auxbasis_response > 1:
-                    ej[i0,j0] += _ej * 2
-                    ej[j0,i0] += _ej.T * 2
-                else:
-                    ej[i0,j0] += _ej
-                    ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
+                    if hessobj.auxbasis_response > 1:
+                        ej[i0,j0] += _ej * 2
+                        ej[j0,i0] += _ej.T * 2
+                    else:
+                        ej[i0,j0] += _ej
+                        ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1])
                     if hessobj.auxbasis_response > 1:
@@ -423,13 +406,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response > 1:
             shl0, shl1, p0, p1 = auxslices[ia]
-            ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
+            if with_j:
+                ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
             if with_k:
                 ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0)
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
-                ej[i0,j0] += _ej
-                ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
+                    ej[i0,j0] += _ej
+                    ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek
@@ -437,8 +422,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
-            ej[j0,i0] = ej[i0,j0].T
-            ek[j0,i0] = ek[i0,j0].T
+            if with_j:
+                ej[j0,i0] = ej[i0,j0].T
+            if with_k:
+                ek[j0,i0] = ek[i0,j0].T
     t1 = log.timer_debug1('hcore contribution', *t1)
     log.timer('UHF partial hessian', *time0)
     return e1, ej, ek
@@ -447,25 +434,28 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+    '''Return (h1moa, h1mob): vj1 - vk1 from _get_jk_ip plus get_grad_hcore, per spin.'''
     mol = hessobj.mol
     natm = mol.natm
+    # atmlst may be any sequence (list, tuple, range) but must list every atom in order
+    assert atmlst is None or list(atmlst) == list(range(natm))
     if atmlst is None:
         atmlst = range(natm)
 
-    nocca, noccb = hessobj.base.nelec
-    nmo = len(mo_occ[0])
-    h1aoa = cupy.empty((natm, 3, nmo, nocca))
-    h1aob = cupy.empty((natm, 3, nmo, noccb))
-    for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                    atmlst, verbose, True):
-        h1a, h1b = h1
-        vj1a, vj1b = vj1
-        vk1a, vk1b = vk1
-
-        h1aoa[ia] = h1a + vj1a - vk1a
-        h1aob[ia] = h1b + vj1b - vk1b
-    return (h1aoa, h1aob)
-
-def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
-            verbose=None, with_k=True, omega=None):
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, with_j=True, with_k=True)
+    vj1a, vj1b = vj1
+    vk1a, vk1b = vk1
+    h1moa = vj1a
+    h1moa-= vk1a
+    h1mob = vj1b
+    h1mob-= vk1b
+    vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
+
+    gobj = hessobj.base.nuc_grad_method()
+    h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
+    h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
+    return (h1moa, h1mob)
+
+def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
+            verbose=None, with_j=True, with_k=True, omega=None):
     '''
-    A generator to produce the derivatives of Hcore, J, K matrices in MO bases
+    Compute the derivatives of the J and K matrices in MO bases for both spins
     '''
@@ -474,8 +464,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mol = hessobj.mol
     if atmlst is None:
         atmlst = range(mol.natm)
-    # FIXME
-    with_k = True
+
     mo_coeff = cupy.asarray(mo_coeff, order='C')
     mo_occ = cupy.asarray(mo_occ, order='C')
 
@@ -500,12 +489,12 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     int2c = cupy.asarray(int2c, order='C')
     # ======================= sorted AO begin ======================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, 
-                 diag_block_with_triu=True, 
-                 aosym=False, 
-                 group_size_aux=BLKSIZE, 
+    intopt.build(mf.direct_scf_tol,
+                 diag_block_with_triu=True,
+                 aosym=False, verbose=0,
+                 group_size_aux=BLKSIZE,
                  group_size=BLKSIZE)
-    
+
     mocca = intopt.sort_orbitals(mocca, axis=[0])
     moccb = intopt.sort_orbitals(moccb, axis=[0])
     mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
@@ -519,10 +508,12 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     fn = int3c2e.get_int3c2e_wjk
     dm0_tag = tag_array(dm0, occ_coeff=mocca)
-    wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega)
+    wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega)
     dm0_tag = tag_array(dm0, occ_coeff=moccb)
-    wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega)
-    rhoj0 = solve_j2c(wj)
+    wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega)
+    rhoj0 = None
+    if with_j:
+        rhoj0 = solve_j2c(wj)
     wj = None
 
     if isinstance(wka_Pl_, cupy.ndarray):
@@ -530,8 +521,11 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     else:
         rhok0a_Pl_ = np.empty_like(wka_Pl_)
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1])
-            rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            # wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1])
+            # rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wka_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0a_Pl_[:,p0:p1])
         wk_tmp = None
 
     if isinstance(wkb_Pl_, cupy.ndarray):
@@ -539,31 +533,14 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     else:
         rhok0b_Pl_ = np.empty_like(wkb_Pl_)
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1])
-            rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            #wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1])
+            #rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wkb_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0b_Pl_[:,p0:p1])
         wk_tmp = None
     wka_Pl_ = wkb_Pl_ = None
-
-    # -----------------------------
-    # int3c_ip1 contributions
-    # ------------------------------
-    cupy.get_default_memory_pool().free_all_blocks()
-    fn = int3c2e.get_int3c2e_ip1_vjk
-    dm0_tag = tag_array(dm0, occ_coeff=mocca)
-    vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega)
-    dm0_tag = tag_array(dm0, occ_coeff=moccb)
-    vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega)
-
-    vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
-    vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2])
-    vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2])
-
-    vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0])
-    vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1])
-    vk1a_int3c = -contract('nxiq,ip->nxpq', vk1a_ao, mo_coeff[0])
-    vk1b_int3c = -contract('nxiq,ip->nxpq', vk1b_ao, mo_coeff[1])
-    vj1a_ao = vj1b_ao = vk1a_ao = vk1b_ao = None
-    t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
+    vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None
 
     # --------------------------
     #  int3c_ip2 contribution
@@ -572,9 +549,11 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     if hessobj.auxbasis_response:
         fn = int3c2e.get_int3c2e_ip2_vjk
         dm0_tag = tag_array(dm0, occ_coeff=mocca)
-        vj1a_int3c_ip2, vk1a_int3c_ip2 = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices, omega=omega)
+        vj1a_int3c, vk1a_int3c = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices,
+                                    with_j=with_j, with_k=with_k, omega=omega)
         dm0_tag = tag_array(dm0, occ_coeff=moccb)
-        vj1b_int3c_ip2, vk1b_int3c_ip2 = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, omega=omega)
+        vj1b_int3c, vk1b_int3c = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices,
+                                    with_j=with_j, with_k=with_k, omega=omega)
 
         # Responses due to int2c2e_ip1
         if omega and omega > 1e-10:
@@ -584,34 +563,37 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
         int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
         int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
-
-        # generate rhok0_P__
-        if isinstance(rhok0a_Pl_, cupy.ndarray):
-            rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca)
-        else:
-            naux = auxmol.nao
-            nocc = mocca.shape[1]
-            rhok0a_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
-                rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca)
-            rhok0_Pl_tmp = None
-
-        # generate rhok0_P__
-        if isinstance(rhok0b_Pl_, cupy.ndarray):
-            rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb)
-        else:
-            naux = auxmol.nao
-            nocc = moccb.shape[1]
-            rhok0b_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
-                rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb)
-            rhok0_Pl_tmp = None
-
-        wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
-        wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__)
-        wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__)
+        if with_k:
+            # generate rhok0_P__
+            if isinstance(rhok0a_Pl_, cupy.ndarray):
+                rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca)
+            else:
+                naux = auxmol.nao
+                nocc = mocca.shape[1]
+                rhok0a_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0a_Pl_[p0:p1])
+                    rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca)
+                rhok0_Pl_tmp = None
+
+            # generate rhok0_P__
+            if isinstance(rhok0b_Pl_, cupy.ndarray):
+                rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb)
+            else:
+                naux = auxmol.nao
+                nocc = moccb.shape[1]
+                rhok0b_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0b_Pl_[p0:p1])
+                    rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb)
+                rhok0_Pl_tmp = None
+        if with_j:
+            wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
+        if with_k:
+            wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__)
+            wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__)
 
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
         mem_avail = get_avail_mem()
@@ -620,42 +602,76 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory to compute int3c2e_ip2')
-        
-        for p0, p1 in lib.prange(0,nao,64):
-            rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1])
-            rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1])
-            vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10)
-            vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10)
 
+        for p0, p1 in lib.prange(0,nao,blksize):
+            #rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1])
+            #rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1])
+            rhoka_tmp = copy_array(rhok0a_Pl_[:,p0:p1])
+            rhokb_tmp = copy_array(rhok0b_Pl_[:,p0:p1])
             wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp)
             wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp)
-            vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0)
-            vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0)
-            vj1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom)
-            vj1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom)
-            vj1a_tmp = vj1b_tmp = None
+            if with_j:
+                vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10)
+                vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10)
+
+                vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0)
+                vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0)
+                vj1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom)
+                vj1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom)
+                vj1a_tmp = vj1b_tmp = None
             if with_k:
                 vk1a_tmp = contract('xpio,pro->xpir', wk0a_10_Pl_, rhok0a_P__)
                 vk1a_tmp += contract('xpro,pir->xpio', wk0a_10_P__, rhoka_tmp)
                 vk1b_tmp = contract('xpio,pro->xpir', wk0b_10_Pl_, rhok0b_P__)
                 vk1b_tmp += contract('xpro,pir->xpio', wk0b_10_P__, rhokb_tmp)
 
-                vk1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom)
-                vk1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom)
+                vk1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom)
+                vk1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom)
                 vk1a_tmp = vk1b_tmp = None
             wk0a_10_Pl_ = wk0b_10_Pl_ = rhoka_tmp = rhokb_tmp = None
         wj0_10 = wk0a_10_P__ = wk0b_10_P__ = rhok0a_P__ =rhok0b_P__ = int2c_ip1 = None
-        rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None
-        aux2atom = None
 
-        vj1a_int3c += contract('nxiq,ip->nxpq', vj1a_int3c_ip2, mo_coeff[0])
-        vj1b_int3c += contract('nxiq,ip->nxpq', vj1b_int3c_ip2, mo_coeff[1])
-        if with_k:
-            vk1a_int3c += contract('nxiq,ip->nxpq', vk1a_int3c_ip2, mo_coeff[0])
-            vk1b_int3c += contract('nxiq,ip->nxpq', vk1b_int3c_ip2, mo_coeff[1])
-        vk1a_int3c_ip2 = vk1b_int3c_ip2 = None
+        aux2atom = None
         t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
 
+    # -----------------------------
+    # int3c_ip1 contributions
+    # ------------------------------
+    cupy.get_default_memory_pool().free_all_blocks()
+    fn = int3c2e.get_int3c2e_ip1_vjk
+    dm0_tag = tag_array(dm0, occ_coeff=mocca)
+    vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices,
+                                             with_j=with_j, with_k=with_k, omega=omega)
+    dm0_tag = tag_array(dm0, occ_coeff=moccb)
+    vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices,
+                                             with_j=with_j, with_k=with_k, omega=omega)
+    rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None
+
+    if with_j:
+        vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+        if not hessobj.auxbasis_response:
+            vj1a_int3c = -vj1a_ao
+            vj1b_int3c = -vj1b_ao
+        else:
+            vj1a_int3c -= vj1a_ao
+            vj1b_int3c -= vj1b_ao
+        vj1a_ao = vj1b_ao = None
+        vj1a_int3c = contract('nxiq,ip->nxpq', vj1a_int3c, mo_coeff[0])
+        vj1b_int3c = contract('nxiq,ip->nxpq', vj1b_int3c, mo_coeff[1])
+    if with_k:
+        vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2])
+        vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2])
+        if not hessobj.auxbasis_response:
+            vk1a_int3c = -vk1a_ao
+            vk1b_int3c = -vk1b_ao
+        else:
+            vk1a_int3c -= vk1a_ao
+            vk1b_int3c -= vk1b_ao
+        vk1a_ao = vk1b_ao = None
+        vk1a_int3c = contract('nxiq,ip->nxpq', vk1a_int3c, mo_coeff[0])
+        vk1b_int3c = contract('nxiq,ip->nxpq', vk1b_int3c, mo_coeff[1])
+    t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
+
     mocca = intopt.unsort_orbitals(mocca, axis=[0])
     moccb = intopt.unsort_orbitals(moccb, axis=[0])
     mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1])
@@ -666,43 +682,35 @@ def _ao2mo(mat, mocc, mo):
         tmp = contract('xij,jo->xio', mat, mocc)
         return contract('xik,ip->xpk', tmp, mo)
 
-    gobj = hessobj.base.nuc_grad_method()
-    grad_hcore_a = rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
-    grad_hcore_b = rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
     cupy.get_default_memory_pool().free_all_blocks()
 
-    vk1a = vk1b = None
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
-        vj1_ao = cupy.zeros([3,nao,nao])
-        vk1a_ao = cupy.zeros([3,nao,nao])
-        vk1b_ao = cupy.zeros([3,nao,nao])
-
-        vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
-        vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+        if with_j:
+            vj1_ao = cupy.zeros([3,nao,nao])
+            vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
+            vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+            vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0])
+            vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1])
         if with_k:
+            vk1a_ao = cupy.zeros([3,nao,nao])
+            vk1b_ao = cupy.zeros([3,nao,nao])
             vk1a_ao[:,p0:p1,:] -= vk1a_buf[:,p0:p1,:]
             vk1a_ao[:,:,p0:p1] -= vk1a_buf[:,p0:p1,:].transpose(0,2,1)
             vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:]
             vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1)
+            vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0])
+            vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1])
+    return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c)
 
-        h1a = grad_hcore_a[i0]
-        h1b = grad_hcore_b[i0]
-        vj1a = vj1a_int3c[ia] + _ao2mo(vj1_ao, mocca, mo_coeff[0])
-        vj1b = vj1b_int3c[ia] + _ao2mo(vj1_ao, moccb, mo_coeff[1])
-        if with_k:
-            vk1a = vk1a_int3c[ia] + _ao2mo(vk1a_ao, mocca, mo_coeff[0])
-            vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1])
-        yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b)
+_get_jk_mo = df_rhf_hess._get_jk_mo
 
 class Hessian(uhf_hess.Hessian):
-    '''Non-relativistic restricted Hartree-Fock hessian'''
+    '''Non-relativistic unrestricted Hartree-Fock hessian'''
 
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = uhf_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
+    get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 1e7ee43b..059f571c 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -23,10 +23,12 @@
 import numpy
 import cupy
 from pyscf import lib
+from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.hessian import uks as uks_hess
 from gpu4pyscf.df.hessian import uhf as df_uhf_hess
+from gpu4pyscf.df.hessian.uhf import _partial_hess_ejk, _get_jk_ip
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract
 
@@ -51,17 +53,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
-    de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                                atmlst, max_memory, verbose,
-                                                with_k=with_k)
+    de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                    atmlst, max_memory, verbose,
+                                    with_j=True, with_k=with_k)
     de2 += ej  # (A,B,dR_A,dR_B)
     if with_k:
         de2 -= hyb * ek
 
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        ek_lr = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                            atmlst, max_memory, verbose,
-                                            True, omega=omega)[2]
+        ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                  atmlst, max_memory, verbose,
+                                  with_j=False, with_k=True, omega=omega)[2]
         de2 -= (alpha - hyb) * ek_lr
 
     max_memory = None
@@ -89,40 +91,50 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+    '''Return (h1moa, h1mob): J, scaled K, hcore-gradient and Vxc-derivative terms per spin.'''
     mol = hessobj.mol
+    natm = mol.natm
+    assert atmlst is None or list(atmlst) == list(range(natm))
     mf = hessobj.base
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
-    for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, with_k):
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                          atmlst, verbose, with_j=True, with_k=True)
+    vj1a, vj1b = vj1
+    h1moa = vj1a
+    h1mob = vj1b
+
+    if with_k:
+        vk1a, vk1b = vk1
+        h1moa -= hyb * vk1a
+        h1mob -= hyb * vk1b
+    vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
 
-        h1moa[ia] += h1[0] + vj1[0]
-        h1mob[ia] += h1[1] + vj1[1]
-        if with_k:
-            vk1a, vk1b = vk1
-            h1moa[ia] -= hyb * vk1a
-            h1mob[ia] -= hyb * vk1b
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        for ia, h1, vj1_lr, vk1_lr in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, True, omega=omega):
-            vk1a, vk1b = vk1_lr
-            h1moa[ia] -= (alpha - hyb) * vk1a
-            h1mob[ia] -= (alpha - hyb) * vk1b
+        _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                               atmlst, verbose, with_j=False, with_k=True, omega=omega)
+        vk1a, vk1b = vk1_lr
+        h1moa -= (alpha - hyb) * vk1a
+        h1mob -= (alpha - hyb) * vk1b
+
+    gobj = hessobj.base.nuc_grad_method()
+    h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
+    h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
+
+    v1moa, v1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+    h1moa += v1moa
+    h1mob += v1mob
     return h1moa, h1mob
 
 class Hessian(uks_hess.Hessian):
-    '''Non-relativistic RKS hessian'''
+    '''Non-relativistic UKS hessian'''
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = uks_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    hess_elec = uhf_hess.hess_elec
-    kernel = rhf_hess.kernel
-    hess = kernel
+    get_jk_mo = df_uhf_hess._get_jk_mo
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 3bb6c916..e77e30ca 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -20,8 +20,8 @@
 from pyscf import gto, df, lib
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint
-from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, 
-                                       reduce_to_device)
+from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem,
+                                       reduce_to_device, copy_array, transpose_sum)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
 from gpu4pyscf.__config__ import _num_devices, _streams
@@ -29,7 +29,7 @@
 LMAX_ON_GPU = 8
 FREE_CUPY_CACHE = True
 STACK_SIZE_PER_THREAD = 8192 * 4
-BLKSIZE = 128
+BLKSIZE = 256
 NROOT_ON_GPU = 7
 
 def make_fake_mol():
@@ -103,8 +103,8 @@ def __del__(self):
         except AttributeError:
             pass
 
-    def build(self, cutoff=1e-14, group_size=None,
-              group_size_aux=None, diag_block_with_triu=False, aosym=False):
+    def build(self, cutoff=1e-14, group_size=None, group_size_aux=None, 
+              diag_block_with_triu=False, aosym=False, verbose=None):
         '''
         int3c2e is based on int2e with (ao,ao|aux,1)
         a tot_mol is created with concatenating [mol, fake_mol, aux_mol]
@@ -116,7 +116,9 @@ def build(self, cutoff=1e-14, group_size=None,
         mol = basis_seg_contraction(_mol, allow_replica=True)[0]
         auxmol = basis_seg_contraction(_auxmol, allow_replica=True)[0]
         
-        log = logger.new_logger(_mol, _mol.verbose)
+        if verbose is None:
+            verbose = _mol.verbose
+        log = logger.new_logger(_mol, verbose)
         cput0 = log.init_timer()
         _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
 
@@ -181,7 +183,7 @@ def build(self, cutoff=1e-14, group_size=None,
 
         aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart)
         ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1])
-        self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])        
+        self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])
         cput1 = log.timer_debug1('Aux AO indices', *cput1)
 
         ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart)
@@ -218,28 +220,10 @@ def build(self, cutoff=1e-14, group_size=None,
         self.pair2bra = pair2bra
         self.pair2ket = pair2ket
         self.l_ctr_offsets = l_ctr_offsets
-        bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1)
-        bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32)
-        log_qs = log_qs + aux_log_qs
-        ao_loc = _tot_mol.ao_loc_nr(cart=True)
-        ncptype = len(log_qs)
 
         self._bpcache = {}
-        for n in range(_num_devices):
-            with cupy.cuda.Device(n), _streams[n]:
-                bpcache = ctypes.POINTER(BasisProdCache)()
-                scale_shellpair_diag = 1.
-                libgint.GINTinit_basis_prod(
-                    ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag),
-                    ao_loc.ctypes.data_as(ctypes.c_void_p),
-                    bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
-                    bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
-                    _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm),
-                    _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas),
-                    _tot_mol._env.ctypes.data_as(ctypes.c_void_p))
-                self._bpcache[n] = bpcache
 
-        cput1 = log.timer_debug1('Initialize GPU cache', *cput1)
+        bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32)
         self.bas_pairs_locs = bas_pairs_locs
         ncptype = len(self.log_qs)
         self.aosym = aosym
@@ -260,10 +244,31 @@ def build(self, cutoff=1e-14, group_size=None,
 
         self._sorted_mol = _sorted_mol
         self._sorted_auxmol = _sorted_auxmol
-    
+
     @property
     def bpcache(self):
         device_id = cupy.cuda.Device().id
+        if device_id not in self._bpcache:
+            with cupy.cuda.Device(device_id), _streams[device_id]:
+                log = logger.new_logger(self.mol, self.mol.verbose)
+                cput0 = log.init_timer()
+                bpcache = ctypes.POINTER(BasisProdCache)()
+                scale_shellpair_diag = 1.
+                _tot_mol = self._tot_mol
+                log_qs = self.log_qs + self.aux_log_qs
+                ao_loc = _tot_mol.ao_loc_nr(cart=True)
+                bas_pair2shls = np.hstack(self.pair2bra + self.pair2ket).astype(np.int32).reshape(2,-1)
+                ncptype = len(log_qs)
+                libgint.GINTinit_basis_prod(
+                    ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag),
+                    ao_loc.ctypes.data_as(ctypes.c_void_p),
+                    bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+                    self.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
+                    _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm),
+                    _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas),
+                    _tot_mol._env.ctypes.data_as(ctypes.c_void_p))
+                self._bpcache[device_id] = bpcache
+                cput0 = log.timer_debug1(f'Initialize GPU cache on Device {device_id}', *cput0)
         bpcache = self._bpcache[device_id]
         return bpcache
 
@@ -310,15 +315,15 @@ def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]):
         mat = cupy.empty_like(sorted_mat)
         mat[tuple(fancy_index)] = sorted_mat
         return mat
-    
+
     @property
     def cart2sph(self):
         return block_c2s_diag(self.angular, self.l_ctr_counts)
-    
+
     @property
     def aux_cart2sph(self):
         return block_c2s_diag(self.aux_angular, self.aux_l_ctr_counts)
-    
+
     @property
     def coeff(self):
         nao = self.mol.nao
@@ -339,36 +344,45 @@ def aux_coeff(self):
             self._aux_coeff = self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1])
         return self._aux_coeff
 
-def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
+def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True, with_k=True):
     log = logger.new_logger(mol, mol.verbose)
     intopt = VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(thred, diag_block_with_triu=True, aosym=True,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
     orbo = dm0_tag.occ_coeff
     nao = mol.nao
     naux = auxmol.nao
     nocc = orbo.shape[1]
-    wj = cupy.empty([naux])
-    avail_mem = get_avail_mem()
-    use_gpu_memory = True
-    if naux*nao*nocc*8 < 0.4*avail_mem:
-        try:
-            wk = cupy.empty([naux,nao,nocc])
-        except Exception:
+
+    wj = None
+    if with_j:
+        wj = cupy.empty([naux])
+
+    wk = None
+    if with_k:
+        avail_mem = get_avail_mem()
+        use_gpu_memory = True
+        if naux*nao*nocc*8 < 0.4*avail_mem:
+            try:
+                wk = cupy.empty([naux,nao,nocc])
+            except Exception:
+                use_gpu_memory = False
+        else:
             use_gpu_memory = False
-    else:
-        use_gpu_memory = False
-    
-    if not use_gpu_memory:
-        log.debug('Saving int3c2e_wjk on CPU memory')
-        mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8)
-        wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem)
+
+        if not use_gpu_memory:
+            log.debug('Saving int3c2e_wjk on CPU memory')
+            mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8)
+            wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem)
 
     # TODO: async data transfer
     for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
         k0 = intopt.aux_ao_loc[cp_kl_id]
         k1 = intopt.aux_ao_loc[cp_kl_id+1]
-        rhoj_tmp = cupy.zeros([k1-k0], order='C')
-        rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C')
+        if with_j:
+            rhoj_tmp = cupy.zeros([k1-k0], order='C')
+        if with_k:
+            rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C')
 
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
@@ -381,20 +395,23 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
                 int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
             i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
             j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
-
-            tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1])
-            rhoj_tmp += tmp
-            rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1])
-
-            if cpi != cpj and intopt.aosym:
+            if with_j:
+                tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1])
                 rhoj_tmp += tmp
-                rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1])
-        wj[k0:k1] = rhoj_tmp
+                if cpi != cpj:
+                    rhoj_tmp += tmp
+            if with_k:
+                rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1])
+                if cpi != cpj:
+                    rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1])
+        if with_j:
+            wj[k0:k1] = rhoj_tmp
         if with_k:
             if isinstance(wk, cupy.ndarray):
                 wk[k0:k1] = rhok_tmp
             else:
-                rhok_tmp.get(out=wk[k0:k1])
+                #rhok_tmp.get(out=wk[k0:k1])
+                copy_array(rhok_tmp, wk[k0:k1])
     return wj, wk
 
 def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None, stream=None):
@@ -484,16 +501,6 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
     ao_loc = intopt.ao_loc
     aux_ao_loc = intopt.aux_ao_loc
     comp = 3**order
-
-    lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max()
-    aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max()
-    nroots = (lmax + aux_lmax + order)//2 + 1
-    if nroots > NROOT_ON_GPU:
-        from pyscf.gto.moleintor import getints, make_cintopt
-        pmol = intopt._tot_mol
-        intor = pmol._add_suffix('int3c2e_' + ip_type)
-        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-
     nbins = 1
 
     # If task_list is not given, generate all the tasks
@@ -505,7 +512,7 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
     for aux_id, cp_ij_id in task_list:
         cp_kl_id = aux_id + len(intopt.log_qs)
         lk = intopt.aux_angular[aux_id]
-        
+
         cpi = intopt.cp_idx[cp_ij_id]
         cpj = intopt.cp_jdx[cp_ij_id]
         li = intopt.angular[cpi]
@@ -546,6 +553,11 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
             if err != 0:
                 raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}')
         else:
+            from pyscf.gto.moleintor import getints, make_cintopt
+            pmol = intopt._tot_mol
+            intor = pmol._add_suffix('int3c2e_' + ip_type)
+            opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+
             # TODO: sph2cart in CPU?
             ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
             jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
@@ -670,26 +682,26 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None):
     get rhoj pass1 for int3c2e
     '''
     if stream is None: stream = cupy.cuda.get_current_stream()
-    
+
     n_dm = 1
 
     naux = intopt._sorted_auxmol.nao
-    
+
     coeff = intopt.coeff
     if dm0.ndim == 3:
         dm0 = dm0[0] + dm0[1]
     dm_cart = coeff @ dm0 @ coeff.T
-    
+
     num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs]
     num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs]
 
     bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32)
     bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32)
-    
+
     ncp_ij = len(intopt.log_qs)
     ncp_kl = len(intopt.aux_log_qs)
     norb = dm_cart.shape[0]
-    
+
     rhoj = cupy.zeros([naux])
 
     err = libgvhf.GINTbuild_j_int3c2e_pass1(
@@ -706,7 +718,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None):
         ctypes.c_int(ncp_kl))
     if err != 0:
         raise RuntimeError('CUDA error in get_j_pass1')
-    
+
     if sort_j:
         aux_coeff = intopt.aux_coeff
         rhoj = cupy.dot(rhoj, aux_coeff)
@@ -731,7 +743,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
 
     ncp_ij = len(intopt.log_qs)
     ncp_kl = len(intopt.aux_log_qs)
-    
+
     rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0])
     if not intopt.auxmol.cart:
         rhoj = intopt.aux_cart2sph @ rhoj
@@ -751,7 +763,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
 
     if err != 0:
         raise RuntimeError('CUDA error in get_j_pass2')
-    
+
     if not intopt.mol.cart:
         cart2sph = intopt.cart2sph
         vj = cart2sph.T @ vj @ cart2sph
@@ -759,6 +771,48 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
     vj = vj + vj.T
     return vj
 
+def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None):
+    '''Accumulate rhoj/rhok on one device for the aux blocks in task_k_list.
+
+    Returns full-length (naux) arrays; rows of other aux blocks stay zero.
+    '''
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        cput = log.init_timer()
+        mocc, dm0 = cupy.asarray(mocc), cupy.asarray(dm0)
+        naux, nocc = intopt.auxmol.nao, mocc.shape[1]
+        rhoj = cupy.zeros([naux])
+        rhok = cupy.zeros([naux,nocc,nocc])
+        for aux_id in task_k_list:
+            k0, k1 = intopt.aux_ao_loc[aux_id], intopt.aux_ao_loc[aux_id+1]
+            blk_j = cupy.zeros([k1-k0], order='C')
+            blk_k = cupy.zeros([k1-k0, nocc, nocc], order='C')
+            for pair_id in range(len(intopt.log_qs)):
+                cpi, cpj = intopt.cp_idx[pair_id], intopt.cp_jdx[pair_id]
+                li, lj = intopt.angular[cpi], intopt.angular[cpj]
+                eri = get_int3c2e_slice(intopt, pair_id, aux_id, omega=omega)
+                if not intopt.mol.cart:
+                    eri = cart2sph(eri, axis=1, ang=lj)
+                    eri = cart2sph(eri, axis=2, ang=li)
+                i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+                j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+                # With aosym the block sums are doubled / transpose-summed below,
+                # so diagonal shell-pair blocks are halved here.
+                if intopt.aosym and cpi == cpj:
+                    eri *= 0.5
+                blk_j += contract('pji,ij->p', eri, dm0[i0:i1,j0:j1])
+                half = contract('pji,jo->poi', eri, mocc[j0:j1])
+                blk_k += contract('poi,ir->por', half, mocc[i0:i1])
+                eri = half = None  # drop references before the next slice
+            if intopt.aosym:
+                rhoj[k0:k1] = 2.0 * blk_j
+                rhok[k0:k1] = transpose_sum(blk_k)
+            else:
+                rhoj[k0:k1] = blk_j
+                rhok[k0:k1] = blk_k
+        cput = log.timer_debug1(f'int3c2e_vjk on Device {device_id}', *cput)
+    return rhoj, rhok
+
 def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
     '''
     get rhoj and rhok for int3c2e
@@ -766,109 +820,132 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
     intopt = VHFOpt(mol, auxmol, 'int2e')
     intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
-    if omega is None: omega = 0.0
-    naux = auxmol.nao
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
-    nocc = orbo.shape[1]
-    rhoj = cupy.empty([naux])
-    rhok = cupy.empty([naux,nocc,nocc])
+    futures = []
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
-    for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
-        k0 = intopt.aux_ao_loc[cp_kl_id]
-        k1 = intopt.aux_ao_loc[cp_kl_id+1]
-        rhoj_tmp = cupy.zeros([k1-k0], order='C')
-        rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C')
-        for cp_ij_id, _ in enumerate(intopt.log_qs):
-            cpi = intopt.cp_idx[cp_ij_id]
-            cpj = intopt.cp_jdx[cp_ij_id]
-            li = intopt.angular[cpi]
-            lj = intopt.angular[cpj]
-            int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega)
-            if not intopt.mol.cart:
-                int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj)
-                int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
-            i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
-            j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
-            if cpi == cpj and intopt.aosym:
-                int3c_blk *= 0.5
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _int3c2e_jk_task, intopt, task_list[device_id],
+                dm0_tag, orbo, device_id=device_id, omega=omega)
+            futures.append(future)
 
-            rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0_tag[i0:i1,j0:j1])
-            ints_o = contract('pji,jo->poi', int3c_blk, orbo[j0:j1])
-            rhok_tmp += contract('poi,ir->por', ints_o, orbo[i0:i1])
+    rhoj_total = []
+    rhok_total = []
+    for future in futures:
+        rhoj, rhok = future.result()
+        rhoj_total.append(rhoj)
+        rhok_total.append(rhok)
 
-        if intopt.aosym:
-            rhoj[k0:k1] = 2.0 * rhoj_tmp
-            rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1])
-        else:
-            rhoj[k0:k1] = rhoj_tmp
-            rhok[k0:k1] = rhok_tmp
+    rhoj = rhok = None
+    rhoj = reduce_to_device(rhoj_total, inplace=True)
+    if with_k:
+        rhok = reduce_to_device(rhok_total, inplace=True)
     return rhoj, rhok
 
-def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None):
+def _split_tasks(loads, ngroups):
+    ''' Greedily partition task indices into ngroups bins of near-equal total load
+    '''
+    if ngroups == 1:
+        return [range(len(loads))]
+    bins = [[] for _ in range(ngroups)]
+    totals = [0] * ngroups
+
+    # Heaviest task first; each one goes to the currently lightest bin.
+    for task in np.argsort(loads)[::-1]:
+        lightest = totals.index(min(totals))
+        bins[lightest].append(task)
+        totals[lightest] += loads[task]
+    return bins
+
+def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0,
+                          with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
     aoslices = intopt.mol.aoslice_by_atom()
+    vj1_buf = vk1_buf = vj1 = vk1 = None  # each stays None when its term is disabled
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         ao2atom = get_ao2atom(intopt, aoslices)
-        rhoj = cupy.asarray(rhoj)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         nocc = orbo.shape[1]
-        vj1_buf = cupy.zeros([3,nao,nao])
-        vk1_buf = cupy.zeros([3,nao,nao])
-        vj1 = cupy.zeros([natom,3,nao,nocc])
-        vk1 = cupy.zeros([natom,3,nao,nocc])
+        if with_j:
+            rhoj = cupy.asarray(rhoj)
+            vj1_buf = cupy.zeros([3,nao,nao])
+            vj1 = cupy.zeros([natom,3,nao,nocc])
+        if with_k:
+            vk1_buf = cupy.zeros([3,nao,nao])
+            vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
-            rhok_tmp = cupy.asarray(rhok[k0:k1])
+            # rhok_tmp feeds both the J (vj1) and the K contractions below.
+            rhok_tmp = copy_array(rhok[k0:k1])
             if with_k:
                 rhok0 = contract('pio,ir->pro', rhok_tmp, orbo)
                 rhok0 = contract('pro,Jo->prJ', rhok0, orbo)
-            rhoj0 = cupy.zeros([3,k1-k0,nao])
-            int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc])
+                int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc])
+            if with_j:
+                rhoj0 = cupy.zeros([3,k1-k0,nao])
+
             for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
                                                                      ip_type='ip1', omega=omega):
-                vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
-                rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-                int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-
+                if with_j:
+                    vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
+                    rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
                 if with_k:
+                    int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+
                     vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
                     vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])
-
-                    int3c_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-                    rhok0_slice = contract('pJr,ir->pJi', rhok_tmp, orbo[i0:i1])
-
-                    vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice)
-                    vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1])
-                    vk1_ao = int3c_occ = None
-            rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
-            vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
-            rhoj0_atom = None
-            vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
+                    vk1_ao = int3c_blk = None
+            if with_j:
+                rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
+                vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
+                rhoj0_atom = rhoj0 = None
+            if with_k:
+                rhok0 = None
+                vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
+                mem_avail = get_avail_mem()  # bytes; keep each temporary below ~20% of it
+                blksize = max(1, min(int(mem_avail * 0.2 / ((k1-k0) * nao * 8)),
+                                     int(mem_avail * 0.2 / (nocc * nao * 3 * 8))))
+                for p0, p1, in lib.prange(0, nao, blksize):
+                    rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo)
+                    vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice)
+                    vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom)
+                    rhok0_slice = vk1_ao = None
+            rhok_tmp = int3c_ip1_occ = None
+        t0 = log.timer_debug1(f'int3c2e_ip1_vjk on Device {device_id}', *t0)
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
 
-def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omega=None):
+def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True,
+                        with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip1_vjk_task, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
+                _int3c2e_ip1_vjk_task, intopt, task_list[device_id],
+                rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
+                device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     vj1_buf_total = []
     vk1_buf_total = []
     vj1_total = []
@@ -879,48 +956,61 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
         vk1_buf_total.append(vk1_buf)
         vj1_total.append(vj1)
         vk1_total.append(vk1)
-        
+
     vj1 = vk1 = vj1_buf = vk1_buf = None
-    vj1 = reduce_to_device(vj1_total, inplace=True)
-    vj1_buf = reduce_to_device(vj1_buf_total, inplace=True)
+    if with_j:
+        vj1 = reduce_to_device(vj1_total, inplace=True)
+        vj1_buf = reduce_to_device(vj1_buf_total, inplace=True)
     if with_k:
         vk1 = reduce_to_device(vk1_total, inplace=True)
         vk1_buf = reduce_to_device(vk1_buf_total, inplace=True)
     return vj1_buf, vk1_buf, vj1, vk1
 
 
-def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None):
+def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo,
+                          device_id=0, with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
     auxslices = intopt.auxmol.aoslice_by_atom()
+    vj1 = vk1 = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         aux2atom = get_aux2atom(intopt, auxslices)
-        rhoj = cupy.asarray(rhoj)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         nocc = orbo.shape[1]
-        vj1 = cupy.zeros([natom,3,nao,nocc])
-        vk1 = cupy.zeros([natom,3,nao,nocc])
+        if with_j:
+            rhoj = cupy.asarray(rhoj)
+            vj1 = cupy.zeros([natom,3,nao,nocc])
+        if with_k:
+            vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
-            wj2 = cupy.zeros([3,k1-k0])
+            if with_j:
+                wj2 = cupy.zeros([3,k1-k0])
+
             wk2_P__ = cupy.zeros([3,k1-k0,nao,nocc])
             for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
                                                                      ip_type='ip2', omega=omega):
                 # contraction
-                wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1])
-                wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-            rhok_tmp = cupy.asarray(rhok[k0:k1])
-            vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
-            vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
+                if with_j:
+                    wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1])
 
-            vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+                wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+                int3c_blk = None
+            #rhok_tmp = cupy.asarray(rhok[k0:k1])
+            rhok_tmp = copy_array(rhok[k0:k1])
+            if with_j:
+                vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
+                vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
+
+                vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+                vj1_tmp = wj2 = None
             if with_k:
-                #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
-                #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice)
                 rhok0_slice = contract('xpjo,jr->xpro', wk2_P__, orbo)
                 vk1_tmp = -contract('xpro,pir->xpio', rhok0_slice, rhok_tmp)
 
@@ -928,54 +1018,59 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
                 vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo)
 
                 vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
-            wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None
-            rhok_tmp = vk1_tmp = None
+                vk1_tmp = rhok0_oo = rhok0_slice = None
+            rhok_tmp = wk2_P__ = None
+        t0 = log.timer_debug1(f'int3c2e_ip2_vjk on Device {device_id}', *t0)
     return vj1, vk1
 
-def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None):
+def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
+                        with_j=True, with_k=True, omega=None):
     '''
     vj and vk responses (due to int3c2e_ip2) to changes in atomic positions
     '''
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip2_vjk_task, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
+                _int3c2e_ip2_vjk_task, intopt, task_list[device_id],
+                rhoj, rhok, dm0_tag, orbo, with_j=with_j,
+                with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     vj_total = []
     vk_total = []
     for future in futures:
         vj, vk = future.result()
         vj_total.append(vj)
         vk_total.append(vk)
-        
+
     vj = vk = None
-    vj = reduce_to_device(vj_total, inplace=True)
+    if with_j:
+        vj = reduce_to_device(vj_total, inplace=True)
     if with_k:
         vk = reduce_to_device(vk_total, inplace=True)
     return vj, vk
 
-def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None):
+def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None):
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
     aux_ao_loc = intopt.aux_ao_loc
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         ncp_ij = len(intopt.log_qs)
         nocc = orbo.shape[1]
         wj = cupy.zeros([naux,nao,3])
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
             if with_k:
                 wk_tmp = cupy.zeros([k1-k0,nao,nocc,3])
@@ -985,8 +1080,12 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=
                 wj[k0:k1,i0:i1] += contract('xpji,ij->pix', int3c_blk, dm0[i0:i1,j0:j1])
                 if with_k:
                     wk_tmp[:,i0:i1] += contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
+                int3c_blk = None
             if with_k:
-                wk_tmp.get(out=wk[k0:k1])
+                #wk_tmp.get(out=wk[k0:k1])
+                copy_array(wk_tmp, wk[k0:k1])
+            wk_tmp = None
+        t0 = log.timer_debug1(f'int3c2e_ip1_wjk on Device {device_id}', *t0)
     return wj
 
 def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
@@ -994,12 +1093,11 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     '''
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
+
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
     nocc = orbo.shape[1]
@@ -1012,7 +1110,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip1_wjk_task, intopt, task_list[device_id], 
+                _int3c2e_ip1_wjk_task, intopt, task_list[device_id],
                 dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
     wj_total = []
@@ -1023,7 +1121,12 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     return wj, wk
 
 def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0):
+    aux_ao_loc = intopt.aux_ao_loc
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        cupy.get_default_memory_pool().free_all_blocks()
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
+        ncp_ij = len(intopt.log_qs)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         naux = intopt.auxmol.nao
@@ -1032,24 +1135,29 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi
         wk = None
         if with_k:
             wk = cupy.zeros([naux,nocc,nocc,3])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ip2', omega=omega):
-            wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1])
-            tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
-            if with_k:
-                wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
+        for cp_k in task_list:
+            k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
+            task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
+
+            for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
+                                                                    ip_type='ip2', omega=omega):
+                wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1])
+                if with_k:
+                    tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
+                    wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
+                    tmp = None
+                int3c_blk = None
+        t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0)
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    ncp_ij = len(intopt.log_qs)
-    tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
@@ -1057,205 +1165,20 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
                 _int3c2e_ip2_wjk, intopt, task_list[device_id],
                 dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     wj_total = []
     wk_total = []
     for future in futures:
         wj, wk = future.result()
         wj_total.append(wj)
         wk_total.append(wk)
-        
+
     wj = wk = None
     wj = reduce_to_device(wj_total, inplace=True)
     if with_k:
         wk = reduce_to_device(wk_total, inplace=True)
     return wj, wk
 
-def _int3c2e_ipip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo,
-                       device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ipip1', omega=omega):
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,3,3])
-        if with_k:
-            hk = hk.reshape([nao,3,3])
-    return hj, hk
-
-def _int3c2e_ipvip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                        device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,nao,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,nao,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ipvip1', omega=omega):
-            tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,nao,3,3])
-        if with_k:
-            hk = hk.reshape([nao,nao,3,3])
-    return hj, hk
-
-def _int3c2e_ip1ip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                        device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        naux = rhok.shape[0]
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,naux,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,naux,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ip1ip2', omega=omega):
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,naux,3,3])
-        if with_k:
-            hk = hk.reshape([nao,naux,3,3])
-    return hj, hk
-
-def _int3c2e_ipip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                       device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        naux = rhok.shape[0]
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        hj = cupy.zeros([naux,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([naux,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, 
-                                                                ip_type='ipip2', omega=omega):
-            tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
-                rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
-                hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
-        hj = hj.reshape([naux,3,3])
-        if with_k:
-            hk = hk.reshape([naux,3,3])
-    return hj, hk
-
-def get_int3c2e_hjk(intopt, task_type, rhoj, rhok, dm0_tag, with_k=True, omega=None):
-    if task_type == 'ipip1':  task_fn = _int3c2e_ipip1_hjk
-    if task_type == 'ipip2':  task_fn = _int3c2e_ipip2_hjk
-    if task_type == 'ip1ip2': task_fn = _int3c2e_ip1ip2_hjk
-    if task_type == 'ipvip1': task_fn = _int3c2e_ipvip1_hjk
-
-    orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
-    futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    ncp_ij = len(intopt.log_qs)
-    tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
-    cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
-            future = executor.submit(
-                task_fn, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
-            futures.append(future)
-    
-    hj_total = []
-    hk_total = []
-    for future in futures:
-        hj, hk = future.result()
-        hj_total.append(hj)
-        hk_total.append(hk)
-        
-    hj = hk = None
-    hj = reduce_to_device(hj_total, inplace=True)
-    if with_k:
-        hk = reduce_to_device(hk_total, inplace=True)
-    return hj, hk
-
-def get_hess_nuc_elec(mol, dm):
-    '''
-    calculate int1e_ipiprinv contribution
-    '''
-    coords = mol.atom_coords()
-    charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
-
-    fakemol = gto.fakemol_for_charges(coords)
-    fakemol.output = mol.output
-    fakemol.verbose = mol.verbose
-    fakemol.stdout = mol.stdout
-    intopt = VHFOpt(mol, fakemol, 'int2e')
-    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
-    dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
-
-    natm = mol.natm
-    nao = mol.nao
-    hcore_diag = cupy.zeros([9,natm])
-    hcore_aa = cupy.zeros([9,natm,nao])
-    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
-        haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
-        hcore_aa[:,k0:k1,i0:i1] += haa
-        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
-
-    hcore_ab = cupy.zeros([9,natm,nao])
-    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'):
-        hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
-        hcore_ab[:,k0:k1,i0:i1] += hab
-        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
-
-    hcore_diag = contract('xp,p->xp', hcore_diag, charges)
-    hcore_aa = contract('xpj,p->xpj', hcore_aa, charges)
-    hcore_ab = contract('xpj,p->xpj', hcore_ab, charges)
-
-    aoslices = mol.aoslice_by_atom()
-    ao2atom = get_ao2atom(intopt, aoslices)
-
-    hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm])
-    hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm])
-    hcore = hcore_aa + hcore_aa.transpose([1,0,3,2])
-    hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2])
-    hcore_diag = hcore_diag.reshape([3,3,natm])
-    idx = np.arange(natm)
-    for x in range(3):
-        for y in range(3):
-            hcore[x,y,idx,idx] += hcore_diag[x,y]
-    return hcore
-
 def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, stream=None):
     '''
     Generate int3c2e_ip slice along k, full dimension in ij
@@ -1414,15 +1337,6 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
     intopt = VHFOpt(mol, auxmol, 'int2e')
     intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
-    lmax = mol._bas[:gto.ANG_OF].max()
-    aux_lmax = auxmol._bas[:gto.ANG_OF].max()
-    nroots = (lmax + aux_lmax + order)//2 + 1
-    if nroots > NROOT_ON_GPU:
-        from pyscf.gto.moleintor import getints, make_cintopt
-        pmol = intopt._tot_mol
-        intor = pmol._add_suffix('int3c2e_' + ip_type)
-        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-
     nao_cart = intopt._sorted_mol.nao
     naux_cart = intopt._sorted_auxmol.nao
     norb_cart = nao_cart + naux_cart + 1
@@ -1472,6 +1386,11 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
                 if err != 0:
                     raise RuntimeError("int3c2e failed\n")
             else:
+                from pyscf.gto.moleintor import getints, make_cintopt
+                pmol = intopt._tot_mol
+                intor = pmol._add_suffix('int3c2e_' + ip_type)
+                opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+
                 # TODO: sph2cart in CPU?
                 ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
                 jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
@@ -1562,7 +1481,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
     nbins = 1
     bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
     bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32)
-    
+
     cart_ao_loc = intopt.cart_ao_loc
     cart_aux_loc = intopt.cart_aux_loc
     i0, i1 = cart_ao_loc[cpi], cart_ao_loc[cpi+1]
@@ -1604,11 +1523,11 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
 
     if err != 0:
         raise RuntimeError('GINT_fill_int2e failed')
-    
+
     # move this operation to j2c?
     if lk > 1 and intopt.auxmol.cart == 0:
         int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out)
-    
+
     stream.synchronize()
 
     return int3c_blk
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index 8e254c67..266cef29 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -135,7 +135,7 @@ def test_hessian_rhf(self, disp=None):
         h = hobj.kernel()
         _check_rhf_hessian(mf, h, ix=0, iy=0)
         _check_rhf_hessian(mf, h, ix=0, iy=1)
-
+    
     def test_hessian_lda(self, disp=None):
         print('-----testing DF LDA Hessian----')
         mf = _make_rks(mol_sph, 'LDA')
@@ -239,7 +239,6 @@ def test_hessian_rks_D3(self):
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
         h = hobj.kernel()
-        print(np.linalg.norm(h))
         _check_dft_hessian(mf, h, ix=0,iy=0)
 
     def test_hessian_rks_D4(self):
diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py
index e724015a..c2f3caa9 100644
--- a/gpu4pyscf/df/tests/test_df_rhf.py
+++ b/gpu4pyscf/df/tests/test_df_rhf.py
@@ -13,12 +13,17 @@
 # limitations under the License.
 
 import unittest
+import pickle
 import numpy as np
 import pyscf
 from pyscf import scf as cpu_scf
 from pyscf.df import df_jk as cpu_df_jk
 from gpu4pyscf.df import df_jk as gpu_df_jk
 from gpu4pyscf import scf as gpu_scf
+try:
+    import cloudpickle
+except ImportError:
+    cloudpickle = None
 
 atom = '''
 O       0.0000000000    -0.0000000000     0.1174000000
@@ -48,12 +53,17 @@ class KnownValues(unittest.TestCase):
     '''
     def test_rhf(self):
         print('------- RHF -----------------')
-        mf = gpu_scf.RHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit')
+        mf = mol_sph.RHF().density_fit(auxbasis='def2-tzvpp-jkfit').to_gpu()
         e_tot = mf.kernel()
         e_qchem = -76.0624582299
         print(f'diff from qchem {e_tot - e_qchem}')
         assert np.abs(e_tot - e_qchem) < 1e-5
 
+        # Round-trip through cloudpickle/pickle (skipped without cloudpickle); e_tot must survive
+        if cloudpickle is not None:
+            mf1 = pickle.loads(cloudpickle.dumps(mf))
+            assert mf1.e_tot == e_tot
+
     def test_cart(self):
         print('------- RHF Cart -----------------')
         mf = gpu_scf.RHF(mol_cart).density_fit(auxbasis='def2-tzvpp-jkfit')
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 70186a5a..17498c7d 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -32,11 +32,11 @@
 
 LMAX_ON_GPU = 6
 BAS_ALIGNED = 1
-GRID_BLKSIZE = 32
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
 AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
 AO_THRESHOLD = 1e-10
+GB = 1024*1024*1024
 
 # Should we release the cupy cache?
 FREE_CUPY_CACHE = False
@@ -273,26 +273,23 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0,
     na = mo1.shape[0]
     if xctype == 'LDA' or xctype == 'HF':
         c0 = mo0.T.dot(ao)
-        t1 = log.timer_debug2('eval occ_coeff', *t0)
-        c_0 = contract('aio,ig->aog', mo1, ao)
         rho = cupy.empty([na,ngrids])
         for i in range(na):
-            rho[i] = _contract_rho(c0, c_0[i])
+            c_0 = contract('io,ig->og', mo1[i], ao)
+            rho[i] = _contract_rho(c0, c_0)
     elif xctype in ('GGA', 'NLC'):
         c0 = contract('nig,io->nog', ao, mo0)
-        t1 = log.timer_debug2('eval occ_coeff', *t0)
-        c_0 = contract('nig,aio->anog', ao, mo1)
-        t1 = log.timer_debug2('ao * cpos', *t1)
         rho = cupy.empty([na, 4, ngrids])
         for i in range(na):
-            _contract_rho_gga(c0, c_0[i], rho=rho[i])
+            c_0 = contract('nig,io->nog', ao, mo1[i])
+            _contract_rho_gga(c0, c_0, rho=rho[i])
     else: # meta-GGA
         assert not with_lapl
         rho = cupy.empty((na,5,ngrids))
         c0 = contract('nig,io->nog', ao, mo0)
-        c_0 = contract('nig,aio->anog', ao, mo1)
         for i in range(na):
-            _contract_rho_mgga(c0, c_0[i], rho=rho[i])
+            c_0 = contract('nig,io->nog', ao, mo1[i])
+            _contract_rho_mgga(c0, c_0, rho=rho[i])
     if hermi:
         # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao
         rho *= 2.
@@ -417,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         weights = cupy.empty([ngrids_local])
         if xctype == 'LDA':
@@ -428,7 +427,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             rho_tot = cupy.empty([nset,4,ngrids_local])
         else:
             rho_tot = cupy.empty([nset,5,ngrids_local])
-
+        
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                      max_memory=None,
@@ -436,8 +435,10 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             p1 = p0 + weight.size
             weights[p0:p1] = weight
             for i in range(nset):
+                # If AO is sparse enough, use density matrix to calculate rho
                 if mo_coeff is None:
-                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
+                    dms_mask = dms[i][idx[:,None],idx]
+                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
                                                 xctype=xctype, hermi=hermi, with_lapl=with_lapl)
                 else:
                     assert hermi == 1
@@ -446,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                                                 None, xctype, with_lapl)
             p0 = p1
         t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
-
+        
         # libxc calls are still running on default stream
         nelec = cupy.zeros(nset)
         excsum = cupy.zeros(nset)
@@ -817,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                      max_memory=None,
@@ -1019,13 +1023,16 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         p0 = p1 = grid_start
         t1 = t0 = log.init_timer()
         for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                       max_memory=None,
+                                                       max_memory=None, blksize=None,
                                                        grid_range=(grid_start, grid_end)):
             p0, p1 = p1, p1+len(weights)
             # precompute molecular orbitals
@@ -1133,6 +1140,105 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
     return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc,
                       max_memory=max_memory, verbose=verbose)
 
+def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
+                     verbose=None, hermi=1, device_id=0):
+    '''Contract fxc with first-order densities on this device's grid slice.
+    dms, mo1, occ_coeff are (alpha, beta) pairs; mo1/occ_coeff may be None.
+    Returns (vmata, vmatb), each of shape (nset, nao, nao).
+    '''
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        if dms is not None:
+            dma, dmb = dms
+            dma = cupy.asarray(dma)
+            dmb = cupy.asarray(dmb)
+        if mo1 is not None:
+            mo1a, mo1b = mo1
+            mo1a = cupy.asarray(mo1a)
+            mo1b = cupy.asarray(mo1b)
+        if occ_coeff is not None:
+            occ_coeff_a, occ_coeff_b = occ_coeff
+            occ_coeff_a = cupy.asarray(occ_coeff_a)
+            occ_coeff_b = cupy.asarray(occ_coeff_b)
+
+        if fxc is not None: fxc = cupy.asarray(fxc)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
+        xctype = ni._xc_type(xc_code)
+        opt = getattr(ni, 'gdftopt', None)
+
+        _sorted_mol = opt._sorted_mol  # caller passes dms/mo1/occ_coeff in sorted-AO order
+        nao = mol.nao
+        nset = len(dma)
+        vmata = cupy.zeros((nset, nao, nao))
+        vmatb = cupy.zeros((nset, nao, nao))
+
+        ao_deriv = 0 if xctype == 'LDA' else 1
+
+        ngrids_glob = grids.coords.shape[0]
+        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
+
+        p0 = p1 = grid_start  # fxc is sliced with global grid indices
+        t0 = log.init_timer()  # whole-task timer; per-block timing uses t1
+        for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+                                                  max_memory=None,
+                                                  grid_range=(grid_start, grid_end)):
+            t1 = log.init_timer()
+            p0, p1 = p1, p1+len(weights)
+            # precompute fxc_w: fxc on this block, scaled by the block weights
+            fxc_w = fxc[:,:,:,:,p0:p1] * weights
+
+            # precompute molecular orbitals
+            if occ_coeff is not None:
+                occ_coeff_a_mask = occ_coeff_a[mask]
+                occ_coeff_b_mask = occ_coeff_b[mask]
+                rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask],
+                                xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0)
+                rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask],
+                                xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0)
+            else: # slow version
+                rho1a = []
+                rho1b = []
+                for i in range(nset):
+                    rho_tmp = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask],
+                                       xctype=xctype, hermi=hermi)
+                    rho1a.append(rho_tmp.reshape(-1,p1-p0))
+                    rho_tmp = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask],
+                                       xctype=xctype, hermi=hermi)
+                    rho1b.append(rho_tmp.reshape(-1,p1-p0))
+            t1 = log.timer_debug1('rho', *t1)
+
+            for i in range(nset):
+                wv_a = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,0])
+                wv_a+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,0])
+                wv_b = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,1])
+                wv_b+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,1])
+                if xctype == 'LDA':
+                    va = ao.dot(_scale_ao(ao, wv_a[0]).T)
+                    vb = ao.dot(_scale_ao(ao, wv_b[0]).T)
+                elif xctype == 'GGA':
+                    wv_a[0] *= .5 # for transpose_sum at the end
+                    wv_b[0] *= .5
+                    va = ao[0].dot(_scale_ao(ao, wv_a).T)
+                    vb = ao[0].dot(_scale_ao(ao, wv_b).T)
+                elif xctype == 'NLC':
+                    raise NotImplementedError('NLC')
+                else:
+                    wv_a[[0,4]] *= .5 # for transpose_sum at the end
+                    wv_b[[0,4]] *= .5
+                    va = ao[0].dot(_scale_ao(ao[:4], wv_a[:4]).T)
+                    vb = ao[0].dot(_scale_ao(ao[:4], wv_b[:4]).T)
+                    va += _tau_dot(ao, ao, wv_a[4])
+                    vb += _tau_dot(ao, ao, wv_b[4])
+                add_sparse(vmata[i], va, mask)
+                add_sparse(vmatb[i], vb, mask)
+            t1 = log.timer_debug2('integration', *t1)
+        t0 = log.timer_debug1('vxc', *t0)
+    return vmata, vmatb
 
 def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
@@ -1144,13 +1250,13 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
     if opt is None or mol not in [opt.mol, opt._sorted_mol]:
         ni.build(mol, grids.coords)
         opt = ni.gdftopt
-    mol = None
-    _sorted_mol = opt._sorted_mol
+    
     nao, nao0 = opt.coeff.shape
     dma, dmb = dms
     dm_shape = dma.shape
     # AO basis -> gdftopt AO basis
     with_mocc = hasattr(dms, 'mo1')
+    mo1 = occ_coeff = None
     if with_mocc:
         mo1a, mo1b = dms.mo1
         occ_coeffa, occ_coeffb = dms.occ_coeff
@@ -1158,70 +1264,32 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
         mo1b = opt.sort_orbitals(mo1b, axis=[1])
         occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0])
         occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0])
-
+        occ_coeff = (occ_coeff_a, occ_coeff_b)
+        mo1 = (mo1a, mo1b)
     dma = cupy.asarray(dma).reshape(-1,nao0,nao0)
     dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0)
     dma = opt.sort_orbitals(dma, axis=[1,2])
     dmb = opt.sort_orbitals(dmb, axis=[1,2])
 
-    nset = len(dma)
-    vmata = cupy.zeros((nset, nao, nao))
-    vmatb = cupy.zeros((nset, nao, nao))
-
-    if xctype == 'LDA':
-        ao_deriv = 0
-        nvar = 1
-    elif xctype == 'GGA':
-        ao_deriv = 1
-        nvar = 4
-    else:
-        ao_deriv = 1
-        nvar = 5
-    p0 = p1 = 0
-    for ao, mask, weights, coords in ni.block_loop(
-            _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory):
-        t0 = log.init_timer()
-        p0, p1 = p1, p1+len(weights)
-        # precompute fxc_w
-        fxc_w = fxc[:,:,:,:,p0:p1] * weights
-
-        # precompute molecular orbitals
-        if with_mocc:
-            occ_coeff_a_mask = occ_coeff_a[mask]
-            occ_coeff_b_mask = occ_coeff_b[mask]
-            rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask],
-                              xctype=xctype, hermi=hermi)
-            rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask],
-                              xctype=xctype, hermi=hermi)
-            rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0)
-        else: # slow version
-            rho1 = cupy.empty((2, nset, nvar, p1-p0))
-            for i in range(nset):
-                rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask],
-                                     xctype=xctype, hermi=hermi)
-                rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask],
-                                     xctype=xctype, hermi=hermi)
-        t0 = log.timer_debug1('rho', *t0)
+    futures = []
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _nr_uks_fxc_task,
+                ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff,
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
+            futures.append(future)
+    vmata_dist = [] 
+    vmatb_dist = []
+    for future in futures:
+        vmata, vmatb = future.result()
+        vmata_dist.append(vmata)
+        vmatb_dist.append(vmatb)
+    
+    vmata = reduce_to_device(vmata_dist, inplace=True)
+    vmatb = reduce_to_device(vmatb_dist, inplace=True)
 
-        for i in range(nset):
-            wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w)
-            if xctype == 'LDA':
-                va = ao.dot(_scale_ao(ao, wv[0,0]).T)
-                vb = ao.dot(_scale_ao(ao, wv[1,0]).T)
-            elif xctype == 'GGA':
-                wv[:,0] *= .5 # for transpose_sum at the end
-                va = ao[0].dot(_scale_ao(ao, wv[0]).T)
-                vb = ao[0].dot(_scale_ao(ao, wv[1]).T)
-            elif xctype == 'NLC':
-                raise NotImplementedError('NLC')
-            else:
-                wv[:,[0,4]] *= .5 # for transpose_sum at the end
-                va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T)
-                vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T)
-                va += _tau_dot(ao, ao, wv[0,4])
-                vb += _tau_dot(ao, ao, wv[1,4])
-            add_sparse(vmata[i], va, mask)
-            add_sparse(vmatb[i], vb, mask)
     vmata = opt.unsort_orbitals(vmata, axis=[1,2])
     vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
     if xctype != 'LDA':
@@ -1578,7 +1646,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     if blksize is None:
-        #cupy.get_default_memory_pool().free_all_blocks()
+        # By default, a memory space of [comp,nao,blksize] is reserved
         mem_avail = get_avail_mem()
         blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
@@ -1737,6 +1805,9 @@ class NumInt(lib.StreamObject, LibXCMixin):
     screen_index = None
     xcfuns       = None        # can be multiple xc functionals
 
+    __getstate__, __setstate__ = lib.generate_pickle_methods(
+        excludes=('gdftopt',))
+
     def build(self, mol, coords):
         self.gdftopt = _GDFTOpt.from_mol(mol)
         self.grid_blksize = None
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index d512caa5..496abfa3 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 # modified by Xiaojie Wu (wxj6000@gmail.com)
+
 import cupy
 from pyscf.dft import rks
-
 from gpu4pyscf.lib import logger
 from gpu4pyscf.dft import numint, gen_grid
 from gpu4pyscf.scf import hf
@@ -257,6 +257,7 @@ def __init__(self, xc='LDA,VWN'):
 ##################################################
 # don't modify the following attributes, they are not input options
         self._numint = numint.NumInt()
+
     @property
     def omega(self):
         return self._numint.omega
@@ -291,8 +292,13 @@ def reset(self, mol=None):
         hf.SCF.reset(self, mol)
         self.grids.reset(mol)
         self.nlcgrids.reset(mol)
-        self.cphf_grids.reset(mol)
         self._numint.reset()
+        # The cphf_grids attribute is not available in the PySCF CPU version.
+        # In PySCF's to_gpu() function, this attribute is not properly
+        # initialized. mol of the KS object must be used for initialization.
+        if mol is None:
+            mol = self.mol
+        self.cphf_grids.reset(mol)
         return self
 
     def nuc_grad_method(self):
diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py
index 99df03ce..c13dba13 100644
--- a/gpu4pyscf/dft/tests/test_libxc.py
+++ b/gpu4pyscf/dft/tests/test_libxc.py
@@ -92,7 +92,7 @@ def test_u_LDA(self):
 
     def test_u_GGA(self):
         # large errors found in B88 for the spin polarized case
-        self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-3)
+        self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-2)
         self._check_xc('GGA_X_B88', spin=1, fxc_tol=1e-1)
         self._check_xc('GGA_C_PBE', spin=1, fxc_tol=1e-4)
 
diff --git a/gpu4pyscf/dft/tests/test_rks.py b/gpu4pyscf/dft/tests/test_rks.py
index d1bf278d..4bae05ca 100644
--- a/gpu4pyscf/dft/tests/test_rks.py
+++ b/gpu4pyscf/dft/tests/test_rks.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pickle
 import numpy as np
 import unittest
 import pyscf
@@ -64,11 +65,18 @@ class KnownValues(unittest.TestCase):
     '''
     def test_rks_lda(self):
         print('------- LDA ----------------')
-        e_tot = run_dft("LDA, vwn5", mol_sph)
+        mf = mol_sph.RKS(xc='LDA,vwn5').to_gpu()
+        mf.grids.level = grids_level
+        mf.nlcgrids.level = nlcgrids_level
+        e_tot = mf.kernel()
         e_ref = -75.9046410402
         print('| CPU - GPU |:', e_tot - e_ref)
         assert np.abs(e_tot - e_ref) < 1e-5
 
+        # Serialization check: a pickle round trip of mf must preserve e_tot exactly
+        mf1 = pickle.loads(pickle.dumps(mf))
+        assert mf1.e_tot == e_tot
+
     def test_rks_pbe(self):
         print('------- PBE ----------------')
         e_tot = run_dft('PBE', mol_sph)
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index c3390e95..dd374cc3 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -30,6 +30,7 @@
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
 from gpu4pyscf.lib import logger
+from gpu4pyscf.scf import jk
 from gpu4pyscf.scf.jk import (
     LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
     _make_tril_tile_mappings, _nearest_power2)
@@ -79,43 +80,41 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
     return ejk, kern_counts, timing_counter
 
 def _jk_energy_per_atom(mol, dm, vhfopt=None,
@@ -126,7 +125,11 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
     log = logger.new_logger(mol, verbose)
     cput0 = log.init_timer()
     if vhfopt is None:
-        vhfopt = _VHFOpt(mol).build()
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1: 
+            group_size = jk.GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
@@ -145,7 +148,12 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
     assert uniq_l.max() <= LMAX
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
index 36b45374..8e6ce88c 100644
--- a/gpu4pyscf/gto/int3c1e.py
+++ b/gpu4pyscf/gto/int3c1e.py
@@ -15,7 +15,7 @@
 import ctypes
 import cupy as cp
 import numpy as np
-
+from pyscf import lib
 from pyscf.scf import _vhf
 from pyscf.gto import ATOM_OF
 from pyscf.lib import c_null_ptr
@@ -161,7 +161,6 @@ def get_n_hermite_density_of_angular_pair(l):
 
     def sort_orbitals(self, mat, axis=[]):
         ''' Transform given axis of a matrix into sorted AO,
-        and transform given auxiliary axis of a matrix into sorted auxiliary AO
         '''
         idx = self._ao_idx
         shape_ones = (1,) * mat.ndim
@@ -176,6 +175,24 @@ def sort_orbitals(self, mat, axis=[]):
             fancy_index.append(indices.reshape(idx_shape))
         return mat[tuple(fancy_index)]
 
+    def unsort_orbitals(self, sorted_mat, axis=[]):
+        ''' Counterpart of sort_orbitals: scatter the axes listed in `axis` of sorted_mat through self._ao_idx and return the result as a new array.
+        '''
+        idx = self._ao_idx
+        shape_ones = (1,) * sorted_mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(sorted_mat.shape):
+            if dim in axis:
+                assert n == len(idx)
+                indices = idx
+            else:
+                indices = np.arange(n)  # axes not listed keep their order
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]  # length-1 everywhere except this axis, so the indices broadcast
+            fancy_index.append(indices.reshape(idx_shape))
+        mat = cp.empty_like(sorted_mat)  # uninitialized; assumes _ao_idx is a permutation so the scatter fills every entry -- TODO confirm
+        mat[tuple(fancy_index)] = sorted_mat
+        return mat
+
     @property
     def bpcache(self):
         device_id = cp.cuda.Device().id
@@ -205,17 +222,17 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
                         "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory")
     ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
 
-    int3c_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * np.array([1.0]).nbytes)
-    int3c = np.frombuffer(int3c_pinned_memory_pool, np.float64, ngrids * nao * nao).reshape([ngrids, nao, nao], order='C')
+    buf_size = ngrids * nao * nao
+    int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
+    int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C')
     # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time
 
     grids = cp.asarray(grids, order='C')
     if charge_exponents is not None:
         charge_exponents = cp.asarray(charge_exponents, order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
-        int3c_grid_slice = cp.zeros([ngrids_of_split, nao, nao], order='C')
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+        int3c_grid_slice = cp.zeros([p1-p0, nao, nao], order='C')
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
             cpj = intopt.cp_jdx[cp_ij_id]
@@ -237,18 +254,19 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
             ao_offsets = np.array([i0, j0], dtype=np.int32)
             strides = np.array([ni, ni*nj], dtype=np.int32)
 
-            int3c_angular_slice = cp.zeros([ngrids_of_split, j1-j0, i1-i0], order='C')
+            int3c_angular_slice = cp.zeros([p1-p0, j1-j0, i1-i0], order='C')
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
-
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
+            grids_slice = grids[p0:p1]
             err = libgint.GINTfill_int3c1e(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
                 strides.ctypes.data_as(ctypes.c_void_p),
                 ao_offsets.ctypes.data_as(ctypes.c_void_p),
@@ -270,11 +288,11 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
 
         row, col = np.tril_indices(nao)
         int3c_grid_slice[:, row, col] = int3c_grid_slice[:, col, row]
-        ao_idx = np.argsort(intopt._ao_idx)
-        grid_idx = np.arange(ngrids_of_split)
-        int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)]
-
-        int3c_grid_slice.get(out = int3c[i_grid_split : i_grid_split + ngrids_of_split, :, :])
+        #ao_idx = np.argsort(intopt._ao_idx)
+        #grid_idx = np.arange(p1-p0)
+        #int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)]
+        int3c_grid_slice = intopt.unsort_orbitals(int3c_grid_slice, axis=[1,2])
+        int3c_grid_slice.get(out = int3c[p0:p1, :, :])
 
     return int3c
 
@@ -355,9 +373,9 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
 
     row, col = np.tril_indices(nao)
     int1e_charge_contracted[row, col] = int1e_charge_contracted[col, row]
-    ao_idx = np.argsort(intopt._ao_idx)
-    int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)]
-
+    #ao_idx = np.argsort(intopt._ao_idx)
+    #int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)]
+    int1e_charge_contracted = intopt.unsort_orbitals(int1e_charge_contracted, axis=[0,1])
     return int1e_charge_contracted
 
 def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
@@ -385,7 +403,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
     bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
 
     n_total_hermite_density = intopt.density_offset[-1]
-    dm_pair_ordered = np.zeros(n_total_hermite_density)
+    dm_pair_ordered = np.empty(n_total_hermite_density)
     libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
                                               dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
                                               ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
@@ -413,8 +431,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
 
     int3c_density_contracted = cp.zeros(ngrids)
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             stream = cp.cuda.get_current_stream()
 
@@ -425,21 +442,22 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
             n_pair_sum_per_thread = nao_cart
-
+            grids_slice = grids[p0:p1, :]
             err = libgint.GINTfill_int3c1e_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
                 intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
-                ctypes.cast(int3c_density_contracted[i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p),
+                ctypes.cast(int3c_density_contracted[p0:p1].data.ptr, ctypes.c_void_p),
                 bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(nbins),
                 ctypes.c_int(cp_ij_id),
diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py
index cc53feab..8b47adce 100644
--- a/gpu4pyscf/gto/int3c1e_ip.py
+++ b/gpu4pyscf/gto/int3c1e_ip.py
@@ -15,7 +15,7 @@
 import ctypes
 import cupy as cp
 import numpy as np
-
+from pyscf import lib
 from pyscf.gto import ATOM_OF
 from pyscf.lib import c_null_ptr
 from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem
@@ -40,19 +40,19 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
                         "the 3 center integral first derivative, "
                         "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory")
     ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
-
-    int3cip1_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes)
-    int3c_ip1 = np.frombuffer(int3cip1_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C')
-    int3cip2_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes)
-    int3c_ip2 = np.frombuffer(int3cip2_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C')
+    
+    buf_size = ngrids * nao * nao * 3
+    int3cip1_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
+    int3c_ip1 = np.frombuffer(int3cip1_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C')
+    int3cip2_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
+    int3c_ip2 = np.frombuffer(int3cip2_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C')
 
     grids = cp.asarray(grids, order='C')
     if charge_exponents is not None:
         charge_exponents = cp.asarray(charge_exponents, order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
-        int3c_grid_slice = cp.zeros([6, ngrids_of_split, nao, nao], order='C')
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+        int3c_grid_slice = cp.zeros([6, p1-p0, nao, nao], order='C')
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
             cpj = intopt.cp_jdx[cp_ij_id]
@@ -74,18 +74,20 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
             ao_offsets = np.array([i0, j0], dtype=np.int32)
             strides = np.array([ni, ni*nj], dtype=np.int32)
 
-            int3c_angular_slice = cp.zeros([6, ngrids_of_split, j1-j0, i1-i0], order='C')
+            int3c_angular_slice = cp.zeros([6, p1-p0, j1-j0, i1-i0], order='C')
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
 
+            grids_slice = grids[p0:p1, :]
             err = libgint.GINTfill_int3c1e_ip(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
                 strides.ctypes.data_as(ctypes.c_void_p),
                 ao_offsets.ctypes.data_as(ctypes.c_void_p),
@@ -103,20 +105,20 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
                 int3c_angular_slice = cart2sph(int3c_angular_slice, axis=2, ang=lj)
                 int3c_angular_slice = cart2sph(int3c_angular_slice, axis=3, ang=li)
 
-            int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice
+            int3c_grid_slice[:, :, i0:i1, j0:j1] = int3c_angular_slice.transpose(0,1,3,2)
 
         ao_idx = np.argsort(intopt._ao_idx)
-        grid_idx = np.arange(ngrids_of_split)
+        grid_idx = np.arange(p1-p0)
         derivative_idx = np.arange(6)
         int3c_grid_slice = int3c_grid_slice[np.ix_(derivative_idx, grid_idx, ao_idx, ao_idx)]
 
         # Each piece of the following memory is contiguous
-        int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, i_grid_split : i_grid_split + ngrids_of_split, :, :])
+        int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, p0:p1, :, :])
+        int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, p0:p1, :, :])
+        int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, p0:p1, :, :])
+        int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, p0:p1, :, :])
+        int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, p0:p1, :, :])
+        int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, p0:p1, :, :])
 
     return int3c_ip1, int3c_ip2
 
@@ -134,7 +136,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int
     charges = charges.reshape([-1, 1], order='C')
     grids = cp.concatenate([grids, charges], axis=1)
 
-    int1e_charge_contracted = cp.zeros([3, mol.nao, mol.nao], order='C')
+    int1e_charge_contracted = cp.empty([3, mol.nao, mol.nao], order='C')
     for cp_ij_id, _ in enumerate(intopt.log_qs):
         cpi = intopt.cp_idx[cp_ij_id]
         cpj = intopt.cp_jdx[cp_ij_id]
@@ -191,13 +193,68 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int
             int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=lj)
             int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=li)
 
-        int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice
+        int1e_charge_contracted[:, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,2,1)
+
+    return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2])
+
+def get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt):
+    '''Call GINTfill_int3c1e_ip1_density_contracted for each entry of intopt.log_qs with the square AO matrix dm; return a cupy array of shape (natm, 3, ngrids). Raises RuntimeError if a kernel call returns non-zero.'''
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
+
+    ngrids = grids.shape[0]
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    dm = cp.asarray(dm)
+    assert dm.ndim == 2
+    assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao
+
+    dm = intopt.sort_orbitals(dm, [0,1])
+    if not mol.cart:
+        cart2sph_transformation_matrix = intopt.cart2sph
+        # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm
+        dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T
+    dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function
+
+    nao = intopt._sorted_mol.nao
+
+    i_atom_of_each_shell = intopt._sorted_mol._bas[:, ATOM_OF]
+    i_atom_of_each_shell = cp.array(i_atom_of_each_shell, dtype=np.int32)
+
+    ip1_per_atom = cp.zeros([mol.natm, 3, ngrids])  # output buffer handed to every kernel call below
+
+    for cp_ij_id, _ in enumerate(intopt.log_qs):
+        stream = cp.cuda.get_current_stream()
+        log_q_ij = intopt.log_qs[cp_ij_id]
+
+        nbins = 1  # one bin spanning all len(log_q_ij) entries of this group
+        bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+        charge_exponents_pointer = c_null_ptr()  # stays a null pointer when no exponents are given
+        if charge_exponents is not None:
+            charge_exponents_pointer = charge_exponents.data.ptr
 
-    ao_idx = np.argsort(intopt._ao_idx)
-    derivative_idx = np.arange(3)
-    int1e_charge_contracted = int1e_charge_contracted[np.ix_(derivative_idx, ao_idx, ao_idx)]
+        err = libgint.GINTfill_int3c1e_ip1_density_contracted(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+            ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(ip1_per_atom.data.ptr, ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.cast(dm.data.ptr, ctypes.c_void_p),
+            ctypes.cast(i_atom_of_each_shell.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nao),
+            ctypes.c_double(omega))
+
+        if err != 0:
+            raise RuntimeError('GINTfill_int3c1e_ip1_density_contracted failed')
 
-    return int1e_charge_contracted
+    return ip1_per_atom
 
 def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt):
     omega = mol.omega
@@ -228,10 +285,11 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
     bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
 
     n_total_hermite_density = intopt.density_offset[-1]
-    dm_pair_ordered = np.zeros(n_total_hermite_density)
+    dm_pair_ordered = np.empty(n_total_hermite_density)
     libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
                                               dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
-                                              ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
+                                              ctypes.c_int(1), ctypes.c_int(nao_cart), 
+                                              ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
                                               intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
                                               intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
                                               l_ij.ctypes.data_as(ctypes.c_void_p),
@@ -252,8 +310,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
     int3c_density_contracted = cp.zeros([3, ngrids], order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             stream = cp.cuda.get_current_stream()
 
@@ -264,7 +321,9 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
+            grids_slice = grids[p0:p1]
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
@@ -273,12 +332,12 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
             err = libgint.GINTfill_int3c1e_ip2_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
                 intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
-                ctypes.cast(int3c_density_contracted[:, i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p),
+                ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p),
                 bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(nbins),
                 ctypes.c_int(cp_ij_id),
@@ -290,6 +349,82 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
     return int3c_density_contracted
 
+def get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt):
+    '''Add sum_{C in grids of atom A} q_C (mu | d/dC 1/|r-C| | nu) to output[A] in place for every atom A; returns None.'''
+    omega = mol.omega
+    assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented."
+    ngrids = grids.shape[0]
+    grids = cp.asarray(grids, order='C')
+    if charge_exponents is not None:
+        charge_exponents = cp.asarray(charge_exponents, order='C')
+
+    assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+    charges = cp.asarray(charges).astype(np.float64)
+    # Append the charge of every grid point as an extra column of the grid coordinate array.
+    charges = charges.reshape([-1, 1], order='C')
+    grids = cp.concatenate([grids, charges], axis=1)
+    # Atom index of every charge; built assuming gridslice lists consecutive, ordered [start, stop) ranges.
+    n_atom = len(gridslice)
+    i_atom_of_each_charge = [[i_atom] * (gridslice[i_atom][1] - gridslice[i_atom][0]) for i_atom in range(n_atom)]
+    i_atom_of_each_charge = sum(i_atom_of_each_charge, [])
+    i_atom_of_each_charge = cp.array(i_atom_of_each_charge, dtype=np.int32)
+    assert i_atom_of_each_charge.shape[0] == ngrids, "The ranges in gridslice must add up to the number of grids."
+    assert isinstance(output, cp.ndarray)
+    assert output.shape == (n_atom, 3, mol.nao, mol.nao)
+
+    for cp_ij_id, _ in enumerate(intopt.log_qs):
+        cpi = intopt.cp_idx[cp_ij_id]
+        cpj = intopt.cp_jdx[cp_ij_id]
+        li = intopt.angular[cpi]
+        lj = intopt.angular[cpj]
+
+        stream = cp.cuda.get_current_stream()
+
+        log_q_ij = intopt.log_qs[cp_ij_id]
+
+        nbins = 1
+        bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+        i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+        j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+        ni = i1 - i0
+        nj = j1 - j0
+
+        ao_offsets = np.array([i0, j0], dtype=np.int32)
+        strides = np.array([ni, ni*nj], dtype=np.int32)
+
+        charge_exponents_pointer = c_null_ptr()
+        if charge_exponents is not None:
+            charge_exponents_pointer = charge_exponents.data.ptr
+        # Zero-initialised buffer for this shell-pair type, sized with the cart_ao_loc bounds above.
+        int1e_angular_slice = cp.zeros([n_atom, 3, j1-j0, i1-i0], order='C')
+
+        err = libgint.GINTfill_int3c1e_ip2_charge_contracted(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+            ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+            ctypes.c_int(ngrids),
+            ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.cast(i_atom_of_each_charge.data.ptr, ctypes.c_void_p),
+            ctypes.c_double(omega))
+
+        if err != 0:
+            raise RuntimeError('GINTfill_int3c1e_ip2_charge_contracted failed')
+        # Re-read the block bounds from ao_loc (instead of cart_ao_loc) for indexing _ao_idx below.
+        i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+        j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+        if not mol.cart:
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+            int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+        output[np.ix_(range(n_atom), range(3), intopt._ao_idx[i0:i1], intopt._ao_idx[j0:j1])] += int1e_angular_slice.transpose(0,1,3,2)
+
 def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt):
     dm = cp.asarray(dm)
     if dm.ndim == 3:
@@ -302,7 +437,7 @@ def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents,
     assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao
 
     int3c_ip1 = get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
-    int3c_ip1 = cp.einsum('xji,ij->xi', int3c_ip1, dm)
+    int3c_ip1 = cp.einsum('xij,ij->xi', int3c_ip1, dm)
     return int3c_ip1
 
 def get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt):
@@ -319,13 +454,18 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di
     $$\left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
     where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$.
 
-    If charges is not None, the function computes the following contraction:
+    If charges is not None and dm is None, the function computes the following contraction:
     $$\sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
     where $q_C$ is the charge centered at $\vec{C}$.
 
     If charges is not None and dm is not None, the function computes the following contraction:
     $$\sum_\nu^{n_{ao}} D_{\mu\nu} \sum_{C}^{n_{charge}} q_C
         \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
+
+    If dm is not None and charges is None, the function computes the following contraction:
+    $$\sum_{\mu \in \{\text{AO of atom A}\}} \sum_\nu^{n_{ao}} D_{\mu\nu}
+        \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
+    The output dimension is $(n_{atom}, 3, n_{charge})$.
     '''
     assert grids is not None
 
@@ -340,12 +480,14 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di
 
     if dm is None and charges is None:
         return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[0]
-    else:
-        assert charges is not None
+    elif charges is not None:
         if dm is not None:
             return get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt)
         else:
             return get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+    else:
+        assert dm is not None
+        return get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
 def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None):
     r'''
@@ -353,12 +495,16 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di
     $$\left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
     where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$.
 
-    If dm is not None, the function computes the following contraction:
+    If dm is not None and charges is None, the function computes the following contraction:
     $$\sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
 
     If dm is not None and charges is not None, the function computes the following contraction:
     $$q_C \sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
     where $q_C$ is the charge centered at $\vec{C}$.
+
+    If charges is not None and dm is None, the function computes the following contraction:
+    $$\sum_{C}^{n_{charge}} q_C \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
+    Note that this summation should not be performed if the charges originate from different atomic centers.
     '''
     assert grids is not None
 
@@ -373,9 +519,36 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di
 
     if dm is None and charges is None:
         return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[1]
-    else:
-        assert dm is not None
+    elif dm is not None:
         if charges is not None:
             return get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt)
         else:
             return get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
+    else:
+        assert charges is not None
+        output = cp.zeros([1, 3, mol.nao, mol.nao])
+        get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, [[0, grids.shape[0]]], output, intopt)
+        return output.reshape([3, mol.nao, mol.nao])
+
+def int1e_grids_ip2_charge_contracted(mol, grids, charges, gridslice, output, charge_exponents=None, direct_scf_tol=1e-13, intopt=None):
+    r'''
+    Contract the integral derivative with respect to the charge center over the grid charges of each atom:
+    $$\sum_{C \in \{\text{grid attached to atom A}\}} q_C
+        \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$
+    Here $q_C$ is the charge located at $\vec{C}$. The result is added to output, whose dimension is $(n_{atom}, 3, n_{ao}, n_{ao})$.
+    '''
+    # None of the four data arguments is optional.
+    for required in (grids, charges, gridslice, output):
+        assert required is not None
+
+    # Validate a caller-supplied optimizer; otherwise build one without AO symmetry.
+    if intopt is not None:
+        assert isinstance(intopt, VHFOpt), \
+            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+        assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first."
+        assert not intopt.aosym
+    else:
+        intopt = VHFOpt(mol)
+        intopt.build(direct_scf_tol, aosym=False)
+
+    return get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt)
diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py
deleted file mode 100644
index f386aed2..00000000
--- a/gpu4pyscf/gto/moleintor.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-import cupy as cp
-import numpy as np
-
-from gpu4pyscf.gto.int3c1e import VHFOpt, get_int3c1e, get_int3c1e_density_contracted, get_int3c1e_charge_contracted
-from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip, get_int3c1e_ip_contracted
-
-def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None):
-    assert grids is not None
-
-    if intopt is None:
-        intopt = VHFOpt(mol)
-        aosym = False if 'ip' in intor else True
-        intopt.build(direct_scf_tol, aosym=aosym)
-    else:
-        assert isinstance(intopt, VHFOpt), \
-            f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
-        assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first."
-
-    if intor == 'int1e_grids':
-        assert dm is None or charges is None, \
-            "Are you sure you want to contract the one electron integrals with both charge and density? " + \
-            "If so, pass in density, obtain the result with n_charge and contract with the charges yourself."
-        assert intopt.aosym
-
-        if dm is None and charges is None:
-            return get_int3c1e(mol, grids, charge_exponents, intopt)
-        elif dm is not None:
-            return get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt)
-        elif charges is not None:
-            return get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
-        else:
-            raise ValueError(f"Logic error in {__file__} {__name__}")
-    elif intor == 'int1e_grids_ip':
-        assert not intopt.aosym
-
-        if dm is None and charges is None:
-            return get_int3c1e_ip(mol, grids, charge_exponents, intopt)
-        else:
-            assert dm is not None
-            assert charges is not None
-            return get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt)
-    else:
-        raise NotImplementedError(f"GPU intor {intor} is not implemented.")
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
index e77f30ec..56f87e4b 100644
--- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
+++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
@@ -18,7 +18,7 @@
 import cupy as cp
 import pyscf
 from pyscf import lib, gto, df
-from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2
+from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2, int1e_grids_ip2_charge_contracted
 
 def setUpModule():
     global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold
@@ -74,8 +74,8 @@ def test_int1e_grids_ip_full_tensor_cart(self):
 
         test_int1e_dA = int1e_grids_ip1(mol, grid_points)
         test_int1e_dC = int1e_grids_ip2(mol, grid_points)
-        test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1)
-        test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1)
+        test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1)
+        test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1)
 
         np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
@@ -94,8 +94,8 @@ def test_int1e_grids_ip_full_tensor_sph(self):
 
         test_int1e_dA = int1e_grids_ip1(mol, grid_points)
         test_int1e_dC = int1e_grids_ip2(mol, grid_points)
-        test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1)
-        test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1)
+        test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1)
+        test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1)
 
         np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
@@ -117,8 +117,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge(self):
 
         test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents)
         test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents)
-        test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1)
-        test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1)
+        test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1)
+        test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1)
 
         np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
@@ -141,8 +141,8 @@ def test_int1e_grids_ip_full_tensor_omega(self):
 
         test_int1e_dA = int1e_grids_ip1(mol, grid_points)
         test_int1e_dC = int1e_grids_ip2(mol, grid_points)
-        test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1)
-        test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1)
+        test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1)
+        test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1)
 
         np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
@@ -168,8 +168,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge_omega(self):
 
         test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents)
         test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents)
-        test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1)
-        test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1)
+        test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1)
+        test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1)
 
         np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
@@ -314,6 +314,55 @@ def test_int1e_grids_ip_contracted_gaussian_charge_omega(self):
         cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
         cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
 
+    def test_int1e_grids_ip2_charge_contracted(self):
+        # Reference: contract PySCF's int3c2e_ip2 with random charges, one consecutive grid range per atom.
+        np.random.seed(12346)
+        n_charge = grid_points.shape[0]
+        charges = np.random.uniform(-2.0, 2.0, n_charge)
+        mol = mol_sph
+        natm, nao = mol.natm, mol.nao
+
+        intor_name = mol._add_suffix('int3c2e_ip2')
+        point_charge_mol = gto.fakemol_for_charges(grid_points)
+        cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, intor_name)
+        int3c = df.incore.aux_e2(mol, point_charge_mol, intor=intor_name, aosym='s1', cintopt=cintopt)
+
+        # Split the grid points into natm nearly equal consecutive ranges.
+        gridslice = [[n_charge * i // natm, n_charge * (i + 1) // natm] for i in range(natm)]
+        ref_int1e_dC = np.zeros([natm, 3, nao, nao])
+        for i_atom, (g0, g1) in enumerate(gridslice):
+            ref_int1e_dC[i_atom] += np.einsum('dijq,q->dij', int3c[..., g0:g1], charges[g0:g1])
+
+        # The GPU routine accumulates into the preallocated output array.
+        test_int1e_dC = cp.zeros([natm, 3, nao, nao])
+        int1e_grids_ip2_charge_contracted(mol, grid_points, charges, gridslice, test_int1e_dC)
+
+        cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold)
+
+    def test_int1e_grids_ip1_density_contracted(self):
+        # Reference: contract PySCF's int3c2e_ip1 with a random density, then sum over the AOs of each atom.
+        mol = mol_sph
+        np.random.seed(12347)
+        dm = np.random.uniform(-2.0, 2.0, (mol.nao, mol.nao))
+
+        intor_name = mol._add_suffix('int3c2e_ip1')
+        point_charge_mol = gto.fakemol_for_charges(grid_points)
+        cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, intor_name)
+        int3c = df.incore.aux_e2(mol, point_charge_mol, intor=intor_name, aosym='s1', cintopt=cintopt)
+
+        # Contract with the density while keeping the first AO index as the last axis.
+        dm_contracted = np.einsum('dijq,ij->dqi', int3c, dm)
+
+        ao_ranges = np.array(mol.aoslice_by_atom())[:, 2:]
+        ref_int1e_dA = np.empty([mol.natm, 3, grid_points.shape[0]])
+        for i_atom, (p0, p1) in enumerate(ao_ranges):
+            ref_int1e_dA[i_atom] = np.einsum('dqi->dq', dm_contracted[:, :, p0:p1])
+
+        # Passing dm without charges selects the density-contracted code path.
+        test_int1e_dA = int1e_grids_ip1(mol, grid_points, dm = dm)
+
+        cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
+
 if __name__ == "__main__":
     print("Full Tests for One Electron Coulomb Integrals")
     unittest.main()
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
new file mode 100644
index 00000000..65edff6b
--- /dev/null
+++ b/gpu4pyscf/hessian/jk.py
@@ -0,0 +1,305 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+'''
+Compute J/K matrices for Hessian
+'''
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor
+
+from pyscf import lib
+from pyscf.scf import _vhf
+from pyscf import __config__
+from gpu4pyscf.scf import jk
+from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH,
+                              _VHFOpt, LMAX, init_constant, libvhf_rys)
+from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum,
+                                       reduce_to_device, contract)
+
+from gpu4pyscf.__config__ import props as gpu_specs
+from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.lib import logger
+
+
+def _ao2mo(v_ao, mocc, mo_coeff):
+    half_transformed = contract('nij,jo->nio', v_ao, mocc)  # second AO index -> columns of mocc
+    return contract('nio,ip->npo', half_transformed, mo_coeff)  # first AO index -> columns of mo_coeff
+
+def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
+             device_id=0, with_j=True, with_k=True, verbose=0):  # per-device worker submitted by get_jk; returns (vj, vk, kern_counts, timing_counter)
+    nao, _ = vhfopt.coeff.shape
+    uniq_l_ctr = vhfopt.uniq_l_ctr
+    uniq_l = uniq_l_ctr[:,0]
+    l_ctr_bas_loc = vhfopt.l_ctr_offsets
+    l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+    kern = libvhf_rys.RYS_build_jk
+
+    timing_counter = Counter()
+    kern_counts = 0
+    with cp.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(mol, verbose)
+        cput0 = log.init_timer()
+        dms = cp.asarray(dms)
+        coeff = cp.asarray(vhfopt.coeff)
+
+        # Transform MO coefficients and DM into sorted, cartesian AO basis
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, coeff.T)
+        dms = cp.asarray(dms, order='C')
+
+        n_dm = dms.shape[0]
+        tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
+        q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
+        s_ptr = lib.c_null_ptr()
+        if mol.omega < 0:
+            s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p)
+        # Output buffers are allocated only for the requested matrices; the other pointer stays null.
+        vj = vk = None
+        vj_ptr = vk_ptr = lib.c_null_ptr()
+        assert with_j or with_k
+        if with_k:
+            vk = cp.zeros(dms.shape)
+            vk_ptr = ctypes.cast(vk.data.ptr, ctypes.c_void_p)
+        if with_j:
+            vj = cp.zeros(dms.shape)
+            vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p)
+        # Log of the condensed ('absmax') density blocks; the 1e-300 offset keeps log() away from zero.
+        ao_loc = mol.ao_loc
+        dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
+        log_max_dm = dm_cond.max()
+        log_cutoff = math.log(vhfopt.direct_scf_tol)
+        tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond,
+                                                 log_cutoff-log_max_dm)
+        workers = gpu_specs['multiProcessorCount']
+        pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
+        info = cp.empty(2, dtype=np.uint32)
+        t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
+
+        for i, j, k, l in task_list:
+            ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
+                       l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
+            tile_ij_mapping = tile_mappings[i,j]
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
+        if with_j:
+            vj *= 2.0
+            vj = transpose_sum(vj)
+        if with_k:
+            vk = transpose_sum(vk)
+        # Transform the AO-basis results with _ao2mo; a 3-dimensional mo_coeff selects the unrestricted layout.
+        assert mo_coeff.ndim == 2 or mo_coeff.ndim == 3
+        if mo_coeff.ndim == 3:
+            # Unrestricted case
+            mo_coeff = cp.asarray(mo_coeff)
+            mo_occ = cp.asarray(mo_occ)
+            moa = coeff.dot(mo_coeff[0])
+            mob = coeff.dot(mo_coeff[1])
+            nmoa, nmob = moa.shape[1], mob.shape[1]
+            mocca = moa[:,mo_occ[0] > 0.5]
+            moccb = mob[:,mo_occ[1] > 0.5]
+            nocca, noccb = mocca.shape[1], moccb.shape[1]
+            n_dm_2 = n_dm//2
+            if with_j:
+                vjab = vj[:n_dm_2] + vj[n_dm_2:]
+                vj = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vj[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+            if with_k:
+                vka, vkb = vk[:n_dm_2], vk[n_dm_2:]
+                vk = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
+                vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
+        else:
+            mo_coeff = cp.asarray(mo_coeff)
+            mo_occ = cp.asarray(mo_occ)
+            mo_coeff = coeff.dot(mo_coeff)
+            mocc = mo_coeff[:,mo_occ>0.5]
+            if with_j:
+                vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1)
+            if with_k:
+                vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1)
+
+    return vj, vk, kern_counts, timing_counter
+
+def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
+           with_j=True, with_k=True, verbose=None):
+    '''Compute J, K matrices in MO, each flattened to nmo*nocc entries per row (hermi must be 1).
+    With a 3-dimensional mo_coeff the alpha and beta parts are concatenated along that axis.'''
+    log = logger.new_logger(mol, verbose)
+    cput0 = log.init_timer()
+    assert hermi == 1
+    if vhfopt is None:
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1:
+            group_size = jk.GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
+
+    mol = vhfopt.sorted_mol
+    nao, nao_orig = vhfopt.coeff.shape
+
+    dm = cp.asarray(dm, order='C')
+    dms = dm.reshape(-1,nao_orig,nao_orig)
+    n_dm = dms.shape[0]
+
+    assert with_j or with_k
+
+    init_constant(mol)
+
+    uniq_l_ctr = vhfopt.uniq_l_ctr
+    uniq_l = uniq_l_ctr[:,0]
+    l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+    n_groups = np.count_nonzero(uniq_l <= LMAX)
+
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
+    tasks = np.array(tasks)
+    task_list = []
+    for device_id in range(_num_devices):
+        task_list.append(tasks[device_id::_num_devices])
+    # Finish pending work on the current stream before the per-device worker threads start.
+    cp.cuda.get_current_stream().synchronize()
+    futures = []
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _jk_task,
+                mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi,
+                with_j=with_j, with_k=with_k, verbose=verbose,
+                device_id=device_id)
+            futures.append(future)
+
+    kern_counts = 0
+    timing_collection = Counter()
+    vj_dist = []
+    vk_dist = []
+    for future in futures:
+        vj, vk, counts, counter = future.result()
+        kern_counts += counts
+        timing_collection += counter
+        vj_dist.append(vj)
+        vk_dist.append(vk)
+
+    if log.verbose >= logger.DEBUG1:
+        log.debug1('kernel launches %d', kern_counts)
+        for llll, t in timing_collection.items():
+            log.debug1('%s wall time %.2f', llll, t)
+
+    for s in _streams:
+        s.synchronize()
+    cp.cuda.get_current_stream().synchronize()
+    vj = vk = None
+    if with_k:
+        vk = reduce_to_device(vk_dist, inplace=True)
+
+    if with_j:
+        vj = reduce_to_device(vj_dist, inplace=True)
+
+    h_shls = vhfopt.h_shls
+    if h_shls:
+        cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
+        log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
+        scripts = []
+        if with_j:
+            scripts.append('ji->s2kl')
+        if with_k:
+            if hermi == 1:
+                scripts.append('jk->s2il')
+            else:
+                scripts.append('jk->s1il')
+        # Transform MO coefficients and DM into sorted, cartesian AO basis
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, vhfopt.coeff.T)
+        dms = cp.asarray(dms, order='C')
+        shls_excludes = [0, h_shls[0]] * 4
+        vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
+                                 dms.get(), 1, mol._atm, mol._bas, mol._env,
+                                 shls_excludes=shls_excludes)
+        if with_j and with_k:
+            vj1 = vs_h[0]
+            vk1 = vs_h[1]
+        elif with_j:
+            vj1 = vs_h[0]
+        else:
+            vk1 = vs_h[0]
+
+        idx, idy = np.tril_indices(nao, -1)
+        if hermi == 1:
+            if with_j:
+                vj1[:,idy,idx] = vj1[:,idx,idy]
+            if with_k:
+                vk1[:,idy,idx] = vk1[:,idx,idy]
+        # vj1/vk1 come from the CPU pass; both branches upload them with cp.asarray before _ao2mo.
+        if mo_coeff.ndim == 3:
+            moa = vhfopt.coeff.dot(mo_coeff[0])
+            mob = vhfopt.coeff.dot(mo_coeff[1])
+            mocca = moa[:,mo_occ[0]>0.5]
+            moccb = mob[:,mo_occ[1]>0.5]
+            nmoa = moa.shape[1]
+            nocca = mocca.shape[1]
+            n_dm_2 = n_dm//2
+            if with_j:
+                vjab = cp.asarray(vj1[:n_dm_2] + vj1[n_dm_2:])
+                vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+            if with_k:
+                vka, vkb = cp.asarray(vk1[:n_dm_2]), cp.asarray(vk1[n_dm_2:])
+                vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
+                vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
+        else:
+            mo_coeff = vhfopt.coeff.dot(mo_coeff)
+            mocc = mo_coeff[:,mo_occ>0.5]
+            if with_j:
+                vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1)
+            if with_k:
+                vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff).reshape(n_dm,-1)
+        log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
+    log.timer('vj and vk', *cput0)
+    return vj, vk
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index f5291b54..775a6e98 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -25,10 +25,8 @@
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
 from pyscf.hessian import rhf as rhf_hess_cpu
-from pyscf import lib
+from pyscf import lib, gto
 from pyscf.gto import ATOM_OF
-# import _response_functions to load gen_response methods in SCF class
-from gpu4pyscf.scf import _response_functions  # noqa
 from gpu4pyscf.scf import cphf
 from gpu4pyscf.lib.cupy_helper import (reduce_to_device,
     contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense,
@@ -37,9 +35,10 @@
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf.jk import (
-    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
-    _make_tril_tile_mappings, _nearest_power2)
+    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, 
+    init_constant, _make_tril_tile_mappings, _nearest_power2)
 from gpu4pyscf.grad import rhf as rhf_grad
+from gpu4pyscf.hessian import jk
 
 libvhf_rys.RYS_per_atom_jk_ip2_type12.restype = ctypes.c_int
 libvhf_rys.RYS_per_atom_jk_ip2_type3.restype = ctypes.c_int
@@ -77,10 +76,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             h1mo = h1mo.get()
         t1 = log.timer_debug1('making H1', *t1)
     if mo1 is None or mo_e1 is None:
+        fx = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo,
-                                       None, atmlst, max_memory, log)
+                                       fx, atmlst, max_memory, log)
         t1 = log.timer_debug1('solving MO1', *t1)
-
     mo1 = cupy.asarray(mo1)
     # *2 for double occupancy, *2 for +c.c.
     de2 += contract('kxpi,lypi->klxy', cupy.asarray(h1mo), mo1) * 4
@@ -179,6 +178,11 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         log = logger.new_logger(mol, verbose)
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
+        coeff = cp.asarray(vhfopt.coeff)
+
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, coeff.T)
+        dms = cp.asarray(dms, order='C')
 
         tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
         q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
@@ -200,62 +204,60 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err1 = kern1(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    err2 = kern2(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err1 != 0 or err2 != 0:
-                        raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err1 = kern1(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            err2 = kern2(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err1 != 0 or err2 != 0:
+                raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
 
         ejk = ejk + ejk.transpose(1,0,3,2)
     return ejk, kern_counts, timing_counter
@@ -267,16 +269,17 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
     log = logger.new_logger(mol, verbose)
     cput0 = log.init_timer()
     if vhfopt is None:
-        vhfopt = _VHFOpt(mol).build()
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1: 
+            group_size = GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
 
     dm = cp.asarray(dm, order='C')
     dms = dm.reshape(-1,nao_orig,nao_orig)
-    #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
-    dms = sandwich_dot(dms, vhfopt.coeff.T)
-    dms = cp.asarray(dms, order='C')
 
     init_constant(mol)
 
@@ -285,7 +288,12 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
     assert uniq_l.max() <= LMAX
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
@@ -354,16 +362,18 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     assert atmlst is None
     mol = hessobj.mol
     natm = mol.natm
-    nao = mo_coeff.shape[0]
     mo_coeff = cp.asarray(mo_coeff)
     mocc = cp.asarray(mo_coeff[:,mo_occ>0])
     dm0 = mocc.dot(mocc.T) * 2
     h1mo = rhf_grad.get_grad_hcore(hessobj.base.Gradients())
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem = get_avail_mem()
-    slice_size = int(avail_mem*0.6) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vj, vk = _get_jk(mol, dm0, atoms_slice=atoms_slice, verbose=verbose)
+        vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose)
         #:vhf = vj - vk * .5
         vhf = vk
         vhf *= -.5
@@ -375,9 +385,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         vj = vk = vhf = None
     return h1mo
 
-
 def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                        device_id=0, with_j=True, with_k=True, verbose=0):
+    # TODO: compute JK in MO
     assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     natm = mol.natm
@@ -391,7 +401,6 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
     uniq_l = uniq_l_ctr[:,0]
     l_ctr_bas_loc = vhfopt.l_ctr_offsets
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
-    n_groups = len(uniq_l_ctr)
     kern = libvhf_rys.RYS_build_jk_ip1
 
     timing_counter = Counter()
@@ -423,7 +432,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1]
@@ -438,42 +447,40 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                     cp.arange(jsh0, jsh1, dtype=np.int32))
             idx = cp.argsort(sub_tile_q[mask])[::-1]
             tile_ij_mapping = t_ij[mask][idx]
-            for k in range(n_groups):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tril_tile_mappings[k,l]
-                    scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
-                        lib.c_null_ptr(),
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tril_tile_mappings[k,l]
+            scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
+                ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
+                lib.c_null_ptr(),
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
     return vj, vk, kern_counts, timing_counter
 
-def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
+def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
     r'''
     For each atom, compute
     J = ((\nabla_X i) j| kl) (D_lk + D_ji)
@@ -485,7 +492,11 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
     vhfopt = _VHFOpt(mol)
     # tile must set to 1. This tile size is assumed in the GPU kernel code
     vhfopt.tile = 1
-    vhfopt.build()
+    # Small group size for load balance
+    group_size = None
+    if _num_devices > 1: 
+        group_size = GROUP_SIZE
+    vhfopt.build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
@@ -513,7 +524,12 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
     assert vhfopt.tile_q_cond.shape == (nbas, nbas)
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(n_groups):
+            for k in range(n_groups):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
@@ -655,10 +671,10 @@ def fvind_vo(mo1):
 
     avail_mem = get_avail_mem()
     # *4 for input dm, vj, vk, and vxc
-    blksize = int(min(avail_mem*.3 / (8*3*nao*nao*4),
-                      avail_mem*.6 / (8*nmo*nocc*natm*3*5)))
+    blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO
+                      avail_mem*.3 / (8*nao*nao*3*3))) # vj, vk, dm in AO
     if blksize < ALIGNED**2:
-        raise RuntimeError('GPU memory insufficient')
+        raise RuntimeError('GPU memory insufficient for solving CPHF equations')
 
     blksize = (blksize // ALIGNED**2) * ALIGNED**2
     log.debug(f'GPU memory {avail_mem/GB:.1f} GB available')
@@ -704,78 +720,73 @@ def fvind_vo(mo1):
     log.timer('CPHF solver', *t0)
     return mo1s, e1s
 
-def gen_vind(mf, mo_coeff, mo_occ):
-    # Move data to GPU
+def gen_vind(hessobj, mo_coeff, mo_occ):
+    '''Return fx(mo1), which applies hessobj.get_veff_resp_mo to the density built from mo1.'''
+    mol = hessobj.mol
     mo_coeff = cupy.asarray(mo_coeff)
     mo_occ = cupy.asarray(mo_occ)
     nao, nmo = mo_coeff.shape
     mocc = mo_coeff[:,mo_occ>0]
     nocc = mocc.shape[1]
     mocc_2 = mocc * 2
-    grids = getattr(mf, 'cphf_grids', None)
-    if grids is not None:
-        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
-    vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
 
     def fx(mo1):
-        mo1 = cupy.asarray(mo1)
-        mo1 = mo1.reshape(-1,nmo,nocc)
+        mo1 = cupy.asarray(mo1).reshape(-1,nmo,nocc)
         mo1_mo = contract('npo,ip->nio', mo1, mo_coeff)
-        #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2)
-        #dm1 = dm1 + dm1.transpose(0,2,1)
         dm1 = mo1_mo.dot(mocc_2.T)
-        transpose_sum(dm1)
+        dm1 = transpose_sum(dm1)
         dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ)
-        v1 = vresp(dm1)
-        tmp = contract('nij,jo->nio', v1, mocc)
-        v1vo = contract('nio,ip->npo', tmp, mo_coeff)
-        return v1vo
+        return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1)
     return fx
 
 def hess_nuc_elec(mol, dm):
     '''
-    calculate hessian contribution due to (nuc, elec) pair
+    Hessian contribution due to (nuc, elec) pairs, returned as a [3,3,natm,natm] array
     '''
+    from gpu4pyscf.df import int3c2e
+    coords = mol.atom_coords()
+    charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
+    # Fake molecule built from the atomic coordinates; it reuses mol's output settings
+    fakemol = gto.fakemol_for_charges(coords)
+    fakemol.output = mol.output
+    fakemol.verbose = mol.verbose
+    fakemol.stdout = mol.stdout
+    intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False,
+                 group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
+    dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
 
-    '''
-    nao = mol.nao
-    aoslices = mol.aoslice_by_atom()
     natm = mol.natm
-    hcore = numpy.zeros([3,3,natm,natm])
-    # CPU version
-    for ia in range(mol.natm):
-        ish0, ish1, i0, i1 = aoslices[ia]
-        zi = mol.atom_charge(ia)
-        with mol.with_rinv_at_nucleus(ia):
-            rinv2aa = mol.intor('int1e_ipiprinv', comp=9).reshape([3,3,nao,nao])
-            rinv2ab = mol.intor('int1e_iprinvip', comp=9).reshape([3,3,nao,nao])
-            rinv2aa *= zi
-            rinv2ab *= zi
-
-            hcore[:,:,ia,ia] -= numpy.einsum('xypq,pq->xy', rinv2aa+rinv2ab, dm)
-
-            haa = numpy.einsum('xypq,pq->xyp', rinv2aa, dm)
-            hab = numpy.einsum('xypq,pq->xyp', rinv2ab, dm)
-
-            haa = [haa[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]]
-            hab = [hab[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]]
-
-            haa = numpy.stack(haa, axis=2)
-            hab = numpy.stack(hab, axis=2)
-
-            hcore[:,:,ia] += haa
-            hcore[:,:,ia] += hab.transpose([1,0,2])
-
-            hcore[:,:,:,ia] += haa.transpose([1,0,2])
-            hcore[:,:,:,ia] += hab
+    nao = mol.nao
+    hcore_diag = cupy.zeros([9,natm])
+    hcore_aa = cupy.zeros([9,natm,nao])
+    for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipip1'):
+        haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
+        hcore_aa[:,k0:k1,i0:i1] += haa
+        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
+    # Repeat the accumulation with the 'ipvip1' integrals
+    hcore_ab = cupy.zeros([9,natm,nao])
+    for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipvip1'):
+        hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
+        hcore_ab[:,k0:k1,i0:i1] += hab
+        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
+    # Scale each atom entry (index p) by its nuclear charge
+    hcore_diag = contract('xp,p->xp', hcore_diag, charges)
+    hcore_aa = contract('xpj,p->xpj', hcore_aa, charges)
+    hcore_ab = contract('xpj,p->xpj', hcore_ab, charges)
 
-    hcore = cupy.asarray(hcore)
-    '''
-    from gpu4pyscf.df import int3c2e
-    hcore = int3c2e.get_hess_nuc_elec(mol, dm)
+    aoslices = mol.aoslice_by_atom()
+    ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
+    # Fold the AO axis onto atoms with ao2atom, then split the 9 components into 3x3
+    hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm])
+    hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm])
+    hcore = hcore_aa + hcore_aa.transpose([1,0,3,2])
+    hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2])
+    hcore_diag = hcore_diag.reshape([3,3,natm])
+    idx = np.arange(natm)
+    hcore[:,:,idx,idx] += hcore_diag
     return hcore * 2.0
 
-
 def kernel(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     cput0 = (logger.process_clock(), logger.perf_counter())
     if mo_energy is None: mo_energy = hessobj.base.mo_energy
@@ -832,14 +843,14 @@ def _e_hcore_generator(hessobj, dm):
     h1aa = cupy.asarray(h1aa)
     h1ab = cupy.asarray(h1ab)
 
-    hcore = cupy.empty((3,3,nao,nao))
     t1 = log.timer_debug1('get_hcore', *t1)
     def get_hcore(iatm, jatm):
-        nonlocal hcore
         ish0, ish1, i0, i1 = aoslices[iatm]
         jsh0, jsh1, j0, j1 = aoslices[jatm]
         rinv2aa = rinv2ab = None
         if iatm == jatm:
+            de = contract('xypq,pq->xy', h1aa[:,:,i0:i1], dm[i0:i1])
+            de+= contract('xypq,pq->xy', h1ab[:,:,i0:i1,i0:i1], dm[i0:i1,i0:i1])
             with mol.with_rinv_at_nucleus(iatm):
                 # The remaining integrals like int1e_ipiprinv are computed in
                 # hess_nuc_elec(mol, dm)
@@ -850,18 +861,16 @@ def get_hcore(iatm, jatm):
                     rinv2ab = cupy.asarray(rinv2ab)
                     rinv2aa = rinv2aa.reshape(3,3,nao,nao)
                     rinv2ab = rinv2ab.reshape(3,3,nao,nao)
-            hcore[:] = 0.
-            hcore[:,:,i0:i1] += h1aa[:,:,i0:i1]
-            hcore[:,:,i0:i1,i0:i1] += h1ab[:,:,i0:i1,i0:i1]
+            
             if rinv2aa is not None or rinv2ab is not None:
-                hcore -= rinv2aa + rinv2ab
+                hcore = -(rinv2aa + rinv2ab)
                 hcore[:,:,i0:i1] += rinv2aa[:,:,i0:i1]
                 hcore[:,:,i0:i1] += rinv2ab[:,:,i0:i1]
                 hcore[:,:,:,i0:i1] += rinv2aa[:,:,i0:i1].transpose(0,1,3,2)
                 hcore[:,:,:,i0:i1] += rinv2ab[:,:,:,i0:i1]
+                de += cupy.einsum('xypq,pq->xy', hcore, dm)
         else:
-            hcore[:] = 0.
-            hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1]
+            de = contract('xypq,pq->xy',h1ab[:,:,i0:i1,j0:j1],dm[i0:i1,j0:j1])
             with mol.with_rinv_at_nucleus(iatm):
                 if with_ecp and iatm in ecp_atoms:
                     shls_slice = (jsh0, jsh1, 0, nbas)
@@ -869,8 +878,9 @@ def get_hcore(iatm, jatm):
                     rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
                     rinv2aa = cupy.asarray(rinv2aa)
                     rinv2ab = cupy.asarray(rinv2ab)
-                    hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao)
-                    hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
+                    hcore = rinv2aa.reshape(3,3,j1-j0,nao)
+                    hcore+= rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
+                    de += contract('xypq,pq->xy', hcore, dm[j0:j1])
             with mol.with_rinv_at_nucleus(jatm):
                 if with_ecp and jatm in ecp_atoms:
                     shls_slice = (ish0, ish1, 0, nbas)
@@ -878,16 +888,39 @@ def get_hcore(iatm, jatm):
                     rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
                     rinv2aa = cupy.asarray(rinv2aa)
                     rinv2ab = cupy.asarray(rinv2ab)
-                    hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao)
-                    hcore[:,:,i0:i1] += rinv2ab.reshape(3,3,i1-i0,nao)
-        de = cupy.einsum('xypq,pq->xy', hcore, dm)
-        de += cupy.einsum('xyqp,pq->xy', hcore, dm)
-        return cp.asarray(de + de_nuc_elec[:,:,iatm,jatm])
+                    hcore = rinv2aa.reshape(3,3,i1-i0,nao)
+                    hcore+= rinv2ab.reshape(3,3,i1-i0,nao)
+                    de += contract('xypq,pq->xy', hcore, dm[i0:i1])
+        # 2.0* due to the symmetry
+        return cp.asarray(2.0*de + de_nuc_elec[:,:,iatm,jatm])
     return get_hcore
 
 def hcore_generator(hessobj, mol=None):
     raise NotImplementedError
 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
+            hermi=1, with_j=True, with_k=True, omega=None):
+    '''Compute J/K matrices in MO for multiple DMs.
+
+    The _VHFOpt object is cached in hessobj.base._opt_gpu under the key omega
+    and is built on first use; (vj, vk) come straight from jk.get_jk.
+    '''
+    mf = hessobj.base
+    with mol.with_range_coulomb(omega):
+        vhfopt = mf._opt_gpu.get(omega)
+        if vhfopt is None:
+            # Small group size for load balance across several devices
+            group_size = GROUP_SIZE if _num_devices > 1 else None
+            vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size)
+            mf._opt_gpu[omega] = vhfopt
+        vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k)
+    return vj, vk
+
+def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
+    '''Return vj - 0.5*vk, with vj and vk obtained from hessobj.get_jk_mo.'''
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi, with_j=True, with_k=True, omega=omega)
+    return vj - 0.5 * vk
+
 class HessianBase(lib.StreamObject):
     # attributes
     max_cycle   = rhf_hess_cpu.HessianBase.max_cycle
@@ -899,6 +932,8 @@ class HessianBase(lib.StreamObject):
     make_h1         = rhf_hess_cpu.HessianBase.make_h1
     hcore_generator = hcore_generator  # the functionality is different from cpu version
     hess_nuc        = rhf_hess_cpu.HessianBase.hess_nuc
+    gen_vind        = NotImplemented
+    get_jk          = NotImplemented
     kernel = hess = kernel
 
     def get_hcore(self, mol=None):
@@ -950,6 +985,9 @@ def __init__(self, scf_method):
     hess_elec = hess_elec
     make_h1 = make_h1
     gen_hop = NotImplemented
+    gen_vind = gen_vind
+    get_jk_mo = _get_jk_mo
+    get_veff_resp_mo = _get_veff_resp_mo
 
 # Inject to RHF class
 from gpu4pyscf import scf
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 64c6fa4b..d506b934 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -25,12 +25,13 @@
 from pyscf import lib
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.grad import rhf as rhf_grad
-# import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse, get_avail_mem, reduce_to_device
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem,
+                                       reduce_to_device, transpose_sum)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.hessian import jk
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
@@ -109,7 +110,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mol = hessobj.mol
     natm = mol.natm
     assert atmlst is None or atmlst == range(natm)
-    nao = mo_coeff.shape[0]
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
     avail_mem = get_avail_mem()
@@ -122,25 +122,29 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem -= 8 * h1mo.size
-    slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vj, vk = rhf_hess._get_jk(mol, dm0, with_k=with_k,
-                                  atoms_slice=atoms_slice, verbose=verbose)
+        vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k,
+                                      atoms_slice=atoms_slice, verbose=verbose)
         veff = vj
         if with_k:
             vk *= .5 * hyb
             veff -= vk
+        vj = vk = None
         if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
             with mol.with_range_coulomb(omega):
-                vk_lr = rhf_hess._get_jk(mol, dm0, with_j=False, verbose=verbose)[1]
+                vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False,  # same atoms as veff
+                                             atoms_slice=atoms_slice, verbose=verbose)[1]
                 vk_lr *= (alpha-hyb) * .5
                 veff -= vk_lr
-        atom0, atom1 = atoms_slice
-        for i, ia in enumerate(range(atom0, atom1)):
+        for i, ia in enumerate(range(*atoms_slice)):
             for ix in range(3):
                 h1mo[ia,ix] += mo_coeff.T.dot(veff[i,ix].dot(mocc))
-        vj = vk = vk_lr = veff = None
+        vk_lr = veff = None
     return h1mo
 
 XX, XY, XZ = 4, 5, 6
@@ -698,6 +702,166 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
     vmat = reduce_to_device(vmat_dist, inplace=True)
     return vmat
 
+def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
+                        verbose=None, hermi=1, device_id=0):
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
+        if mo1 is not None: mo1 = cupy.asarray(mo1)
+        if mocc is not None: mocc = cupy.asarray(mocc)
+        if fxc is not None: fxc = cupy.asarray(fxc)
+
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
+        xctype = ni._xc_type(xc_code)
+        opt = getattr(ni, 'gdftopt', None)
+
+        _sorted_mol = opt.mol
+        nao = mol.nao
+        nset = mo1.shape[0]
+        vmat = cupy.zeros((nset, nao, nao))
+
+        if xctype == 'LDA':
+            ao_deriv = 0
+        else:
+            ao_deriv = 1
+
+        ngrids_glob = grids.coords.shape[0]
+        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        grid_start = device_id * ngrids_per_device
+        grid_end = (device_id + 1) * ngrids_per_device
+
+        p0 = p1 = grid_start
+        t1 = t0 = log.init_timer()
+        for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+                                                       max_memory=None, blksize=None,
+                                                       grid_range=(grid_start, grid_end)):
+            p0, p1 = p1, p1+len(weights)
+            occ_coeff_mask = mocc[mask]
+            rho1 = numint.eval_rho4(_sorted_mol, ao, 2.0*occ_coeff_mask, mo1[:,mask],
+                                    xctype=xctype, hermi=hermi)
+            t1 = log.timer_debug2('eval rho', *t1)
+
+            # precompute fxc_w
+            if xctype == 'LDA':
+                fxc_w = fxc[0,0,p0:p1] * weights
+                wv = rho1 * fxc_w
+            else:
+                fxc_w = fxc[:,:,p0:p1] * weights
+                wv = contract('axg,xyg->ayg', rho1, fxc_w)
+
+            for i in range(nset):
+                if xctype == 'LDA':
+                    vmat_tmp = ao.dot(numint._scale_ao(ao, wv[i]).T)
+                elif xctype == 'GGA':
+                    wv[i,0] *= .5
+                    aow = numint._scale_ao(ao, wv[i])
+                    vmat_tmp = aow.dot(ao[0].T)
+                elif xctype == 'NLC':
+                    raise NotImplementedError('NLC')
+                else:
+                    wv[i,0] *= .5
+                    wv[i,4] *= .5
+                    vmat_tmp = ao[0].dot(numint._scale_ao(ao[:4], wv[i,:4]).T)
+                    vmat_tmp+= numint._tau_dot(ao, ao, wv[i,4])
+                add_sparse(vmat[i], vmat_tmp, mask)
+
+            t1 = log.timer_debug2('integration', *t1)
+            ao = rho1 = None
+        t0 = log.timer_debug1(f'vxc on Device {device_id} ', *t0)
+        if xctype != 'LDA':
+            transpose_sum(vmat)
+        vmat = jk._ao2mo(vmat, mocc, mo_coeff)
+    return vmat
+
+def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, relativity=0, hermi=0,
+               rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
+    if fxc is None:
+        raise RuntimeError('fxc was not initialized')
+    #xctype = ni._xc_type(xc_code)
+    opt = getattr(ni, 'gdftopt', None)
+    if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+        ni.build(mol, grids.coords)
+        opt = ni.gdftopt
+
+    nao = mol.nao
+    dms = cupy.asarray(dms)
+    dm_shape = dms.shape
+    # AO basis -> gdftopt AO basis
+    with_mocc = hasattr(dms, 'mo1')
+    mo1 = mocc = None
+    if with_mocc:
+        mo1 = opt.sort_orbitals(dms.mo1, axis=[1])
+        mocc = opt.sort_orbitals(dms.occ_coeff, axis=[0])
+    mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
+    dms = opt.sort_orbitals(dms.reshape(-1,nao,nao), axis=[1,2])
+    
+    futures = []
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _nr_rks_fxc_mo_task,
+                ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
+            futures.append(future)
+    dms = None
+    vmat_dist = []
+    for future in futures:
+        vmat_dist.append(future.result())
+    vmat = reduce_to_device(vmat_dist, inplace=True)
+
+    if len(dm_shape) == 2:
+        vmat = vmat[0]
+    t0 = log.timer_debug1('nr_rks_fxc', *t0)
+    return cupy.asarray(vmat)
+
+def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
+    mol = hessobj.mol
+    mf = hessobj.base
+    grids = getattr(mf, 'cphf_grids', None)
+    if grids is not None:
+        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
+    else:
+        # If cphf_grids is not defined (e.g. an object defined from CPU), fall back to mf.grids
+        grids = getattr(mf, 'grids', None)
+        logger.info(mf, 'Primary grids is used for CPHF in Hessian')
+
+    if grids and grids.coords is None:
+        grids.build(mol=mol, with_non0tab=False, sort_grids=True)
+
+    ni = mf._numint
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+    assert not mf.do_nlc()
+    hermi = 1
+
+    mocc = mo_coeff[:,mo_occ>0]
+    nocc = mocc.shape[1]
+    nao, nmo = mo_coeff.shape
+    # TODO: evaluate v1 in MO
+    rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
+                                        mo_coeff, mo_occ, 0)
+    v1 = nr_rks_fxc_mo(ni, mol, grids, mf.xc, None, dms, mo_coeff, 0, hermi,
+                                    rho0, vxc, fxc, max_memory=None)
+    v1 = v1.reshape(-1,nmo*nocc)
+    
+    if hybrid:
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
+        vk *= hyb
+        if omega > 1e-10:  # For range separated Coulomb
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi,
+                                        with_j=False, omega=omega)
+            vk_lr *= (alpha-hyb)
+            vk += vk_lr
+        v1 += vj - .5 * vk
+    else:
+        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1,
+                                with_k=False)[0]
+
+    return v1
+
 
 class Hessian(rhf_hess.HessianBase):
     '''Non-relativistic RKS hessian'''
@@ -714,6 +878,9 @@ def __init__(self, mf):
     partial_hess_elec = partial_hess_elec
     hess_elec = rhf_hess.hess_elec
     make_h1 = make_h1
+    gen_vind = rhf_hess.gen_vind
+    get_jk_mo = rhf_hess._get_jk_mo
+    get_veff_resp_mo = get_veff_resp_mo
 
 from gpu4pyscf import dft
 dft.rks.RKS.Hessian = lib.class_as_method(Hessian)
diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
index a0b07196..ac657199 100644
--- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
@@ -14,10 +14,14 @@
 
 import unittest
 import numpy as np
-from pyscf import gto, scf, lib
+import cupy
+import pyscf
+from pyscf import gto, lib
 from pyscf import grad, hessian
 from pyscf.hessian import rhf as rhf_cpu
+from gpu4pyscf import scf
 from gpu4pyscf.hessian import rhf as rhf_gpu
+from gpu4pyscf.hessian import jk
 
 def setUpModule():
     global mol
@@ -46,7 +50,7 @@ def test_hessian_rhf(self):
         assert abs(ref - e2_gpu).max() < 1e-6
 
     def test_partial_hess_elec(self):
-        mf = scf.RHF(mol)
+        mf = pyscf.scf.RHF(mol)
         mf.conv_tol = 1e-14
         mf.kernel()
         hobj = mf.Hessian()
@@ -102,8 +106,7 @@ def test_get_jk(self):
         nao = mol.nao
         mo_coeff = np.random.rand(nao, nao)
         dm = mo_coeff.dot(mo_coeff.T) * 2
-
-        vj, vk = rhf_gpu._get_jk(mol, dm)
+        vj, vk = rhf_gpu._get_jk_ip1(mol, dm)
         assert abs(lib.fp(vj.get()) -  87674.69061160382) < 1e-7
         assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7
 
@@ -139,6 +142,59 @@ def test_hessian_rhf_D3(self):
         e2_gpu = mf.Hessian().to_gpu().kernel()
         assert abs(ref - e2_gpu).max() < 1e-6
 
+    def test_jk_mix(self):
+        mol1 = pyscf.M(
+            atom='''
+        C  -1.20806619, -0.34108413, -0.00755148
+        C   1.28636081, -0.34128013, -0.00668648
+        H   2.53407081,  1.81906387, -0.00736748
+        H   1.28693681,  3.97963587, -0.00925948
+        ''',
+            basis='''unc
+        #BASIS SET:
+        H    S
+            1.815041   1
+            0.591063   1
+        H    P
+            2.305000   1
+        #BASIS SET:
+        C    S
+            8.383976   1
+            3.577015   1
+            1.547118   1
+        H    P
+            2.305000   1
+            1.098827   1
+            0.806750   1
+            0.282362   1
+        H    D
+            1.81900    1
+            0.72760    1
+            0.29104    1
+        H    F
+            0.970109   1
+        C    G
+            0.625000   1
+        C    H
+            0.4        1
+            ''',
+            output = '/dev/null'
+        )
+        nao = mol1.nao
+        mo_coeff = cupy.random.rand(nao, nao)
+        mo_occ = cupy.zeros([nao])
+        mo_occ[:3] = 2
+        mocc = mo_coeff[:,:3]
+        dm = mocc.dot(mocc.T) * 2
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1)
+        
+        mf = scf.RHF(mol1)
+        vj, vk = mf.get_jk(mol1, dm, hermi=1)
+        vj_cpu = (mo_coeff.T @ vj @ mocc).reshape(1,-1)
+        vk_cpu = (mo_coeff.T @ vk @ mocc).reshape(1,-1)
+        assert cupy.linalg.norm(vj_cpu - vj_mo) < 1e-5
+        assert cupy.linalg.norm(vk_cpu - vk_mo) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for RHF Hessian")
     unittest.main()
diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
index c4112bec..a7d5c983 100644
--- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
@@ -14,10 +14,14 @@
 
 import unittest
 import numpy
-from pyscf import gto, scf, lib
+import cupy
+import pyscf
+from pyscf import gto, lib
 from pyscf import grad, hessian
 from pyscf.hessian import uhf as uhf_cpu
+from gpu4pyscf import scf
 from gpu4pyscf.hessian import uhf as uhf_gpu
+from gpu4pyscf.hessian import jk
 
 def setUpModule():
     global mol
@@ -48,7 +52,7 @@ def test_hessian_uhf(self):
         assert abs(ref - e2_gpu).max() < 1e-6
 
     def test_partial_hess_elec(self):
-        mf = scf.UHF(mol)
+        mf = pyscf.scf.UHF(mol)
         mf.conv_tol = 1e-14
         mf.kernel()
         hobj = mf.Hessian()
@@ -73,6 +77,68 @@ def test_hessian_uhf_D3(self):
         e2_gpu = mf.Hessian().to_gpu().kernel()
         assert abs(ref - e2_gpu).max() < 1e-6
 
+    def test_jk_mix(self):
+        mol1 = pyscf.M(
+            atom='''
+        C  -1.20806619, -0.34108413, -0.00755148
+        C   1.28636081, -0.34128013, -0.00668648
+        H   2.53407081,  1.81906387, -0.00736748
+        H   1.28693681,  3.97963587, -0.00925948
+        ''',
+            basis='''unc
+        #BASIS SET:
+        H    S
+            1.815041   1
+            0.591063   1
+        H    P
+            2.305000   1
+        #BASIS SET:
+        C    S
+            8.383976   1
+            3.577015   1
+            1.547118   1
+        H    P
+            2.305000   1
+            1.098827   1
+            0.806750   1
+            0.282362   1
+        H    D
+            1.81900    1
+            0.72760    1
+            0.29104    1
+        H    F
+            0.970109   1
+        C    G
+            0.625000   1
+        C    H
+            0.4        1
+            ''',
+            output = '/dev/null'
+        )
+        nao = mol1.nao
+        mo_coeff = cupy.random.rand(2, nao, nao)
+        mocca = mo_coeff[0,:,:3]
+        moccb = mo_coeff[1,:,:2]
+        mo_occ = cupy.zeros([2,nao])
+        mo_occ[0,:3] = 1
+        mo_occ[1,:2] = 1
+        dm = cupy.empty([2,nao,nao])
+        dm[0] = mocca.dot(mocca.T)
+        dm[1] = moccb.dot(moccb.T)
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1)
+        
+        mf = scf.UHF(mol1)
+        vj, vk = mf.get_jk(mol1, dm, hermi=1)
+        vj2 = cupy.empty([5*nao])
+        vk2 = cupy.empty([5*nao])
+        vj = vj[0] + vj[1]
+        vj2[:3*nao] = (mo_coeff[0].T @ vj @ mocca).reshape(1,-1)
+        vj2[3*nao:] = (mo_coeff[1].T @ vj @ moccb).reshape(1,-1)
+        vk2[:3*nao] = (mo_coeff[0].T @ vk[0] @ mocca).reshape(1,-1)
+        vk2[3*nao:] = (mo_coeff[1].T @ vk[1] @ moccb).reshape(1,-1)
+        assert cupy.linalg.norm(vj2 - vj_mo) < 1e-5
+        assert cupy.linalg.norm(vk2 - vk_mo) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for UHF Hessian")
     unittest.main()
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index 9d389bc0..88a6c9fd 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -21,19 +21,17 @@
 Non-relativistic UHF analytical Hessian
 '''
 
-from functools import reduce
 import numpy as np
 import cupy
 import cupy as cp
 from pyscf import lib
 from pyscf.scf import ucphf
-# import _response_functions to load gen_response methods in SCF class
-from gpu4pyscf.scf import _response_functions  # noqa
-from gpu4pyscf.gto.mole import sort_atoms
-from gpu4pyscf.lib.cupy_helper import contract, tag_array, get_avail_mem, krylov
+from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem,
+                                       krylov, tag_array)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess_gpu
+from gpu4pyscf.hessian import jk
 
 GB = 1024*1024*1024
 ALIGNED = 4
@@ -67,8 +65,9 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             h1mo = (h1mo[0].get(), h1mo[1].get())
         t1 = log.timer_debug1('making H1', *t1)
     if mo1 is None or mo_e1 is None:
+        fx = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo,
-                                       None, atmlst, max_memory, log)
+                                       fx, atmlst, max_memory, log)
         t1 = log.timer_debug1('solving MO1', *t1)
 
     mo1a = cupy.asarray(mo1[0])
@@ -181,18 +180,20 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mo_a, mo_b = mo_coeff
     mocca = mo_a[:,mo_occ[0]>0]
     moccb = mo_b[:,mo_occ[1]>0]
-    nao = mo_a.shape[0]
     dm0a = mocca.dot(mocca.T)
     dm0b = moccb.dot(moccb.T)
     grad_obj = hessobj.base.Gradients()
     h1moa = rhf_grad.get_grad_hcore(grad_obj, mo_a, mo_occ[0])
     h1mob = rhf_grad.get_grad_hcore(grad_obj, mo_b, mo_occ[1])
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem = get_avail_mem()
-    slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vja, vka = rhf_hess_gpu._get_jk(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose)
-        vjb, vkb = rhf_hess_gpu._get_jk(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose)
+        vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose)
+        vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose)
         #:vhfa = vja+vjb - vka
         #:vhfb = vja+vjb - vkb
         vhfa = vka
@@ -291,8 +292,8 @@ def fvind_vo(mo1):
 
     avail_mem = get_avail_mem()
     # *8 for spin-up/down input dm, vj, vk, and vxc
-    blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8),
-                      avail_mem*.6 / (8*nmo*nocc*natm*3*5)))
+    blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*8),
+                      avail_mem*.3 / (8*nao*nao*3*6)))  # in vj, vk, dm in AO
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient')
 
@@ -368,8 +369,9 @@ def fvind_vo(mo1):
     log.timer('CPHF solver', *t0)
     return (mo1sa, mo1sb), (e1sa, e1sb)
 
-def gen_vind(mf, mo_coeff, mo_occ):
+def gen_vind(hessobj, mo_coeff, mo_occ):
     # Move data to GPU
+    mol = hessobj.mol
     mo_coeff = cupy.asarray(mo_coeff)
     mo_occ = cupy.asarray(mo_occ)
     nao, nmoa = mo_coeff[0].shape
@@ -378,39 +380,32 @@ def gen_vind(mf, mo_coeff, mo_occ):
     moccb = mo_coeff[1][:,mo_occ[1]>0]
     nocca = mocca.shape[1]
     noccb = moccb.shape[1]
-    grids = getattr(mf, 'cphf_grids', None)
-    if grids is not None:
-        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
-    vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
 
     def fx(mo1):
         mo1 = cupy.asarray(mo1)
         mo1 = mo1.reshape(-1,nmoa*nocca+nmob*noccb)
         nset = len(mo1)
 
+        dm1 = cupy.empty([2,nset,nao,nao])
+
         x = mo1[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
         mo1_moa = contract('npo,ip->nio', x, mo_coeff[0])
         dma = contract('nio,jo->nij', mo1_moa, mocca)
+        dm1[0] = transpose_sum(dma)
 
         x = mo1[:,nmoa*nocca:].reshape(nset,nmob,noccb)
         mo1_mob = contract('npo,ip->nio', x, mo_coeff[1])
         dmb = contract('nio,jo->nij', mo1_mob, moccb)
-
-        dm1 = cupy.empty([2,nset,nao,nao])
-        dm1[0] = dma + dma.transpose(0,2,1)
-        dm1[1] = dmb + dmb.transpose(0,2,1)
+        dm1[1] = transpose_sum(dmb)
 
         dm1 = tag_array(dm1, mo1=[mo1_moa,mo1_mob], occ_coeff=[mocca,moccb], mo_occ=mo_occ)
-        v1 = vresp(dm1)
-        v1vo = cupy.empty_like(mo1)
-        tmp = contract('nij,jo->nio', v1[0], mocca)
-        v1vo[:,:nmoa*nocca] = contract('nio,ip->npo', tmp, mo_coeff[0]).reshape(nset,-1)
-
-        tmp = contract('nij,jo->nio', v1[1], moccb)
-        v1vo[:,nmoa*nocca:] = contract('nio,ip->npo', tmp, mo_coeff[1]).reshape(nset,-1)
-        return v1vo
+        return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1)
     return fx
 
+def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
+                               hermi=hermi, with_j=True, with_k=True)
+    return vj - vk
 
 class Hessian(rhf_hess_gpu.HessianBase):
     '''Non-relativistic unrestricted Hartree-Fock hessian'''
@@ -421,6 +416,9 @@ class Hessian(rhf_hess_gpu.HessianBase):
     partial_hess_elec = partial_hess_elec
     hess_elec = hess_elec
     make_h1 = make_h1
+    gen_vind = gen_vind
+    get_jk_mo = rhf_hess_gpu._get_jk_mo
+    get_veff_resp_mo = _get_veff_resp_mo
 
     def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo,
                   fx=None, atmlst=None, max_memory=4000, verbose=None):
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index 19216a55..2a048f5f 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -23,11 +23,11 @@
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.grad import rhf as rhf_grad
-# import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d, get_avail_mem
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem)
 from gpu4pyscf.lib import logger
+from gpu4pyscf.hessian import jk
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
@@ -114,7 +114,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mo_a, mo_b = mo_coeff
     mocca = mo_a[:,mo_occ[0]>0]
     moccb = mo_b[:,mo_occ[1]>0]
-    nao = mo_a.shape[0]
     dm0a = mocca.dot(mocca.T)
     dm0b = moccb.dot(moccb.T)
     avail_mem = get_avail_mem()
@@ -129,11 +128,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem -= 8 * (h1moa.size + h1mob.size)
-    slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vja, vka = rhf_hess._get_jk(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
-        vjb, vkb = rhf_hess._get_jk(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
+        vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
+        vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
         vj = vja + vjb
         if with_k:
             #:veffa = vja + vjb - hyb * vka
@@ -150,8 +152,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         vj = vja = vjb = vka = vkb = None
         if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
             with mol.with_range_coulomb(omega):
-                vka_lr = rhf_hess._get_jk(mol, dm0a, with_j=False, verbose=verbose)[1]
-                vkb_lr = rhf_hess._get_jk(mol, dm0b, with_j=False, verbose=verbose)[1]
+                vka_lr = rhf_hess._get_jk_ip1(mol, dm0a, with_j=False, atoms_slice=atoms_slice, verbose=verbose)[1]
+                vkb_lr = rhf_hess._get_jk_ip1(mol, dm0b, with_j=False, atoms_slice=atoms_slice, verbose=verbose)[1]
                 vka_lr *= (alpha-hyb)
                 vkb_lr *= (alpha-hyb)
                 veffa -= vka_lr
@@ -842,6 +844,55 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         vmatb[ia] -= vmat_tmp
     return vmata, vmatb
 
+def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
+    mol = hessobj.mol
+    mf = hessobj.base
+    grids = getattr(mf, 'cphf_grids', None)
+    if grids is not None:
+        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
+    else:
+        # If cphf_grids is not defined (e.g. an object defined from CPU), fall back to mf.grids
+        grids = getattr(mf, 'grids', None)
+        logger.info(mf, 'Primary grids is used for CPHF in Hessian')
+
+    if grids and grids.coords is None:
+        grids.build(mol=mol, with_non0tab=False, sort_grids=True)
+
+    nao, nmoa = mo_coeff[0].shape
+    nao, nmob = mo_coeff[1].shape
+    mocca = mo_coeff[0][:,mo_occ[0]>0]
+    moccb = mo_coeff[1][:,mo_occ[1]>0]
+    nocca = mocca.shape[1]
+    noccb = moccb.shape[1]
+
+    ni = mf._numint
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+    assert not mf.do_nlc()
+    hermi = 1
+
+    rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
+                                        mo_coeff, mo_occ, 1)
+    v1 = ni.nr_uks_fxc(mol, grids, mf.xc, None, dms, 0, hermi,
+                        rho0, vxc, fxc, max_memory=None)
+    nset = dms.shape[1]
+    v1vo = cupy.empty([nset, nmoa*nocca+nmob*noccb])
+    v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca)
+    v1vo[:,nmoa*nocca:] = jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb)
+    if hybrid:
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
+        vk *= hyb
+        if omega > 1e-10:
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
+                                         hermi, with_j=False, omega=omega)
+            vk_lr *= (alpha-hyb)
+            vk += vk_lr
+        v1vo += vj - vk
+    else:
+        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
+                                  hermi=1, with_k=False)[0]
+    return v1vo
+
 
 class Hessian(rhf_hess.HessianBase):
     '''Non-relativistic UKS hessian'''
@@ -856,6 +907,9 @@ def __init__(self, mf):
     solve_mo1 = uhf_hess.Hessian.solve_mo1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
+    gen_vind = uhf_hess.gen_vind
+    get_jk_mo = rhf_hess._get_jk_mo
+    get_veff_resp_mo = get_veff_resp_mo
 
 from gpu4pyscf import dft
 dft.uks.UKS.Hessian = lib.class_as_method(Hessian)
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 5828bbfe..d68cbcff 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -23,6 +23,7 @@
 from gpu4pyscf.gto import mole
 from gpu4pyscf.lib.cutensor import contract
 from gpu4pyscf.lib.cusolver import eigh, cholesky  #NOQA
+from gpu4pyscf.lib.memcpy import copy_array  #NOQA
 from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access
 
 LMAX_ON_GPU = 7
@@ -87,15 +88,15 @@ def p2p_transfer(a, b):
         a[:] = b
     elif _p2p_access:
         a[:] = b
+        '''
     elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype:
         # cupy supports a direct copy from different devices without p2p. See also
         # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48
         # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015
         a[:] = b
+        '''
     else:
-        with cupy.cuda.Device(a.device):
-            # TODO: reduce memory copy, a can be non-contiguous array
-            a[:] = cupy.asarray(b.get())
+        copy_array(b, a)
 
 def concatenate(array_list):
     ''' Concatenate axis=0 only
@@ -103,15 +104,16 @@ def concatenate(array_list):
     if _p2p_access:
         return cupy.concatenate(array_list)
     else:
-        array_list_cpu = [a.get() for a in array_list]
-        n = sum([a.shape[0] for a in array_list_cpu])
-        a0_shape = list(array_list_cpu[0].shape)
+        #array_list_cpu = [a.get() for a in array_list]
+        n = sum([a.shape[0] for a in array_list])
+        a0_shape = list(array_list[0].shape)
         out_shape = tuple([n] + a0_shape[1:])
         out = cupy.empty(out_shape)
         p0 = p1 = 0
-        for a in array_list_cpu:
+        for a in array_list:
             p1 = p0 + a.shape[0]
-            out[p0:p1].set(a)
+            #out[p0:p1].set(a)
+            copy_array(a, out[p0:p1])
             p0 = p1
         return out
 
@@ -136,18 +138,19 @@ def reduce_to_device(array_list, inplace=False):
         result = array_list[0]
     else:
         result = array_list[0].copy()
+    
+    # Transfer data chunk by chunk to reduce the memory footprint.
     result = result.reshape(-1)
-    # Asynchronously add each matrix from its device
     for device_id, matrix in enumerate(array_list):
         if device_id == 0:
             continue
         
         assert matrix.device.id == device_id
         matrix = matrix.reshape(-1)
-        blksize = 1024*1024*128 # 1GB
+        blksize = 1024*1024*1024 // matrix.itemsize # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
-            result[p0:p1] += cupy.asarray(matrix[p0:p1])
-    
+            result[p0:p1] += copy_array(matrix[p0:p1])
+            #result[p0:p1] += cupy.asarray(matrix[p0:p1]) 
     return result.reshape(out_shape)
     
 def device2host_2d(a_cpu, a_gpu, stream=None):
diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
index d04b1b2f..1cf53f89 100644
--- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
+++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu
@@ -210,6 +210,100 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, cons
     atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum);
 }
 
+__global__
+static void GINTfill_int3c1e_ip1_density_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij,
+                                                             const double* density, const int* aoslice, const int nao,
+                                                             const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (task_ij >= ntasks_ij || task_grid >= ngrids) {
+        return;
+    }
+
+    const int bas_ij = offsets.bas_ij + task_ij;
+    const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+    const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+    const int ish = bas_pair2bra[bas_ij];
+    const int jsh = bas_pair2ket[bas_ij];
+
+    const double* __restrict__ a12 = c_bpcache.a12;
+    const double* __restrict__ e12 = c_bpcache.e12;
+    const double* __restrict__ x12 = c_bpcache.x12;
+    const double* __restrict__ y12 = c_bpcache.y12;
+    const double* __restrict__ z12 = c_bpcache.z12;
+
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+    const int nbas = c_bpcache.nbas;
+    const double* __restrict__ bas_x = c_bpcache.bas_coords;
+    const double* __restrict__ bas_y = bas_x + nbas;
+    const double* __restrict__ bas_z = bas_y + nbas;
+    const double Ax = bas_x[ish];
+    const double Ay = bas_y[ish];
+    const double Az = bas_z[ish];
+
+    const double* grid_point = grid_points + task_grid * 3;
+    const double Cx = grid_point[0];
+    const double Cy = grid_point[1];
+    const double Cz = grid_point[2];
+    const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+    double deri_dAx = 0;
+    double deri_dAy = 0;
+    double deri_dAz = 0;
+    for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) {
+        const double aij = a12[ij];
+        const double eij = e12[ij];
+        const double Px  = x12[ij];
+        const double Py  = y12[ij];
+        const double Pz  = z12[ij];
+        const double PCx = Px - Cx;
+        const double PCy = Py - Cy;
+        const double PCz = Pz - Cz;
+        const double PAx = Px - Ax;
+        const double PAy = Py - Ay;
+        const double PAz = Pz - Az;
+        const double minus_two_a = -2.0 * a_exponents[ij];
+        const double one_over_two_p = 0.5 / aij;
+        double a0 = aij;
+        const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0;
+        const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0;
+        a0 *= q_over_p_plus_q;
+        const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+        const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0;
+        a0 *= theta;
+
+        const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q;
+        const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz);
+        // For tiny boys_input (T) use the series R000_0 = 1 - T/3, R000_1 = -2*a0*(1/3 - T/5) instead of
+        // skipping the primitive: the PA * R000_0 term does not vanish when the grid point sits on P.
+        const double sqrt_boys_input = sqrt(boys_input);
+        const double R000_0 = boys_input > 3.e-7 ? SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input) : 1.0 - boys_input / 3.0;
+        const double R000_1 = boys_input > 3.e-7 ? -a0 * (R000_0 - exp(-boys_input)) / boys_input : -2.0 * a0 * (1.0 / 3.0 - boys_input / 5.0);
+        deri_dAx += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx);
+        deri_dAy += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy);
+        deri_dAz += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz);
+    }
+
+    const int* ao_loc = c_bpcache.ao_loc;
+    const int i0 = ao_loc[ish];
+    const int j0 = ao_loc[jsh];
+
+    const double Dij = density[i0 + j0 * nao];
+    deri_dAx *= Dij;
+    deri_dAy *= Dij;
+    deri_dAz *= Dij;
+
+    const int i_atom = aoslice[ish];
+    atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 0)), deri_dAx);
+    atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 1)), deri_dAy);
+    atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 2)), deri_dAz);
+}
+
 __global__
 static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
                                                              const BasisProdOffsets offsets, const int nprim_ij,
@@ -282,3 +376,85 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, con
     atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum);
     atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum);
 }
+
+// (s|s) kernel for the charge-contracted ip2 int3c1e integrals: for one shell pair and one grid point it sums the
+// deri_dC{x,y,z} terms over the pair's primitives, multiplies them by the grid point's charge (grid_point[3]) and
+// atomically adds the result to output[i0 + j0 * stride_j + (3 * gridslice[grid] + xyz) * stride_ij].
+// Thread layout: x index -> shell pair (offsets.ntasks_ij), y index -> grid point (offsets.ntasks_kl).
+__global__
+static void GINTfill_int3c1e_ip2_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij,
+                                                            const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice,
+                                                            const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+    if (task_ij >= offsets.ntasks_ij || task_grid >= offsets.ntasks_kl) {
+        return;  // outside the (shell pair, grid point) task rectangle
+    }
+
+    const int pair_index = offsets.bas_ij + task_ij;
+    const int ish = c_bpcache.bas_pair2bra[pair_index];
+    const int jsh = c_bpcache.bas_pair2ket[pair_index];
+    const int prim_begin = offsets.primitive_ij + task_ij * nprim_ij;
+    const int prim_end = prim_begin + nprim_ij;
+
+    // Primitive-pair tables indexed by ij: a12 -> aij, e12 -> eij, (x12, y12, z12) -> point P.
+    const double* __restrict__ a12 = c_bpcache.a12;
+    const double* __restrict__ e12 = c_bpcache.e12;
+    const double* __restrict__ x12 = c_bpcache.x12;
+    const double* __restrict__ y12 = c_bpcache.y12;
+    const double* __restrict__ z12 = c_bpcache.z12;
+
+    // Each grid record is 4 doubles: x, y, z, charge.
+    const double* grid_point = grid_points + task_grid * 4;
+    const double Cx = grid_point[0];
+    const double Cy = grid_point[1];
+    const double Cz = grid_point[2];
+    const double charge = grid_point[3];
+    // A null charge_exponents array is treated as exponent 0, which turns the q / (p + q) factors into 1.
+    const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+    const bool has_charge_exponent = charge_exponent > 0.0;
+    const bool has_omega = omega > 0.0;
+
+    double sum_x = 0;
+    double sum_y = 0;
+    double sum_z = 0;
+    for (int ij = prim_begin; ij < prim_end; ij++) {
+        const double aij = a12[ij];
+        const double PCx = x12[ij] - Cx;
+        const double PCy = y12[ij] - Cy;
+        const double PCz = z12[ij] - Cz;
+
+        // a0 = aij * q / (p + q) * theta; each factor is 1 when its parameter is not positive.
+        const double q_over_p_plus_q = has_charge_exponent ? charge_exponent / (aij + charge_exponent) : 1.0;
+        const double sqrt_q_over_p_plus_q = has_charge_exponent ? sqrt(q_over_p_plus_q) : 1.0;
+        const double a_charge_scaled = aij * q_over_p_plus_q;
+        const double theta = has_omega ? omega * omega / (omega * omega + a_charge_scaled) : 1.0;
+        const double sqrt_theta = has_omega ? sqrt(theta) : 1.0;
+        const double a0 = a_charge_scaled * theta;
+
+        const double prefactor = 2.0 * M_PI / aij * e12[ij] * sqrt_theta * sqrt_q_over_p_plus_q;
+        const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz);
+        if (!(boys_input > 3.e-7)) {
+            continue;  // primitive pairs at or below this threshold contribute nothing
+        }
+
+        const double sqrt_boys_input = sqrt(boys_input);
+        const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input);
+        const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input;
+        sum_x += prefactor * (R000_1 * PCx);
+        sum_y += prefactor * (R000_1 * PCy);
+        sum_z += prefactor * (R000_1 * PCz);
+    }
+
+    // The element offset is built in 64-bit arithmetic: 3 * atom * stride_ij can exceed INT_MAX once the
+    // output holds many atom blocks, and a 32-bit product would wrap before the pointer addition.
+    const int* ao_loc = c_bpcache.ao_loc;
+    const long long i0 = ao_loc[ish] - ao_offsets_i;
+    const long long j0 = ao_loc[jsh] - ao_offsets_j;
+    const long long atom_offset = (long long)gridslice[task_grid] * 3 * stride_ij;
+    double* out = output + (i0 + j0 * stride_j + atom_offset);
+    atomicAdd(out, sum_x * charge);
+    atomicAdd(out + stride_ij, sum_y * charge);
+    atomicAdd(out + 2LL * stride_ij, sum_z * charge);
+}
diff --git a/gpu4pyscf/lib/gint/g3c1e_ip.cu b/gpu4pyscf/lib/gint/g3c1e_ip.cu
index b7524ca2..9a806bef 100644
--- a/gpu4pyscf/lib/gint/g3c1e_ip.cu
+++ b/gpu4pyscf/lib/gint/g3c1e_ip.cu
@@ -366,6 +366,104 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel_general(double* output
     }
 }
 
+// Density-contracted writer for the ip1 int3c1e integrals of one primitive pair and one grid point.
+// For every Cartesian component pair (i, j) of shells (ish, jsh) it combines the gx/gy/gz tables into the three
+// deri_dA{x,y,z} values, weights them with density[(i0 + i) + (j0 + j) * nao] and accumulates the weighted sum.
+// All (i, j) terms target the same three slots output[i_grid + ngrids * (3 * aoslice[ish] + xyz)], so they are
+// summed locally and published with three atomicAdd calls per invocation rather than three per (i, j).
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ip1_density_contracted(const double* g, double* output, const double minus_two_a, const double* density, const int* aoslice, const int nao,
+                                                     const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, const int ngrids)
+{
+    const int i0 = c_bpcache.ao_loc[ish];
+    const int j0 = c_bpcache.ao_loc[jsh];
+    const int i_atom = aoslice[ish];
+    const int *idx = c_idx;
+    const int *idy = c_idx + TOT_NF;
+    const int *idz = c_idx + TOT_NF * 2;
+
+    // g holds gx | gy | gz back to back; each table has (i_l + 2) * (j_l + 1) entries of NROOTS roots.
+    const int i_stride = i_l + 1 + 1;
+    const int g_size = NROOTS * i_stride * (j_l + 1);
+    const double* __restrict__ gx = g;
+    const double* __restrict__ gy = g + g_size;
+    const double* __restrict__ gz = g + g_size * 2;
+
+    const int nf_i = (i_l + 1) * (i_l + 2) / 2;
+    const int nf_j = (j_l + 1) * (j_l + 2) / 2;
+    const int loc_i0 = c_l_locs[i_l];
+    const int loc_j0 = c_l_locs[j_l];
+    double sum_dAx = 0, sum_dAy = 0, sum_dAz = 0;
+    for (int j = 0; j < nf_j; j++) {
+        const int jx = idx[loc_j0 + j], jy = idy[loc_j0 + j], jz = idz[loc_j0 + j];
+        for (int i = 0; i < nf_i; i++) {
+            const int ix = idx[loc_i0 + i], iy = idy[loc_i0 + i], iz = idz[loc_i0 + i];
+            const int gx_offset = ix + jx * i_stride;
+            const int gy_offset = iy + jy * i_stride;
+            const int gz_offset = iz + jz * i_stride;
+
+            double deri_dAx = 0, deri_dAy = 0, deri_dAz = 0;
+#pragma unroll
+            for (int i_root = 0; i_root < NROOTS; i_root++) {
+                const double gx_0 = gx[gx_offset * NROOTS + i_root];
+                const double gy_0 = gy[gy_offset * NROOTS + i_root];
+                const double gz_0 = gz[gz_offset * NROOTS + i_root];
+                // Per axis: power * g[power - 1] (skipped when the power is 0) plus minus_two_a * g[power + 1].
+                const double dgx_dAx = (ix > 0 ? ix * gx[(gx_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gx[(gx_offset + 1) * NROOTS + i_root];
+                const double dgy_dAy = (iy > 0 ? iy * gy[(gy_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gy[(gy_offset + 1) * NROOTS + i_root];
+                const double dgz_dAz = (iz > 0 ? iz * gz[(gz_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gz[(gz_offset + 1) * NROOTS + i_root];
+                deri_dAx += dgx_dAx * gy_0 * gz_0;
+                deri_dAy += gx_0 * dgy_dAy * gz_0;
+                deri_dAz += gx_0 * gy_0 * dgz_dAz;
+            }
+            const double Dij = density[(i + i0) + (j + j0) * nao];
+            sum_dAx += deri_dAx * Dij;
+            sum_dAy += deri_dAy * Dij;
+            sum_dAz += deri_dAz * Dij;
+        }
+    }
+
+    atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 0)), sum_dAx);
+    atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 1)), sum_dAy);
+    atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 2)), sum_dAz);
+}
+
+// General kernel for the density-contracted ip1 int3c1e integrals, templated on the number of Rys roots.
+// One thread handles one (shell pair, grid point) task: x index -> shell pair, y index -> grid point.
+// GSIZE_INT3C_1E sizes the local g array; the writer reads 3 * NROOTS * (i_l + 2) * (j_l + 1) doubles from it.
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ip1_density_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                                   const double* density, const int* aoslice, const int nao,
+                                                                   const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+    const int ngrids = offsets.ntasks_kl;
+    if (task_ij >= offsets.ntasks_ij || task_grid >= ngrids) {
+        return;
+    }
+
+    const int pair_index = offsets.bas_ij + task_ij;
+    const int ish = c_bpcache.bas_pair2bra[pair_index];
+    const int jsh = c_bpcache.bas_pair2ket[pair_index];
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+
+    // Grid records are strided by 3 doubles in this kernel.
+    const double* grid_point = grid_points + task_grid * 3;
+    const double charge_exponent = (charge_exponents == NULL) ? 0.0 : charge_exponents[task_grid];
+
+    double g[GSIZE_INT3C_1E];
+    const int prim_begin = offsets.primitive_ij + task_ij * nprim_ij;
+    const int prim_end = prim_begin + nprim_ij;
+    for (int ij = prim_begin; ij < prim_end; ++ij) {
+        // g is built with i_l + 1 on the bra side because the writer also reads the (power + 1) entries.
+        GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega);
+        GINTwrite_int3c1e_ip1_density_contracted<NROOTS>(g, output, -2.0 * a_exponents[ij], density, aoslice, nao, ish, jsh, task_grid, i_l, j_l, ngrids);
+    }
+}
+
 template <int L_SUM>
 __global__
 static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
@@ -464,3 +562,120 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* outpu
     atomicAdd(output + task_grid + ngrids * 1, deri_dCy_pair_sum);
     atomicAdd(output + task_grid + ngrids * 2, deri_dCz_pair_sum);
 }
+
+// Writer for the charge-contracted ip2 int3c1e integrals of one primitive pair and one grid point.
+// Each Cartesian component pair (i, j) yields deri_dC{x,y,z}, scaled by `prefactor`, and is atomically added to
+//   output[(i0 + i) + (j0 + j) * stride_j + (3 * gridslice[i_grid] + xyz) * stride_ij].
+// u2 supplies one value per Rys root, AC is a 3-vector; minus_two_a and ngrids are not read by this function.
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ip2_charge_contracted(const double* g, double* output, const double minus_two_a, const double* u2, const double* AC, const double prefactor,
+                                                    const int ish, const int jsh, const int i_grid, const int i_l, const int j_l,
+                                                    const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, const int ngrids)
+{
+    const int* ao_loc = c_bpcache.ao_loc;
+    const int i0 = ao_loc[ish] - ao_offsets_i;
+    const int j0 = ao_loc[jsh] - ao_offsets_j;
+    // 64-bit on purpose: 3 * atom * stride_ij can exceed INT_MAX when the output holds many atom blocks.
+    const long long atom_offset = (long long)gridslice[i_grid] * 3 * stride_ij;
+
+    const int *idx = c_idx;
+    const int *idy = c_idx + TOT_NF;
+    const int *idz = c_idx + TOT_NF * 2;
+
+    const int i_stride = i_l + 1 + 1;
+    const int g_size = NROOTS * i_stride * (j_l + 1);
+    const double* __restrict__ gx = g;
+    const double* __restrict__ gy = g + g_size;
+    const double* __restrict__ gz = g + g_size * 2;
+
+    const int nf_i = (i_l + 1) * (i_l + 2) / 2;
+    const int nf_j = (j_l + 1) * (j_l + 2) / 2;
+    for (int j = 0; j < nf_j; j++) {
+        const int loc_j = c_l_locs[j_l] + j;
+        const int jx = idx[loc_j];
+        const int jy = idy[loc_j];
+        const int jz = idz[loc_j];
+        for (int i = 0; i < nf_i; i++) {
+            const int loc_i = c_l_locs[i_l] + i;
+            const int ix = idx[loc_i];
+            const int iy = idy[loc_i];
+            const int iz = idz[loc_i];
+            const int gx_offset = ix + jx * i_stride;
+            const int gy_offset = iy + jy * i_stride;
+            const int gz_offset = iz + jz * i_stride;
+
+            double deri_dCx = 0, deri_dCy = 0, deri_dCz = 0;
+#pragma unroll
+            for (int i_root = 0; i_root < NROOTS; i_root++) {
+                const double gx_0 = gx[gx_offset * NROOTS + i_root];
+                const double gy_0 = gy[gy_offset * NROOTS + i_root];
+                const double gz_0 = gz[gz_offset * NROOTS + i_root];
+                const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root];
+                const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root];
+                const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root];
+                const double minus_two_u2 = -2.0 * u2[i_root];
+                const double dgx_dCx = minus_two_u2 * (gx_1 + AC[0] * gx_0);
+                const double dgy_dCy = minus_two_u2 * (gy_1 + AC[1] * gy_0);
+                const double dgz_dCz = minus_two_u2 * (gz_1 + AC[2] * gz_0);
+                deri_dCx += dgx_dCx * gy_0 * gz_0;
+                deri_dCy += gx_0 * dgy_dCy * gz_0;
+                deri_dCz += gx_0 * gy_0 * dgz_dCz;
+            }
+            // One output element per (i, j); the x, y, z components sit stride_ij elements apart.
+            double* out = output + ((i + i0) + (long long)(j + j0) * stride_j + atom_offset);
+            atomicAdd(out, deri_dCx * prefactor);
+            atomicAdd(out + stride_ij, deri_dCy * prefactor);
+            atomicAdd(out + 2LL * stride_ij, deri_dCz * prefactor);
+        }
+    }
+}
+
+// General kernel for the charge-contracted ip2 int3c1e integrals, templated on the number of Rys roots.
+// Each thread takes one (shell pair, grid point) task, builds g and u2 for every primitive pair and passes them to
+// GINTwrite_int3c1e_ip2_charge_contracted together with AC = A - C and the grid point's charge as prefactor.
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                                  const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice,
+                                                                  const double omega, const double* grid_points, const double* charge_exponents)
+{
+    const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+    const int ngrids = offsets.ntasks_kl;
+    if (task_ij >= offsets.ntasks_ij || task_grid >= ngrids) {
+        return;
+    }
+
+    const int pair_index = offsets.bas_ij + task_ij;
+    const int ish = c_bpcache.bas_pair2bra[pair_index];
+    const int jsh = c_bpcache.bas_pair2ket[pair_index];
+    const double* __restrict__ a_exponents = c_bpcache.a1;
+
+    // bas_coords is read as three consecutive runs of nbas values: x first, then y, then z.
+    const int nbas = c_bpcache.nbas;
+    const double* __restrict__ bas_coords = c_bpcache.bas_coords;
+
+    // Grid records are 4 doubles: x, y, z, charge.
+    const double* grid_point = grid_points + task_grid * 4;
+    const double AC[3] {
+        bas_coords[ish] - grid_point[0],
+        bas_coords[nbas + ish] - grid_point[1],
+        bas_coords[2 * nbas + ish] - grid_point[2]
+    };
+    const double charge = grid_point[3];
+    const double charge_exponent = (charge_exponents == NULL) ? 0.0 : charge_exponents[task_grid];
+
+    // Local work arrays filled by GINT_g1e_save_u2: g, and u2 with one entry per Rys root.
+    double g[GSIZE_INT3C_1E];
+    double u2[NROOTS];
+    const int prim_begin = offsets.primitive_ij + task_ij * nprim_ij;
+    const int prim_end = prim_begin + nprim_ij;
+    // g is built with i_l + 1 on the bra side because the writer also reads the (power + 1) entries.
+    for (int ij = prim_begin; ij < prim_end; ++ij) {
+        GINT_g1e_save_u2<NROOTS>(g, u2, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega);
+        // minus_two_a is forwarded as before; the ip2 writer in this file never reads it.
+        const double minus_two_a = -2.0 * a_exponents[ij];
+        GINTwrite_int3c1e_ip2_charge_contracted<NROOTS>(g, output, minus_two_a, u2, AC, charge, ish, jsh, task_grid, i_l, j_l, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, ngrids);
+    }
+}
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
index e0ace197..3ee7c423 100644
--- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu
@@ -113,6 +113,55 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba
     return 0;
 }
 
+// Launcher for the density-contracted ip1 int3c1e kernels of one (i_l, j_l) shell-pair type.
+// Kernels are launched on `stream` with a 2D layout: x covers offsets.ntasks_ij shell pairs, y covers
+// offsets.ntasks_kl grid points. Returns 0 when the launch was accepted and 1 when no kernel matches the type
+// or cudaGetLastError reports an error after the launch.
+static int GINTfill_int3c1e_ip1_density_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                         const double* density, const int* aoslice, const int nao,
+                                                         const double omega, const double* grid_points, const double* charge_exponents,
+                                                         const cudaStream_t stream)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const dim3 block_dim(THREADSX, THREADSY);
+    const dim3 grid_dim((ntasks_ij + THREADSX - 1) / THREADSX, (ngrids + THREADSY - 1) / THREADSY);
+    const int type_ij = i_l * 10 + j_l;
+    const int nrys_roots = (i_l + j_l + 1) / 2 + 1;
+
+    if (type_ij == 0) {
+        GINTfill_int3c1e_ip1_density_contracted_kernel00<<<grid_dim, block_dim, 0, stream>>>(
+            output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents);
+    } else {
+        // The specialised `_kernel_expanded<i_l, j_l>` launches (types 01-04, 10-13, 20-22, 30, 31, 40) are not
+        // enabled, so every type other than 00 uses the general kernel selected by its Rys root count.
+        // NOTE(review): the GSIZE*_INT3C_1E arguments alternate 5/4/5/4/5 instead of following NROOTS - confirm.
+        switch (nrys_roots) {
+        case 1: GINTfill_int3c1e_ip1_density_contracted_kernel_general<1, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break;
+        case 2: GINTfill_int3c1e_ip1_density_contracted_kernel_general<2, GSIZE4_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ip1_density_contracted_kernel_general<3, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ip1_density_contracted_kernel_general<4, GSIZE4_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ip1_density_contracted_kernel_general<5, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break;
+        default:
+            fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots);
+            return 1;
+        }
+    }
+
+    // cudaGetLastError reports a rejected launch; it does not wait for the kernel to finish.
+    const cudaError_t launch_status = cudaGetLastError();
+    if (launch_status == cudaSuccess) {
+        return 0;
+    }
+    fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(launch_status));
+    return 1;
+}
+
 static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
                                                          const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
                                                          const double omega, const double* grid_points, const double* charge_exponents,
@@ -147,12 +196,62 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d
     return 0;
 }
 
+// Launcher for the charge-contracted ip2 int3c1e kernels of one (i_l, j_l) shell-pair type.
+// Kernels are launched on `stream` with a 2D layout: x covers offsets.ntasks_ij shell pairs, y covers
+// offsets.ntasks_kl grid points. Returns 0 when the launch was accepted and 1 when no kernel matches the type
+// or cudaGetLastError reports an error after the launch.
+static int GINTfill_int3c1e_ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+                                                        const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+                                                        const int* gridslice,
+                                                        const double omega, const double* grid_points, const double* charge_exponents,
+                                                        const cudaStream_t stream)
+{
+    const int ntasks_ij = offsets.ntasks_ij;
+    const int ngrids = offsets.ntasks_kl;
+    const dim3 block_dim(THREADSX, THREADSY);
+    const dim3 grid_dim((ntasks_ij + THREADSX - 1) / THREADSX, (ngrids + THREADSY - 1) / THREADSY);
+    const int type_ij = i_l * 10 + j_l;
+    const int nrys_roots = (i_l + j_l + 1) / 2 + 1;
+
+    if (type_ij == 0) {
+        GINTfill_int3c1e_ip2_charge_contracted_kernel00<<<grid_dim, block_dim, 0, stream>>>(
+            output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents);
+    } else {
+        // The specialised `_kernel_expanded<i_l, j_l>` launches (types 01-04, 10-13, 20-22, 30, 31, 40) are not
+        // enabled, so every type other than 00 uses the general kernel selected by its Rys root count.
+        // NOTE(review): the GSIZE*_INT3C_1E arguments alternate 5/4/5/4/5 instead of following NROOTS - confirm.
+        switch (nrys_roots) {
+        case 1: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<1, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break;
+        case 2: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<2, GSIZE4_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<3, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E><<<grid_dim, block_dim, 0, stream>>>(
+                    output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break;
+        default:
+            fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots);
+            return 1;
+        }
+    }
+
+    // cudaGetLastError reports a rejected launch; it does not wait for the kernel to finish.
+    const cudaError_t launch_status = cudaGetLastError();
+    if (launch_status == cudaSuccess) {
+        return 0;
+    }
+    fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(launch_status));
+    return 1;
+}
+
 extern "C" {
 int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache,
                         const double* grid_points, const double* charge_exponents, const int ngrids,
                         double* integrals,
                         const int* strides, const int* ao_offsets,
-                        const int* bins_locs_ij, int nbins,
+                        const int* bins_locs_ij, const int nbins,
                         const int cp_ij_id, const double omega)
 {
     const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
@@ -198,11 +297,63 @@ int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache
     return 0;
 }
 
+// C entry point for the density-contracted ip1 int3c1e integrals of the shell-pair type cp_ij_id.
+// Copies *bpcache into the device symbol c_bpcache, then for every non-empty bin
+// [bins_locs_ij[b], bins_locs_ij[b + 1]) launches the matching kernels on `stream`.
+// Returns 0 on success, 2 when the type needs too many Rys roots, or the non-zero status of the first failing bin.
+// NOTE(review): the output parameter is still named integral_charge_contracted although this is the density path.
+int GINTfill_int3c1e_ip1_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+                                            const double* grid_points, const double* charge_exponents, const int ngrids,
+                                            double* integral_charge_contracted,
+                                            const int* bins_locs_ij, const int nbins,
+                                            const int cp_ij_id,
+                                            const double* density, const int* aoslice, const int nao,
+                                            const double omega)
+{
+    const ContractionProdType* cp_ij = bpcache->cptype + cp_ij_id;
+    const int i_l = cp_ij->l_bra;
+    const int j_l = cp_ij->l_ket;
+    const int nprim_ij = cp_ij->nprim_12;
+
+    const int nrys_roots = (i_l + j_l + 1) / 2 + 1;
+    if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) {
+        fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+        return 2;
+    }
+
+    checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+    for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+        const int bas_ij0 = bins_locs_ij[ij_bin];
+        const int ntasks_ij = bins_locs_ij[ij_bin + 1] - bas_ij0;
+        // Bins without shell pairs launch nothing.
+        if (ntasks_ij > 0) {
+            BasisProdOffsets offsets;
+            offsets.ntasks_ij = ntasks_ij;
+            offsets.ntasks_kl = ngrids;  // the kl task count carries the number of grid points
+            offsets.bas_ij = bpcache->bas_pairs_locs[cp_ij_id] + bas_ij0;
+            offsets.primitive_ij = bpcache->primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij;
+            offsets.bas_kl = -1;
+            offsets.primitive_kl = -1;
+
+            const int status = GINTfill_int3c1e_ip1_density_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
+                                                                             density, aoslice, nao,
+                                                                             omega, grid_points, charge_exponents, stream);
+            if (status != 0) {
+                return status;
+            }
+        }
+    }
+
+    return 0;
+}
+
+
 int GINTfill_int3c1e_ip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
                                            const double* grid_points, const double* charge_exponents, const int ngrids,
                                            double* integral_charge_contracted,
                                            const int* strides, const int* ao_offsets,
-                                           const int* bins_locs_ij, int nbins,
+                                           const int* bins_locs_ij, const int nbins,
                                            const int cp_ij_id, const double omega, const int n_charge_sum_per_thread)
 {
     const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
@@ -252,7 +403,7 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas
                                             const double* grid_points, const double* charge_exponents, const int ngrids,
                                             const double* dm_pair_ordered, const int* density_offset,
                                             double* integral_density_contracted,
-                                            const int* bins_locs_ij, int nbins,
+                                            const int* bins_locs_ij, const int nbins,
                                             const int cp_ij_id, const double omega, const int n_pair_sum_per_thread)
 {
     const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
@@ -302,4 +453,56 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas
 
     return 0;
 }
+
+int GINTfill_int3c1e_ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+                                           const double* grid_points, const double* charge_exponents, const int ngrids,
+                                           double* integral_charge_contracted,
+                                           const int* strides, const int* ao_offsets,
+                                           const int* bins_locs_ij, const int nbins,
+                                           const int cp_ij_id,
+                                           const int* gridslice,
+                                           const double omega)
+{
+    // Host-side driver: calls the ip2 charge-contracted int3c1e task routine once
+    // per non-empty ij bin of pair type cp_ij_id. Returns 0 on success, 2 when too
+    // many Rys roots are needed, else the first non-zero code from the task routine.
+    const ContractionProdType *pair_type = bpcache->cptype + cp_ij_id;
+    const int i_l = pair_type->l_bra;
+    const int j_l = pair_type->l_ket;
+    const int nprim_ij = pair_type->nprim_12;
+    const int nrys_roots = (i_l + j_l + 1) / 2 + 1;
+    if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) {
+        fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+        return 2;
+    }
+
+    // Copy the cache descriptor into the device symbol c_bpcache.
+    checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+    const int* pair_locs = bpcache->bas_pairs_locs;
+    const int* prim_pair_locs = bpcache->primitive_pairs_locs;
+    for (int ibin = 0; ibin < nbins; ++ibin) {
+        const int bin_begin = bins_locs_ij[ibin];
+        const int ntasks_ij = bins_locs_ij[ibin + 1] - bin_begin;
+        if (ntasks_ij <= 0) {
+            continue;  // empty bin: nothing to compute
+        }
+        // ij side: this bin's pairs; kl side: ngrids tasks, offsets set to -1.
+        BasisProdOffsets offsets;
+        offsets.bas_ij = pair_locs[cp_ij_id] + bin_begin;
+        offsets.primitive_ij = prim_pair_locs[cp_ij_id] + bin_begin * nprim_ij;
+        offsets.ntasks_ij = ntasks_ij;
+        offsets.bas_kl = -1;
+        offsets.primitive_kl = -1;
+        offsets.ntasks_kl = ngrids;
+        const int status = GINTfill_int3c1e_ip2_charge_contracted_tasks(
+            integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
+            strides[0], strides[1], ao_offsets[0], ao_offsets[1],
+            gridslice, omega, grid_points, charge_exponents, stream);
+        if (status != 0) {
+            return status;
+        }
+    }
+    return 0;
+}
 }
diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
new file mode 100644
index 00000000..c961a9a2
--- /dev/null
+++ b/gpu4pyscf/lib/memcpy.py
@@ -0,0 +1,95 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import cupy
+import numpy as np
+
+def find_contiguous_chunks(shape, h_strides, d_strides):
+    """
+    Return (chunk_shape, chunk_size) of the trailing dimensions that are
+    C-contiguous in both layouts. Strides are in units of elements.
+    Axes of length 1 are always absorbed, since their stride is never used.
+    """
+    chunk_shape = []
+    chunk_size = 1
+    for dim, h_stride, d_stride in zip(reversed(shape), reversed(h_strides), reversed(d_strides)):
+        if dim != 1 and (h_stride != chunk_size or d_stride != chunk_size):
+            break
+        chunk_shape.append(dim)
+        chunk_size *= dim
+    return tuple(reversed(chunk_shape)), chunk_size
+
+def copy_array(src_view, out=None):
+    '''Copy src_view (cupy or numpy array) into out and return out.
+
+    If out is None, a new cupy array is allocated with cupy.empty_like.
+    Raises ValueError when out is given and its shape differs from src_view.
+    '''
+    if out is None:
+        return _copy_array(src_view, cupy.empty_like(src_view))
+    if out.shape != src_view.shape:
+        raise ValueError("Host and device views must have the same shape.")
+    return _copy_array(src_view, out)
+
+def _copy_array(src_view, dst_view):
+    ''' Copy src_view into dst_view and return dst_view.
+
+    The trailing axes that are C-contiguous in both arrays form one chunk;
+    each chunk is transferred with one cupy.cuda.runtime.memcpy call.
+    Supports device->device, device->host and host->device copies.
+
+    Raises ValueError on a dtype mismatch and NotImplementedError for any
+    other combination of array types (e.g. numpy to numpy).
+    '''
+    if src_view.nbytes == 0:
+        return dst_view
+    if src_view.dtype != dst_view.dtype:
+        # memcpy moves raw bytes, it cannot convert between element types
+        raise ValueError("Source and destination must have the same dtype.")
+
+    src_on_device = isinstance(src_view, cupy.ndarray)
+    dst_on_device = isinstance(dst_view, cupy.ndarray)
+    if src_on_device and dst_on_device:
+        kind = cupy.cuda.runtime.memcpyDeviceToDevice
+    elif src_on_device and isinstance(dst_view, np.ndarray):
+        kind = cupy.cuda.runtime.memcpyDeviceToHost
+    elif isinstance(src_view, np.ndarray) and dst_on_device:
+        kind = cupy.cuda.runtime.memcpyHostToDevice
+    else:
+        raise NotImplementedError(
+            "only cupy<->cupy and cupy<->numpy copies are supported")
+    src_data_ptr = src_view.data.ptr if src_on_device else src_view.ctypes.data
+    dst_data_ptr = dst_view.data.ptr if dst_on_device else dst_view.ctypes.data
+
+    shape = src_view.shape
+    itemsize = src_view.itemsize
+    strides_src = [stride // itemsize for stride in src_view.strides]
+    strides_dst = [stride // itemsize for stride in dst_view.strides]
+
+    # Find the largest contiguous chunk
+    chunk_shape, chunk_size = find_contiguous_chunks(shape, strides_src, strides_dst)
+
+    # Transfer data chunk-by-chunk. When no trailing axis is contiguous the
+    # chunk is a single element; slicing with an explicit stop avoids the
+    # shape[:-0] == () pitfall that would copy only the first element.
+    outer_dims = shape[:len(shape) - len(chunk_shape)]
+    for outer_index in np.ndindex(*outer_dims):
+        # Element offsets of the current chunk inside each array
+        src_offset = sum(i * s for i, s in zip(outer_index, strides_src))
+        dst_offset = sum(i * s for i, s in zip(outer_index, strides_dst))
+        cupy.cuda.runtime.memcpy(dst_data_ptr + dst_offset * itemsize,
+                                 src_data_ptr + src_offset * itemsize,
+                                 chunk_size * itemsize, kind)
+    return dst_view
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 0f406c82..b322f8ed 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -19,7 +19,8 @@
 from gpu4pyscf.lib.cupy_helper import (
     take_last2d, transpose_sum, krylov, unpack_sparse,
     add_sparse, takebak, empty_mapped, dist_matrix,
-    grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph)
+    grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph,
+    copy_array)
 
 class KnownValues(unittest.TestCase):
     def test_take_last2d(self):
@@ -214,6 +215,41 @@ def test_unpack_tril(self):
         ref[:,idx,idy] = atril
         assert abs(a - ref).max() < 1e-12
 
+    def test_copy_host2dev(self):
+        '''Host -> device copies from a strided pinned-memory view.'''
+        host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8)
+        host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array)
+        host_data = host_data.reshape(10,10,10)
+        # Assign instead of +=: the pinned buffer is not guaranteed to be zeroed
+        host_data[:] = numpy.random.rand(10,10,10)
+        device_data = cupy.empty_like(host_data)
+        host_view = host_data[:, 8:]  # Non-contiguous view on the host
+        device_view = device_data[:, 8:]  # Non-contiguous view on the device
+
+        copy_array(host_view, device_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+        device_view[:] = 0  # reset so the next check cannot pass on stale data
+        copy_array(host_view.copy(), device_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+        device_view = copy_array(host_view)  # out=None allocates on the device
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
+    def test_copy_dev2host(self):
+        '''Device -> host copies into a strided pinned-memory view.'''
+        host_array = cupy.cuda.alloc_pinned_memory(3*10*10 * 8)
+        host_data = numpy.ndarray(3*10**2, dtype=cupy.float64, buffer=host_array)
+        host_data = host_data.reshape(3,10,10)
+        device_data = cupy.zeros_like(host_data)
+        device_data += cupy.random.rand(3,10,10)
+        host_view = host_data[:, 8:]  # Non-contiguous view on the host
+        device_view = device_data[:, 8:]  # Non-contiguous view on the device
+
+        copy_array(device_view, host_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+        host_view[:] = 0  # reset so the next check cannot pass on stale data
+        copy_array(device_view.copy(), host_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
 if __name__ == "__main__":
     print("Full tests for cupy helper module")
     unittest.main()
diff --git a/gpu4pyscf/mp/mp2.py b/gpu4pyscf/mp/mp2.py
index c7fe059a..c12d68e4 100644
--- a/gpu4pyscf/mp/mp2.py
+++ b/gpu4pyscf/mp/mp2.py
@@ -349,6 +349,8 @@ def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
     # to_cpu can be reused only when __init__ still takes mf
     def to_cpu(self):
         mf = self._scf.to_cpu()
+        if mf.converged:
+            mf.kernel() # create intermediate variables if converged
         from importlib import import_module
         mod = import_module(self.__module__.replace('gpu4pyscf', 'pyscf'))
         cls = getattr(mod, self.__class__.__name__)
diff --git a/gpu4pyscf/mp/tests/test_mp2.py b/gpu4pyscf/mp/tests/test_mp2.py
index 1570dd27..b5127816 100644
--- a/gpu4pyscf/mp/tests/test_mp2.py
+++ b/gpu4pyscf/mp/tests/test_mp2.py
@@ -37,6 +37,7 @@ def setUpModule():
                  'O': 'cc-pvdz',}
     mol.build()
     mol.incore_anyway = True
+    mol.max_memory = 32000
     mf = scf.RHF(mol)
     mf.conv_tol = 1e-12
     mf.scf()
diff --git a/gpu4pyscf/properties/ir.py b/gpu4pyscf/properties/ir.py
index 61cfa72e..ec8b65b1 100644
--- a/gpu4pyscf/properties/ir.py
+++ b/gpu4pyscf/properties/ir.py
@@ -93,8 +93,9 @@ def eval_ir_freq_intensity(mf, hessian_obj):
     h1ao = hessian_obj.make_h1(mo_coeff, mo_occ, None, atmlst)
     # TODO: compact with hessian method, which can save one time cphf solve.
     # ! Different from PySCF, mo1 is all in mo!
+    fx = hessian_obj.gen_vind(mo_coeff, mo_occ)
     mo1, mo_e1 = hessian_obj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1ao,
-                                       None, atmlst, hessian_obj.max_memory, log)
+                                       fx, atmlst, hessian_obj.max_memory, log)  
     mo1 = cupy.asarray(mo1)
     mo_e1 = cupy.asarray(mo_e1)
 
diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py
index 236e3e10..1b098c8f 100644
--- a/gpu4pyscf/qmmm/pbc/itrf.py
+++ b/gpu4pyscf/qmmm/pbc/itrf.py
@@ -1009,7 +1009,7 @@ def calculate_h1e(self, h1_gpu):
             nao = mol.nao
             if mm_mol.charge_model == 'gaussian' and len(coords) != 0:
                 expnts = cp.hstack([mm_mol.get_zetas()] * len(Ls))[mask]
-                g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts).transpose(0,2,1)
+                g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts)
             elif mm_mol.charge_model == 'point' and len(coords) != 0:
                 raise RuntimeError("Not tested yet")
                 max_memory = self.max_memory - lib.current_memory()[0]
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index 28d76b57..3a0497ff 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -391,6 +391,9 @@ def __init__(self, mol):
         self._opt_gpu = {None: None}
         self._eri = None # Note: self._eri requires large amount of memory
 
+    __getstate__, __setstate__ = pyscf_lib.generate_pickle_methods(
+        excludes=('_opt_gpu', '_eri', '_numint'))
+
     def check_sanity(self):
         s1e = self.get_ovlp()
         if isinstance(s1e, cupy.ndarray) and s1e.ndim == 2:
diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
index 2ecb5293..3d98ae5f 100644
--- a/gpu4pyscf/scf/j_engine.py
+++ b/gpu4pyscf/scf/j_engine.py
@@ -26,6 +26,7 @@
 from pyscf import __config__
 from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum
 from gpu4pyscf.__config__ import props as gpu_specs
+from gpu4pyscf.__config__ import _num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf import jk
 from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff
@@ -51,7 +52,10 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None):
     cput0 = log.init_timer()
     if vhfopt is None:
         with mol.with_range_coulomb(omega):
-            vhfopt = _VHFOpt(mol).build()
+            groupsize = None
+            if _num_devices > 1:                
+                groupsize = jk.GROUP_SIZE
+            vhfopt = _VHFOpt(mol).build(group_size=groupsize)
     if omega is None:
         omega = mol.omega
 
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index e1ff1d34..0e328204 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -58,8 +58,9 @@
 SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE',
                    int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
 THREADS = 256
+GROUP_SIZE = 256
 
-def _jk_task(mol, dms, vhfopt, task_list,
+def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
              device_id=0, with_j=True, with_k=True, verbose=None):
     n_dm = dms.shape[0]
     nao, _ = vhfopt.coeff.shape
@@ -76,6 +77,10 @@ def _jk_task(mol, dms, vhfopt, task_list,
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
 
+        if hermi == 0:
+            # Contract the tril and triu parts separately
+            dms = cp.vstack([dms, dms.transpose(0,2,1)])
+        n_dm = dms.shape[0]
         tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
         q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
         s_ptr = lib.c_null_ptr()
@@ -103,41 +108,51 @@ def _jk_task(mol, dms, vhfopt, task_list,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                                l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
+        if with_j:
+            if hermi == 1:
+                vj *= 2.
+            else:
+                vj, vjT = vj[:n_dm//2], vj[n_dm//2:]
+                vj += vjT.transpose(0,2,1)
+        if with_k:
+            if hermi == 1:
+                vk = transpose_sum(vk)
+            else:
+                vk, vkT = vk[:n_dm//2], vk[n_dm//2:]
+                vk += vkT.transpose(0,2,1)
     return vj, vk, kern_counts, timing_counter
 
 def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None):
@@ -157,9 +172,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
     dms = sandwich_dot(dms, vhfopt.coeff.T)
     dms = cp.asarray(dms, order='C')
-    if hermi == 0:
-        # Contract the tril and triu parts separately
-        dms = cp.vstack([dms, dms.transpose(0,2,1)])
+
     n_dm = dms.shape[0]
 
     assert with_j or with_k
@@ -171,7 +184,12 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
     n_groups = np.count_nonzero(uniq_l <= LMAX)
 
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1): 
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
@@ -183,8 +201,8 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
         for device_id in range(_num_devices):
             future = executor.submit(
                 _jk_task,
-                mol, dms, vhfopt, task_list[device_id],
-                with_j=with_j, with_k=with_k, verbose=verbose,
+                mol, dms, vhfopt, task_list[device_id], hermi=hermi,
+                with_j=with_j, with_k=with_k, verbose=verbose, 
                 device_id=device_id)
             futures.append(future)
 
@@ -210,28 +228,17 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     vj = vk = None
     if with_k:
         vk = reduce_to_device(vk_dist, inplace=True)
-        if hermi == 1:
-            vk = transpose_sum(vk)
-        else:
-            vk, vkT = vk[:n_dm//2], vk[n_dm//2:]
-            vk += vkT.transpose(0,2,1)
         #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff)
         vk = sandwich_dot(vk, vhfopt.coeff)
-        vk = vk.reshape(dm.shape)
-
+        
     if with_j:
         vj = reduce_to_device(vj_dist, inplace=True)
-        if hermi == 1:
-            vj *= 2.
-        else:
-            vj, vjT = vj[:n_dm//2], vj[n_dm//2:]
-            vj += vjT.transpose(0,2,1)
         vj = transpose_sum(vj)
         #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff)
         vj = sandwich_dot(vj, vhfopt.coeff)
-        vj = vj.reshape(dm.shape)
 
     h_shls = vhfopt.h_shls
+
     if h_shls:
         cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
         log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
@@ -270,6 +277,11 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
             for i, v in enumerate(vk1):
                 vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
         log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
+    
+    if with_j:
+        vj = vj.reshape(dm.shape)
+    if with_k:
+        vk = vk.reshape(dm.shape)
 
     log.timer('vj and vk', *cput0)
     return vj, vk
diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py
index 81da0361..6d9bf87b 100644
--- a/gpu4pyscf/scf/soscf.py
+++ b/gpu4pyscf/scf/soscf.py
@@ -27,7 +27,7 @@
 from pyscf.soscf import ciah
 from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu
 from gpu4pyscf.lib import logger
-from gpu4pyscf.scf import hf, rohf, uhf
+from gpu4pyscf.scf import hf, rohf, uhf, _response_functions
 from gpu4pyscf.lib.cupy_helper import transpose_sum, contract
 from gpu4pyscf.lib import utils
 
diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py
index 530f6cc8..dd8f7b51 100644
--- a/gpu4pyscf/scf/tests/test_rhf.py
+++ b/gpu4pyscf/scf/tests/test_rhf.py
@@ -273,8 +273,8 @@ def test_chkfile(self):
         mf_copy = scf.RHF(mol)
         mf_copy.chkfile = ftmp.name
         dm_loaded = mf_copy.init_guess_by_chkfile()
-        assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise.
-
+        # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise.
+        assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) 
     # TODO:
     #test analyze
     #test mulliken_pop
diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py
index 231a36cf..78ae68eb 100644
--- a/gpu4pyscf/scf/tests/test_scf_jk.py
+++ b/gpu4pyscf/scf/tests/test_scf_jk.py
@@ -15,7 +15,7 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
+from pyscf import lib, gto
 from gpu4pyscf.scf import jk
 from pyscf.scf.hf import get_jk
 
@@ -125,4 +125,3 @@ def test_jk_hermi0():
 
     assert abs(vj2+vj3 - vj1).max() < 1e-9
     assert abs(vk2+vk3 - vk1).max() < 1e-9
-    
\ No newline at end of file
diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py
index 924dfd2e..4a07bcc5 100644
--- a/gpu4pyscf/scf/tests/test_soscf.py
+++ b/gpu4pyscf/scf/tests/test_soscf.py
@@ -24,18 +24,18 @@ def setUpModule():
         verbose = 5,
         output = '/dev/null',
         atom = [
-        ["O" , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ],
+            ["O" , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. , 0.757  , 0.587)] ],
         basis = '6-31g')
 
     h2o_z1 = gto.M(
         verbose = 5,
         output = '/dev/null',
         atom = [
-        ["O" , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ],
+            ["O" , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. , 0.757  , 0.587)] ],
         basis = '6-31g',
         charge = 1,
         spin = 1,)
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 0544f751..3fe7cb6c 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -24,7 +24,7 @@
 from pyscf import lib
 from pyscf import gto
 from pyscf.grad import rhf as rhf_grad
-
+from gpu4pyscf.gto import int3c1e
 from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent
 from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2
 from gpu4pyscf.lib.cupy_helper import contract
@@ -239,11 +239,16 @@ def grad_qv(pcmobj, dm):
     grid_coords = pcmobj.surface['grid_coords']
     q_sym       = pcmobj._intermediates['q_sym']
 
-    dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2)
-    dq  = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2)
+    intopt = int3c1e.VHFOpt(mol)
+    intopt.build(1e-14, aosym=False)
+    dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, 
+                          direct_scf_tol = 1e-14, charge_exponents = charge_exp**2,
+                          intopt=intopt)
+    dq  = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, 
+                          direct_scf_tol = 1e-14, charge_exponents = charge_exp**2,
+                          intopt=intopt)
 
     aoslice = mol.aoslice_by_atom()
-    aoslice = cupy.array(aoslice)
     dvj = 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
     dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
     de = dq + dvj
diff --git a/gpu4pyscf/solvent/grad/smd.py b/gpu4pyscf/solvent/grad/smd.py
index a3d850db..32ebc2ee 100644
--- a/gpu4pyscf/solvent/grad/smd.py
+++ b/gpu4pyscf/solvent/grad/smd.py
@@ -25,100 +25,10 @@
 from gpu4pyscf.solvent import pcm, smd
 from gpu4pyscf.solvent.grad import pcm as pcm_grad
 from gpu4pyscf.lib import logger
-from gpu4pyscf.lib.cupy_helper import contract
 
 def get_cds(smdobj):
     return smd.get_cds_legacy(smdobj)[1]
 
-"""
-def grad_solver(smdobj, dm):
-    '''
-    dE = 0.5*v* d(K^-1 R) *v + q*dv
-    v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
-    '''
-    mol = smdobj.mol
-    log = logger.new_logger(mol, mol.verbose)
-    t1 = log.init_timer()
-    if not smdobj._intermediates:
-        smdobj.build()
-    dm_cache = smdobj._intermediates.get('dm', None)
-    if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10:
-        pass
-    else:
-        smdobj._get_vind(dm)
-
-    gridslice    = smdobj.surface['gslice_by_atom']
-    v_grids      = smdobj._intermediates['v_grids']
-    A            = smdobj._intermediates['A']
-    D            = smdobj._intermediates['D']
-    S            = smdobj._intermediates['S']
-    K            = smdobj._intermediates['K']
-    q            = smdobj._intermediates['q']
-
-    vK_1 = cupy.linalg.solve(K.T, v_grids)
-
-    dF, dA = pcm_grad.get_dF_dA(smdobj.surface)
-
-    with_D = smdobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']
-    dD, dS, dSii = pcm_grad.get_dD_dS(smdobj.surface, dF, with_D=with_D, with_S=True)
-
-    epsilon = smdobj.eps
-    de = cupy.zeros([smdobj.mol.natm,3])
-
-    def contract_bra(a, B, c):
-        ''' i,xij,j->jx '''
-        tmp = a.dot(B)
-        return (tmp * c).T
-
-    def contract_ket(a, B, c):
-        ''' i,xij,j->ix '''
-        tmp = B.dot(c)
-        return (a*tmp).T
-
-    # IEF-PCM and SS(V)PE formally are the same in gradient calculation
-    # dR = f_eps/(2*pi) * (dD*A + D*dA),
-    # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
-    f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
-    fac = f_epsilon/(2.0*np.pi)
-
-    Av = A*v_grids
-    de_dR  = 0.5*fac * contract_ket(vK_1, dD, Av)
-    de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av)
-    de_dR  = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_D = vK_1.dot(D)
-    vK_1_Dv = vK_1_D * v_grids
-    de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA)
-
-    de_dS0  = 0.5*contract_ket(vK_1, dS, q)
-    de_dS0 -= 0.5*contract_bra(vK_1, dS, q)
-    de_dS0  = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_q = vK_1 * q
-    de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii)
-
-    vK_1_DA = vK_1_D*A
-    de_dS1  = 0.5*contract_ket(vK_1_DA, dS, q)
-    de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q)
-    de_dS1  = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_DAq = vK_1_DA*q
-    de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii)
-
-    Sq = cupy.dot(S,q)
-    ASq = A*Sq
-    de_dD  = 0.5*contract_ket(vK_1, dD, ASq)
-    de_dD -= 0.5*contract_bra(vK_1, dD, ASq)
-    de_dD  = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA)   # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq)
-
-    de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
-    de += de_dR - de_dK
-
-    t1 = log.timer_debug1('grad solver', *t1)
-    return de.get()
-"""
 grad_solver = pcm_grad.grad_solver
 
 def make_grad_object(grad_method):
diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py
index 29b41588..538cb859 100644
--- a/gpu4pyscf/solvent/hessian/pcm.py
+++ b/gpu4pyscf/solvent/hessian/pcm.py
@@ -22,12 +22,16 @@
 from pyscf import lib, gto
 from gpu4pyscf import scf
 from gpu4pyscf.solvent.pcm import PI
-from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc
+from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import contract
 from gpu4pyscf.lib import logger
+from gpu4pyscf.hessian.jk import _ao2mo
+from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2
+from gpu4pyscf.gto import int3c1e
+from gpu4pyscf.gto.int3c1e import int1e_grids
 
 def hess_nuc(pcmobj):
+    raise NotImplementedError("Not tested")
     if not pcmobj._intermediates:
         pcmobj.build()
     mol = pcmobj.mol
@@ -149,76 +153,282 @@ def pcm_grad_scanner(mol):
     pcmobj.reset(pmol)
     return de
 
-def fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None):
-    '''
-    dv_solv / da
-    slow version with finite difference
-    '''
-    log = logger.new_logger(pcmobj, verbose)
-    t1 = log.init_timer()
-    pmol = pcmobj.mol.copy()
-    mol = pmol.copy()
-    if atmlst is None:
-        atmlst = range(mol.natm)
-    nao, nmo = mo_coeff.shape
-    mocc = mo_coeff[:,mo_occ>0]
-    nocc = mocc.shape[1]
-    coords = mol.atom_coords(unit='Bohr')
-    def pcm_vmat_scanner(mol):
-        pcmobj.reset(mol)
-        e, v = pcmobj._get_vind(dm)
-        return v
+def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K):
+    assert pcmobj._intermediates is not None
 
-    mol.verbose = 0
-    vmat = cupy.empty([len(atmlst), 3, nao, nocc])
-    eps = 1e-3
-    for i0, ia in enumerate(atmlst):
-        for ix in range(3):
-            dv = numpy.zeros_like(coords)
-            dv[ia,ix] = eps
-            mol.set_geom_(coords + dv, unit='Bohr')
-            vmat0 = pcm_vmat_scanner(mol)
+    gridslice    = pcmobj.surface['gslice_by_atom']
+    v_grids      = pcmobj._intermediates['v_grids']
+    A            = pcmobj._intermediates['A']
+    D            = pcmobj._intermediates['D']
+    S            = pcmobj._intermediates['S']
+    R            = pcmobj._intermediates['R']
+    q_sym        = pcmobj._intermediates['q_sym']
+    f_epsilon    = pcmobj._intermediates['f_epsilon']
 
-            mol.set_geom_(coords - dv, unit='Bohr')
-            vmat1 = pcm_vmat_scanner(mol)
+    ngrids = q_sym.shape[0]
 
-            grad_vmat = (vmat0 - vmat1)/2.0/eps
-            grad_vmat = contract("ij,jq->iq", grad_vmat, mocc)
-            grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff)
-            vmat[i0,ix] = grad_vmat
-    t1 = log.timer_debug1('computing solvent grad veff', *t1)
-    pcmobj.reset(pmol)
-    return vmat
+    def get_dS_dot_q(dS, dSii, q, atmlst, gridslice):  # returns [len(atmlst), 3, ngrids]-like array, row k <-> atmlst[k]
+        output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q)
+        for i0, i_atom in enumerate(atmlst):  # i0 = row of output, i_atom = atom id used for gridslice
+            g0,g1 = gridslice[i_atom]
+            output[i0, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q)
+            output[i0, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1])
+        return output
+    def get_dST_dot_q(dS, dSii, q, atmlst, gridslice):
+        return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice)
+
+    def get_dA_dot_q(dA, q, atmlst, gridslice):
+        return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q)
+
+    def get_dD_dot_q(dD, q, atmlst, gridslice):  # returns [len(atmlst), 3, ngrids], row k <-> atmlst[k]
+        output = cupy.zeros([len(atmlst), 3, ngrids])
+        for i0, i_atom in enumerate(atmlst):  # i0 = row of output, i_atom = atom id used for gridslice
+            g0,g1 = gridslice[i_atom]
+            output[i0, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q)
+            output[i0, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1])
+        return output
+    def get_dDT_dot_q(dD, q, atmlst, gridslice):
+        return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice)
+
+    if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
+        _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True)
+        dF, _ = get_dF_dA(pcmobj.surface)
+        dSii = get_dSii(pcmobj.surface, dF)
+        dF = None
+
+        # dR = 0, dK = dS
+        dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice)
+
+        dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q)
+
+    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']:
+        dF, dA = get_dF_dA(pcmobj.surface)
+        dSii = get_dSii(pcmobj.surface, dF)
+        dF = None
+
+        dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+
+        # dR = f_eps/(2*pi) * (dD*A + D*dA)
+        # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
+        f_eps_over_2pi = f_epsilon/(2.0*PI)
+
+        q = inverse_K @ R @ v_grids
+        dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+
+        DA = D*A
+        dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+
+        dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice)
+        dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+
+        AS = (A * S.T).T # It's just diag(A) @ S
+        dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice)
+        dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq
+
+        dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q)
+
+        dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice)
+
+        dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice)
+
+        dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+        dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V)
+
+        invKT_V = inverse_K.T @ v_grids
+        dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice)
+
+        DT_invKT_V = D.T @ invKT_V
+        dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice)
+        dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V)
+
+        dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice)
+        dKdxT_dot_invKT_V = dSdxT_dot_invKT_V
+
+        dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_invKT_V)
+        dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_invKT_V)
+
+        dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice)
+        dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V
+
+        dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V)
+
+        dqdx_fix_Vq *= -0.5
+
+    elif pcmobj.method.upper() in ['SS(V)PE']:
+        dF, dA = get_dF_dA(pcmobj.surface)
+        dSii = get_dSii(pcmobj.surface, dF)
+        dF = None
+
+        dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+
+        f_eps_over_4pi = f_epsilon/(4.0*PI)
 
-"""
-def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None):
+        def dK_dot_q(q):
+            dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+
+            DA = D*A
+            dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+
+            dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice)
+            dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+
+            AS = (A * S.T).T # It's just diag(A) @ S
+            dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice)
+            dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq
+
+            dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice)
+            dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q)
+
+            dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice)
+            dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q)
+
+            dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice)
+            dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q
+
+            return dKdx_dot_q
+
+        f_eps_over_2pi = f_epsilon/(2.0*PI)
+
+        q = inverse_K @ R @ v_grids
+        dKdx_dot_q = dK_dot_q(q)
+        dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q)
+
+        dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice)
+
+        dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice)
+
+        dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+        dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V)
+
+        invKT_V = inverse_K.T @ v_grids
+        dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice)
+
+        DT_invKT_V = D.T @ invKT_V
+        dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice)
+        dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V)
+
+        dKdx_dot_invKT_V = dK_dot_q(invKT_V)
+        dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V)
+
+        dqdx_fix_Vq *= -0.5
+
+    else:
+        raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
+
+    return dqdx_fix_Vq
+
+def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative):
+    '''Apply 0.5*(inverse_K @ R + its transpose) to dV_on_charge_dx; row k of the result belongs to atmlst[k].'''
+    assert pcmobj._intermediates is not None
+
+    mol = pcmobj.mol
+    gridslice    = pcmobj.surface['gslice_by_atom']
+    charge_exp   = pcmobj.surface['charge_exp']
+    grid_coords  = pcmobj.surface['grid_coords']
+    R            = pcmobj._intermediates['R']
+
+    atom_coords = mol.atom_coords(unit='B')
+    atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64)
+    atom_coords = atom_coords[atmlst]
+    atom_charges = atom_charges[atmlst]
+    fakemol_nuc = gto.fakemol_for_charges(atom_coords)
+    fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2)
+    int2c2e_ip1 = mol._add_suffix('int2c2e_ip1')
+    v_ng_ip1 = cupy.array(gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol))
+    dV_on_charge_dx = cupy.einsum('dAq,A->Adq', v_ng_ip1, atom_charges)
+
+    # NOTE(review): fakemol_nuc/atom_charges hold only the atmlst atoms, so the sum over A below skips other nuclei for a partial atmlst -- confirm
+    v_ng_ip2 = cupy.array(gto.mole.intor_cross(int2c2e_ip1, fakemol, fakemol_nuc))
+    for i0, i_atom in enumerate(atmlst):  # i0 = row of dV_on_charge_dx, i_atom = atom id used for gridslice
+        g0,g1 = gridslice[i_atom]
+        dV_on_charge_dx[i0,:,g0:g1] += cupy.einsum('dqA,A->dq', v_ng_ip2[:,g0:g1,:], atom_charges)
+
+    dIdA = int1e_grids_ip1(mol, grid_coords, dm = dm + dm.T, intopt = intopt_derivative, charge_exponents = charge_exp**2)
+    dV_on_charge_dx -= dIdA[atmlst,:,:]  # dV_on_charge_dx already has one row per atmlst entry
+
+    dIdC = int1e_grids_ip2(mol, grid_coords, intopt = intopt_derivative, dm = dm, charge_exponents = charge_exp**2)
+    for i0, i_atom in enumerate(atmlst):
+        g0,g1 = gridslice[i_atom]
+        dV_on_charge_dx[i0,:,g0:g1] -= dIdC[:,g0:g1]
+
+    KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T)
+    dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx)
+
+    return dqdx_fix_K_R
+
+def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative):
+    '''Sum of get_dqsym_dx_fix_vgrids and get_dqsym_dx_fix_K_R, both fed the same explicit inverse of the K intermediate.'''
+    inverse_K = cupy.linalg.inv(pcmobj._intermediates['K'])
+    return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative)
+
+def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None):
     '''
     dv_solv / da
-    slow version with finite difference
+    Analytic version. The result has shape [len(atmlst), 3, nmo, nocc]; entry k belongs to atom atmlst[k].
     '''
+    if not pcmobj._intermediates:
+        pcmobj.build()
+    dm_cache = pcmobj._intermediates.get('dm', None)
+    # Call _get_vind(dm) for its side effects unless _intermediates already holds (numerically) this dm.
+    dm_is_cached = dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10
+    if not dm_is_cached:
+        pcmobj._get_vind(dm)
+    mol = pcmobj.mol
     log = logger.new_logger(pcmobj, verbose)
     t1 = log.init_timer()
-    pmol = pcmobj.mol.copy()
-    mol = pmol.copy()
-    if atmlst is None:
-        atmlst = range(mol.natm)
+
     nao, nmo = mo_coeff.shape
     mocc = mo_coeff[:,mo_occ>0]
     nocc = mocc.shape[1]
-    dm = cupy.dot(mocc, mocc.T) * 2
-    coords = mol.atom_coords(unit='Bohr')
 
-    # TODO: add those contributions
-    # contribution due to _get_v
-    # contribution due to linear solver
-    # contribution due to _get_vmat
+    if atmlst is None:
+        atmlst = range(mol.natm)
 
-    vmat = cupy.zeros([len(atmlst), 3, nao, nocc])
+    gridslice    = pcmobj.surface['gslice_by_atom']
+    charge_exp   = pcmobj.surface['charge_exp']
+    grid_coords  = pcmobj.surface['grid_coords']
+    q_sym        = pcmobj._intermediates['q_sym']
+
+    aoslice = numpy.array(mol.aoslice_by_atom())
+
+    intopt_fock = int3c1e.VHFOpt(mol)
+    intopt_fock.build(cutoff = 1e-14, aosym = True)
+    intopt_derivative = int3c1e.VHFOpt(mol)
+    intopt_derivative.build(cutoff = 1e-14, aosym = False)
+
+    dIdx_mo = cupy.empty([len(atmlst), 3, nmo, nocc])
+
+    dIdA = int1e_grids_ip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2)
+    for i0, i_atom in enumerate(atmlst):  # i0 = row of dIdx_mo, i_atom = atom id used for aoslice
+        p0,p1 = aoslice[i_atom, 2:]
+        # dIdx[i0, :, :, :] = 0
+        # dIdx[i0, :, p0:p1, :] += dIdA[:, p0:p1, :]
+        # dIdx[i0, :, :, p0:p1] += dIdA[:, p0:p1, :].transpose(0,2,1)
+        dIdA_mo = dIdA[:, p0:p1, :] @ mocc
+        dIdA_mo = cupy.einsum('ip,dpj->dij', mo_coeff[p0:p1, :].T, dIdA_mo)
+        dIdB_mo = dIdA[:, p0:p1, :].transpose(0,2,1) @ mocc[p0:p1, :]
+        dIdB_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdB_mo)
+        dIdx_mo[i0, :, :, :] = dIdA_mo + dIdB_mo
+
+    for i0, i_atom in enumerate(atmlst):  # i0 = row of dIdx_mo, i_atom = atom id used for gridslice
+        g0,g1 = gridslice[i_atom]
+        dIdC = int1e_grids_ip2(mol, grid_coords[g0:g1,:], charges = q_sym[g0:g1],
+                               intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2)
+        dIdC_mo = dIdC @ mocc
+        dIdC_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdC_mo)
+        dIdx_mo[i0, :, :, :] += dIdC_mo
+
+    dV_on_molecule_dx_mo = dIdx_mo
+
+    dqdx = get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative)
+    for i0 in range(len(atmlst)):  # dqdx and the output are both indexed by position in atmlst
+        for i_xyz in range(3):
+            dIdx_from_dqdx = int1e_grids(mol, grid_coords, charges = dqdx[i0, i_xyz, :],
+                                         intopt = intopt_fock, charge_exponents = charge_exp**2)
+            dV_on_molecule_dx_mo[i0, i_xyz, :, :] += mo_coeff.T @ dIdx_from_dqdx @ mocc
 
     t1 = log.timer_debug1('computing solvent grad veff', *t1)
-    pcmobj.reset(pmol)
-    return vmat
-"""
+    return dV_on_molecule_dx_mo
 
 def make_hess_object(hess_method):
     if hess_method.base.with_solvent.frozen:
@@ -273,7 +483,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
         if isinstance(self.base, scf.hf.RHF):
             dm = self.base.make_rdm1(ao_repr=True)
-            dv = fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+            dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
             for i0, ia in enumerate(atmlst):
                 h1ao[i0] += dv[i0]
             return h1ao
@@ -282,14 +492,38 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             solvent = self.base.with_solvent
             dm = self.base.make_rdm1(ao_repr=True)
             dm = dm[0] + dm[1]
-            dva = fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
-            dvb = fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+            dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+            dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
             for i0, ia in enumerate(atmlst):
                 h1aoa[i0] += dva[i0]
                 h1aob[i0] += dvb[i0]
             return h1aoa, h1aob
         else:
             raise NotImplementedError('Base object is not supported')
+        
+    def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
+        '''Parent result plus _ao2mo of with_solvent._B_dot_x(dms); the solvent part is skipped unless equilibrium_solvation.'''
+        resp = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi)
+        if not self.base.with_solvent.equilibrium_solvation:
+            return resp
+        v_sol_ao = self.base.with_solvent._B_dot_x(dms)
+        if isinstance(self.base, scf.uhf.UHF):
+            n_dm = dms.shape[1]
+            occ_a = mo_coeff[0][:,mo_occ[0]>0]
+            occ_b = mo_coeff[1][:,mo_occ[1]>0]
+            moa, mob = mo_coeff
+            split = moa.shape[1] * occ_a.shape[1]  # columns before `split` take the alpha block, the rest beta
+            v_sum = v_sol_ao[0] + v_sol_ao[1]
+            resp[:,:split] += _ao2mo(v_sum, occ_a, moa).reshape(n_dm,-1)
+            resp[:,split:] += _ao2mo(v_sum, occ_b, mob).reshape(n_dm,-1)
+        elif isinstance(self.base, scf.hf.RHF):
+            n_dm = dms.shape[0]
+            occ = mo_coeff[:,mo_occ>0]
+            resp += _ao2mo(v_sol_ao, occ, mo_coeff).reshape(n_dm,-1)
+        else:
+            raise NotImplementedError('Base object is not supported')
+        return resp
+    
     def _finalize(self):
         # disable _finalize. It is called in grad_method.kernel method
         # where self.de was not yet initialized.
diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py
index 58cc637f..49897d74 100644
--- a/gpu4pyscf/solvent/hessian/smd.py
+++ b/gpu4pyscf/solvent/hessian/smd.py
@@ -25,6 +25,7 @@
 from gpu4pyscf.solvent.grad import smd as smd_grad
 from gpu4pyscf.solvent.grad import pcm as pcm_grad
 from gpu4pyscf.solvent.hessian import pcm as pcm_hess
+from gpu4pyscf.hessian.jk import _ao2mo
 
 def get_cds(smdobj):
     mol = smdobj.mol
@@ -153,7 +154,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
         if isinstance(self.base, scf.hf.RHF):
             dm = self.base.make_rdm1(ao_repr=True)
-            dv = pcm_hess.fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+            dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
             for i0, ia in enumerate(atmlst):
                 h1ao[i0] += dv[i0]
             return h1ao
@@ -162,14 +163,39 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             solvent = self.base.with_solvent
             dm = self.base.make_rdm1(ao_repr=True)
             dm = dm[0] + dm[1]
-            dva = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
-            dvb = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+            dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+            dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
             for i0, ia in enumerate(atmlst):
                 h1aoa[i0] += dva[i0]
                 h1aob[i0] += dvb[i0]
             return h1aoa, h1aob
         else:
             raise NotImplementedError('Base object is not supported')
+
+    def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
+        '''Parent result plus _ao2mo of with_solvent._B_dot_x(dms); the solvent part is skipped unless equilibrium_solvation.'''
+        resp = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi)
+        if not self.base.with_solvent.equilibrium_solvation:
+            return resp
+        v_sol_ao = self.base.with_solvent._B_dot_x(dms)
+
+        if isinstance(self.base, scf.uhf.UHF):
+            n_dm = dms.shape[1]
+            occ_a = mo_coeff[0][:,mo_occ[0]>0]
+            occ_b = mo_coeff[1][:,mo_occ[1]>0]
+            moa, mob = mo_coeff
+            split = moa.shape[1] * occ_a.shape[1]  # columns before `split` take the alpha block, the rest beta
+            v_sum = v_sol_ao[0] + v_sol_ao[1]
+            resp[:,:split] += _ao2mo(v_sum, occ_a, moa).reshape(n_dm,-1)
+            resp[:,split:] += _ao2mo(v_sum, occ_b, mob).reshape(n_dm,-1)
+        elif isinstance(self.base, scf.hf.RHF):
+            n_dm = dms.shape[0]
+            occ = mo_coeff[:,mo_occ>0]
+            resp += _ao2mo(v_sol_ao, occ, mo_coeff).reshape(n_dm,-1)
+        else:
+            raise NotImplementedError('Base object is not supported')
+        return resp
+
     def _finalize(self):
         # disable _finalize. It is called in grad_method.kernel method
         # where self.de was not yet initialized.
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index 33bf0e67..c7076f29 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -14,12 +14,15 @@
 
 import unittest
 import numpy as np
+import cupy as cp
 import pyscf
 import pytest
 from pyscf import gto
 from gpu4pyscf.solvent import pcm
 from gpu4pyscf import scf, dft
 from packaging import version
+from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat
+from gpu4pyscf.lib.cupy_helper import contract
 
 pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
 
@@ -50,7 +53,7 @@ def _make_mf(method='C-PCM', restricted=True, density_fit=True):
         mf = dft.rks.RKS(mol, xc=xc)
     else:
         mf = dft.uks.UKS(mol, xc=xc)
-    
+
     if density_fit:
         mf = mf.density_fit()
     mf = mf.PCM()
@@ -89,6 +92,44 @@ def _check_hessian(mf, h, ix=0, iy=0):
     print(f'Norm of H({ix},{iy}) diff, {np.linalg.norm(h[ix,:,iy,:] - h_fd)}')
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
 
+def _fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None):
+    '''
+    dv_solv / da
+    slow version with finite difference
+    '''
+    pmol = pcmobj.mol.copy()
+    mol = pmol.copy()
+    if atmlst is None:
+        atmlst = range(mol.natm)
+    nmo = mo_coeff.shape[1]
+    mocc = mo_coeff[:,mo_occ>0]
+    nocc = mocc.shape[1]
+    coords = mol.atom_coords(unit='Bohr')
+    def pcm_vmat_scanner(mol):
+        pcmobj.reset(mol)
+        e, v = pcmobj._get_vind(dm)
+        return v
+
+    mol.verbose = 0
+    vmat = cp.empty([len(atmlst), 3, nmo, nocc])  # each block below is contracted to [nmo, nocc]
+    eps = 1e-5
+    for i0, ia in enumerate(atmlst):
+        for ix in range(3):
+            dv = np.zeros_like(coords)
+            dv[ia,ix] = eps
+            mol.set_geom_(coords + dv, unit='Bohr')
+            vmat0 = pcm_vmat_scanner(mol)
+
+            mol.set_geom_(coords - dv, unit='Bohr')
+            vmat1 = pcm_vmat_scanner(mol)
+
+            grad_vmat = (vmat0 - vmat1)/2.0/eps  # central difference
+            grad_vmat = contract("ij,jq->iq", grad_vmat, mocc)
+            grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff)
+            vmat[i0,ix] = grad_vmat
+    pcmobj.reset(pmol)  # put the solvent object back on the unperturbed geometry
+    return vmat
+
 @unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled")
 class KnownValues(unittest.TestCase):
     def test_df_hess_cpcm(self):
@@ -142,6 +183,48 @@ def test_uks_hess_iefpcm(self):
         _check_hessian(mf, h, ix=0, iy=0)
         _check_hessian(mf, h, ix=0, iy=1)
 
+    def test_grad_vmat_cpcm(self):
+        # analytic_grad_vmat is compared against the finite-difference _fd_grad_vmat for C-PCM.
+        print("testing C-PCM dV_solv/dx")
+        mf = _make_mf(method='C-PCM')
+        solvent = mf.Hessian().base.with_solvent
+        dm = mf.make_rdm1()
+        orbitals, occupations = mf.mo_coeff, mf.mo_occ
+
+        # Analytic result first, finite-difference reference second.
+        analytic = analytic_grad_vmat(solvent, dm, orbitals, occupations)
+        reference = _fd_grad_vmat(solvent, dm, orbitals, occupations)
+
+        cp.testing.assert_allclose(reference, analytic, atol = 1e-10)
+
+    def test_grad_vmat_iefpcm(self):
+        # analytic_grad_vmat is compared against the finite-difference _fd_grad_vmat for IEF-PCM.
+        print("testing IEF-PCM dV_solv/dx")
+        mf = _make_mf(method='IEF-PCM')
+        solvent = mf.Hessian().base.with_solvent
+        dm = mf.make_rdm1()
+        orbitals, occupations = mf.mo_coeff, mf.mo_occ
+
+        # Analytic result first, finite-difference reference second.
+        analytic = analytic_grad_vmat(solvent, dm, orbitals, occupations)
+        reference = _fd_grad_vmat(solvent, dm, orbitals, occupations)
+
+        cp.testing.assert_allclose(reference, analytic, atol = 1e-10)
+
+    def test_grad_vmat_ssvpe(self):
+        # analytic_grad_vmat is compared against the finite-difference _fd_grad_vmat for SS(V)PE.
+        print("testing SS(V)PE dV_solv/dx")
+        mf = _make_mf(method='SS(V)PE')
+        solvent = mf.Hessian().base.with_solvent
+        dm = mf.make_rdm1()
+        orbitals, occupations = mf.mo_coeff, mf.mo_occ
+
+        # Analytic result first, finite-difference reference second.
+        analytic = analytic_grad_vmat(solvent, dm, orbitals, occupations)
+        reference = _fd_grad_vmat(solvent, dm, orbitals, occupations)
+
+        cp.testing.assert_allclose(reference, analytic, atol = 1e-10)
+
     @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
     def test_to_gpu(self):
         import pyscf
@@ -187,7 +270,7 @@ def test_to_cpu(self):
         mol.basis = 'sto-3g'
         mol.output = '/dev/null'
         mol.build(verbose=0)
-        
+
         mf = dft.RKS(mol, xc='b3lyp').PCM()
         mf.conv_tol = 1e-12
         mf.conv_tol_cpscf = 1e-7
@@ -209,6 +292,7 @@ def test_to_cpu(self):
         hessobj = hessobj.to_cpu()
         hess_cpu = hessobj.kernel()
         assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for Hessian of PCMs")
     unittest.main()
diff --git a/gpu4pyscf/tests/020_Vitamin_C.xyz b/gpu4pyscf/tests/020_Vitamin_C.xyz
new file mode 100644
index 00000000..e119c6d3
--- /dev/null
+++ b/gpu4pyscf/tests/020_Vitamin_C.xyz
@@ -0,0 +1,22 @@
+20
+Vitamin C
+C                 -0.07551087    1.68127663   -0.10745193
+O                  1.33621755    1.87147409   -0.39326987
+C                  1.67074668    2.95729545    0.49387976
+C                  0.41740763    3.77281969    0.78495878
+C                 -0.60481480    3.07572636    0.28906224
+H                 -0.19316298    1.01922455    0.72486113
+O                  0.35092043    5.03413298    1.45545728
+H                  0.42961487    5.74279041    0.81264173
+O                 -1.95331750    3.53349874    0.15912025
+H                 -2.55333895    2.78846397    0.23972698
+O                  2.81976302    3.20110148    0.94542226
+C                 -0.81772499    1.09230218   -1.32146482
+H                 -0.70955636    1.74951833   -2.15888136
+C                 -2.31163857    0.93420736   -0.98260166
+H                 -2.72575463    1.89080093   -0.74107186
+H                 -2.41980721    0.27699120   -0.14518512
+O                 -0.26428017   -0.18613595   -1.64425697
+H                 -0.72695910   -0.55328886   -2.40104423
+O                 -3.00083741    0.38730252   -2.10989934
+H                 -3.93210821    0.28874990   -1.89865997
diff --git a/gpu4pyscf/tests/057_Tamoxifen.xyz b/gpu4pyscf/tests/057_Tamoxifen.xyz
new file mode 100644
index 00000000..b51df6f5
--- /dev/null
+++ b/gpu4pyscf/tests/057_Tamoxifen.xyz
@@ -0,0 +1,59 @@
+57
+Tamoxifen
+C                 -1.42666665    1.35988349    0.01780185
+C                 -0.75139234    2.53486079    0.01780185
+C                 -2.96666665    1.35988349    0.01780185
+C                 -3.66418809    0.15160568    0.01780185
+C                 -3.66417225    2.56778831    0.01791304
+C                 -5.05890001    0.15132789    0.01723115
+H                 -3.11399504   -0.80051230    0.01693694
+C                 -5.05931013    2.56768367    0.01833813
+H                 -3.11457497    3.52019148    0.01809296
+C                 -5.75673144    1.35973487    0.01785909
+H                 -5.60876287   -0.80100973    0.01659711
+H                 -5.60899513    3.52021733    0.01884114
+H                 -6.85641138    1.35926586    0.01746817
+C                 -1.51874951    3.87006226    0.01780185
+C                 -1.63823871    4.60590036   -1.16149287
+C                 -2.09440347    4.34371845    1.19670832
+C                 -2.33266580    5.81544273   -1.16163975
+H                 -1.18363273    4.23258432   -2.09058400
+C                 -2.78991814    5.55312706    1.19651365
+H                 -2.00047584    3.76380313    2.12622693
+C                 -2.90901419    6.28907563    0.01764434
+H                 -2.42635385    6.39580205   -2.09099551
+H                 -3.24404320    5.92613353    2.12608927
+C                  0.78860766    2.53486079    0.01780185
+C                  1.48612910    3.74313859    0.01780185
+C                  1.48611327    1.32695597    0.01791304
+C                  2.88084102    3.74341639    0.01723115
+H                  0.93593606    4.69525658    0.01693694
+C                  2.88125115    1.32706060    0.01833813
+H                  0.93651599    0.37455279    0.01809296
+C                  3.57867246    2.53500940    0.01785909
+H                  3.43070389    4.69575400    0.01659711
+H                  3.43093615    0.37452694    0.01884114
+H                  4.67835240    2.53547842    0.01746817
+C                 -0.65930948    0.02468201    0.01780185
+H                 -0.04466478   -0.03344716   -0.85611628
+H                 -0.04386363   -0.03298673    0.89118649
+C                 -1.66236338   -1.14385651    0.01856968
+H                 -2.27713573   -1.08561745    0.89239069
+H                 -2.27768159   -1.08629703   -0.85491210
+H                 -1.12919956   -2.07156136    0.01876393
+O                 -3.62101473    7.52921876    0.01715974
+C                 -2.69982994    8.60858726    0.19402752
+H                 -2.03011871    8.64615667   -0.63962434
+H                 -2.14108178    8.45680900    1.09384076
+C                 -3.47584819    9.93535894    0.28927757
+H                 -4.05456450   10.07469158   -0.59986462
+H                 -4.12694690    9.90759901    1.13792346
+C                 -1.65137806   10.90285045    1.72438609
+H                 -2.24764703   10.40869908    2.46274761
+H                 -0.79110440   10.30633800    1.50302183
+H                 -1.33836538   11.85545774    2.09783276
+C                 -3.25771829   12.42866058    0.53449492
+H                 -2.56611180   13.24181825    0.60767325
+H                 -3.86037095   12.55070987   -0.34118410
+H                 -3.88574784   12.41553739    1.40069735
+N                 -2.48185199   11.10154878    0.44281205
diff --git a/gpu4pyscf/tests/095_Azadirachtin.xyz b/gpu4pyscf/tests/095_Azadirachtin.xyz
new file mode 100644
index 00000000..8c03f7bb
--- /dev/null
+++ b/gpu4pyscf/tests/095_Azadirachtin.xyz
@@ -0,0 +1,97 @@
+95
+Azadirachtin
+C                  0.24028400   -0.96854600    0.05735800
+C                  1.49955800   -0.38999400    0.79976500
+C                  1.84405900    1.11309900    0.52612700
+C                  0.61115200    2.06994900    0.41027500
+C                 -0.38718900    1.44909800   -0.58288900
+C                 -0.81198100    0.11367700    0.01403200
+H                  1.34464500   -0.48336800    1.89667000
+H                  0.90815500    3.09474100    0.10955200
+H                  0.07146500    1.40030200   -1.59457300
+H                 -1.08538000    0.33936800    1.09841400
+O                 -0.03234300    2.14051500    1.69756400
+H                  0.43832200    2.76739400    2.27637900
+O                 -1.64345600    2.15598600   -0.77527600
+C                 -2.74935800    1.17918600   -0.75355500
+H                 -3.33770900    1.41858200    0.14457000
+H                 -3.31820200    1.39744800   -1.66649800
+C                 -2.11058900   -0.22990000   -0.71994400
+C                  2.72998200    1.32748400   -0.70483200
+H                  2.81316800    2.38444500   -0.97758400
+H                  3.74960400    0.95856700   -0.53283000
+H                  2.35200700    0.78104000   -1.58051000
+C                  2.60140000   -1.34386400    0.30659000
+C                  0.84678200   -1.40613600   -1.29617000
+H                  0.88274800   -0.59319600   -2.03951200
+H                  0.38815200   -2.30137400   -1.74034600
+O                  2.22547600   -1.78168600   -1.02946800
+C                 -0.42290800   -2.19363100    0.75277400
+H                 -0.32012900   -3.08353500    0.10236100
+C                 -1.91400700   -2.00763500    1.11237500
+H                 -2.33420900   -2.99527800    1.38379200
+H                 -1.98093100   -1.38866600    2.03106200
+C                 -2.81353800   -1.37055100    0.02719800
+H                 -3.12020000   -2.14713900   -0.69849000
+C                 -1.82661295   -0.68751599   -2.16270012
+O                 -1.03585236   -0.24261727   -2.99355789
+O                 -2.59156054   -1.74766325   -2.52650357
+C                 -2.29916153   -2.14198817   -3.86960099
+H                 -2.96290254   -2.92828960   -4.16299137
+H                 -2.42740743   -1.30452633   -4.52313804
+H                 -1.28838658   -2.48820275   -3.92764814
+O                 -4.01986539   -0.90962471    0.64138134
+C                 -4.89301012   -1.93494775    0.80793745
+O                 -4.54153100   -3.05110585    0.42818050
+C                 -6.20834727   -1.48087047    1.46771166
+H                 -6.70958996   -0.78829922    0.82428269
+H                 -6.83594045   -2.33131805    1.63434212
+H                 -5.99341406   -1.00749899    2.40292455
+O                  0.29104226   -2.52037085    1.94793763
+C                  0.31248536   -3.86361432    2.13937213
+O                 -0.25336168   -4.56806573    1.30443072
+C                  1.07328546   -4.25938123    3.41849362
+C                  1.18469713   -5.56341278    3.77014145
+H                  0.75137836   -6.32562659    3.15681858
+C                  1.70966559   -3.16955354    4.30104443
+H                  2.52793619   -2.72004059    3.77829081
+H                  0.97813456   -2.42251044    4.52839648
+H                  2.06508607   -3.60889199    5.20964665
+C                  1.93754031   -5.94957419    5.05688405
+H                  1.46239499   -5.49165555    5.89917107
+H                  1.92238977   -7.01309886    5.17344190
+H                  2.95091533   -5.61227499    4.99207421
+C                  3.99823568   -0.71610148    0.14421916
+O                  4.54063921    0.18499764    0.78248292
+O                  4.69984280   -1.27738694   -0.87269582
+O                  2.69271189   -2.53050618    1.09933364
+H                  3.60067733   -2.84219679    1.10624230
+C                  5.98847134   -0.66885730   -0.99113633
+H                  6.49970371   -0.73075570   -0.05320774
+H                  6.55618159   -1.17887968   -1.74112449
+H                  5.87374685    0.35839671   -1.26770006
+C                  2.63486992    1.58151749    1.76176538
+C                  2.13434327    2.21842175    3.11643757
+C                  3.90461234    2.45387090    1.74128354
+O                  2.44467967    0.78466796    2.96396625
+C                  3.35337126    2.98709450    3.79243900
+C                  0.74513758    2.60743687    3.44136489
+O                  5.00327683    3.19196370    1.11718214
+C                  4.47769203    2.16352749    3.16877423
+H                  3.15573566    3.35599353    1.51547111
+C                  3.84794511    4.41584726    3.25643717
+H                  3.24116904    2.99889070    4.88162906
+H                  0.00697023    1.93995296    2.97068106
+H                  0.55491721    2.57388288    4.52449549
+H                  0.54134467    3.63458255    3.09753074
+C                  4.84981258    4.42246076    1.92099071
+H                  4.49637929    1.09030804    3.43212004
+H                  5.51163803    2.50195502    3.32489990
+C                  4.76579887    5.04464694    4.26535741
+O                  2.75093022    5.20578033    2.83652107
+H                  4.60685318    5.22136931    1.20459035
+O                  6.17282363    4.70855901    2.47193815
+H                  4.42807865    5.31783232    5.24674785
+C                  6.01838353    5.12565144    3.78006571
+H                  2.50011685    5.87405238    3.50751412
+H                  6.95619123    5.44887224    4.20308201
diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json
new file mode 100644
index 00000000..1c5a9fc2
--- /dev/null
+++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json
@@ -0,0 +1,873 @@
+{
+    "machine_info": {
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker",
+        "processor": "",
+        "machine": "x86_64",
+        "python_compiler": "GCC 10.2.1 20210110",
+        "python_implementation": "CPython",
+        "python_implementation_version": "3.9.2",
+        "python_version": "3.9.2",
+        "python_build": [
+            "default",
+            "Feb 28 2021 17:03:44"
+        ],
+        "release": "5.4.143.bsk.7-amd64",
+        "system": "Linux",
+        "cpu": {
+            "python_version": "3.9.2.final.0 (64 bit)",
+            "cpuinfo_version": [
+                9,
+                0,
+                0
+            ],
+            "cpuinfo_version_string": "9.0.0",
+            "arch": "X86_64",
+            "bits": 64,
+            "count": 96,
+            "arch_string_raw": "x86_64",
+            "vendor_id_raw": "GenuineIntel",
+            "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz",
+            "hz_advertised_friendly": "2.4000 GHz",
+            "hz_actual_friendly": "3.1000 GHz",
+            "hz_advertised": [
+                2400000000,
+                0
+            ],
+            "hz_actual": [
+                3100005000,
+                0
+            ],
+            "stepping": 7,
+            "model": 85,
+            "family": 6,
+            "flags": [
+                "3dnowprefetch",
+                "abm",
+                "acpi",
+                "adx",
+                "aes",
+                "aperfmperf",
+                "apic",
+                "arat",
+                "arch_capabilities",
+                "arch_perfmon",
+                "art",
+                "avx",
+                "avx2",
+                "avx512_vnni",
+                "avx512bw",
+                "avx512cd",
+                "avx512dq",
+                "avx512f",
+                "avx512vl",
+                "avx512vnni",
+                "bmi1",
+                "bmi2",
+                "bts",
+                "cat_l3",
+                "cdp_l3",
+                "clflush",
+                "clflushopt",
+                "clwb",
+                "cmov",
+                "constant_tsc",
+                "cpuid",
+                "cpuid_fault",
+                "cqm",
+                "cqm_llc",
+                "cqm_mbm_local",
+                "cqm_mbm_total",
+                "cqm_occup_llc",
+                "cx16",
+                "cx8",
+                "dca",
+                "de",
+                "ds_cpl",
+                "dtes64",
+                "dtherm",
+                "dts",
+                "epb",
+                "ept",
+                "ept_ad",
+                "erms",
+                "est",
+                "f16c",
+                "flexpriority",
+                "flush_l1d",
+                "fma",
+                "fpu",
+                "fsgsbase",
+                "fxsr",
+                "ht",
+                "hwp",
+                "hwp_act_window",
+                "hwp_epp",
+                "hwp_pkg_req",
+                "ibpb",
+                "ibrs",
+                "ibrs_enhanced",
+                "ida",
+                "intel_ppin",
+                "intel_pt",
+                "invpcid",
+                "invpcid_single",
+                "lahf_lm",
+                "lm",
+                "mba",
+                "mca",
+                "mce",
+                "md_clear",
+                "mmx",
+                "movbe",
+                "mpx",
+                "msr",
+                "mtrr",
+                "nonstop_tsc",
+                "nopl",
+                "nx",
+                "ospke",
+                "osxsave",
+                "pae",
+                "pat",
+                "pbe",
+                "pcid",
+                "pclmulqdq",
+                "pdcm",
+                "pdpe1gb",
+                "pebs",
+                "pge",
+                "pku",
+                "pln",
+                "pni",
+                "popcnt",
+                "pqe",
+                "pqm",
+                "pse",
+                "pse36",
+                "pts",
+                "rdrand",
+                "rdrnd",
+                "rdseed",
+                "rdt_a",
+                "rdtscp",
+                "rep_good",
+                "sdbg",
+                "sep",
+                "smap",
+                "smep",
+                "smx",
+                "ss",
+                "ssbd",
+                "sse",
+                "sse2",
+                "sse4_1",
+                "sse4_2",
+                "ssse3",
+                "stibp",
+                "syscall",
+                "tm",
+                "tm2",
+                "tpr_shadow",
+                "tsc",
+                "tsc_adjust",
+                "tsc_deadline_timer",
+                "tscdeadline",
+                "vme",
+                "vmx",
+                "vnmi",
+                "vpid",
+                "x2apic",
+                "xgetbv1",
+                "xsave",
+                "xsavec",
+                "xsaveopt",
+                "xsaves",
+                "xtopology",
+                "xtpr"
+            ],
+            "l3_cache_size": 37486592,
+            "l2_cache_size": 50331648,
+            "l1_data_cache_size": "1.5 MiB",
+            "l1_instruction_cache_size": "1.5 MiB",
+            "l2_cache_line_size": 256,
+            "l2_cache_associativity": 6
+        }
+    },
+    "commit_info": {
+        "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13",
+        "time": "2025-01-05T23:21:10+00:00",
+        "author_time": "2025-01-05T23:21:10+00:00",
+        "dirty": false,
+        "project": "gpu4pyscf",
+        "branch": "benchmark_ci"
+    },
+    "benchmarks": [
+        {
+            "group": null,
+            "name": "test_df_rb3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 2.725358221679926,
+                "max": 2.835785958915949,
+                "mean": 2.782431565846006,
+                "stddev": 0.055307723110869685,
+                "rounds": 3,
+                "median": 2.7861505169421434,
+                "iqr": 0.08282080292701721,
+                "q1": 2.7405562954954803,
+                "q3": 2.8233770984224975,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.725358221679926,
+                "hd15iqr": 2.835785958915949,
+                "ops": 0.35939787783997024,
+                "total": 8.347294697538018,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 4.394210334867239,
+                "max": 4.473813105374575,
+                "mean": 4.42994485112528,
+                "stddev": 0.04041990275091787,
+                "rounds": 3,
+                "median": 4.4218111131340265,
+                "iqr": 0.05970207788050175,
+                "q1": 4.401110529433936,
+                "q3": 4.460812607314438,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 4.394210334867239,
+                "hd15iqr": 4.473813105374575,
+                "ops": 0.22573644449455918,
+                "total": 13.28983455337584,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 43.774112831801176,
+                "max": 43.774112831801176,
+                "mean": 43.774112831801176,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 43.774112831801176,
+                "iqr": 0.0,
+                "q1": 43.774112831801176,
+                "q3": 43.774112831801176,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 43.774112831801176,
+                "hd15iqr": 43.774112831801176,
+                "ops": 0.022844552072189946,
+                "total": 43.774112831801176,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 40.097773076966405,
+                "max": 40.15744375810027,
+                "mean": 40.11991243995726,
+                "stddev": 0.03267769513443882,
+                "rounds": 3,
+                "median": 40.10452048480511,
+                "iqr": 0.04475301085039973,
+                "q1": 40.09945992892608,
+                "q3": 40.14421293977648,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 40.097773076966405,
+                "hd15iqr": 40.15744375810027,
+                "ops": 0.024925278725285903,
+                "total": 120.35973731987178,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 48.99313645064831,
+                "max": 49.26371451281011,
+                "mean": 49.142610578487314,
+                "stddev": 0.13750190122656403,
+                "rounds": 3,
+                "median": 49.17098077200353,
+                "iqr": 0.20293354662135243,
+                "q1": 49.037597530987114,
+                "q3": 49.240531077608466,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 48.99313645064831,
+                "hd15iqr": 49.26371451281011,
+                "ops": 0.02034893930599935,
+                "total": 147.42783173546195,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 615.0911720395088,
+                "max": 615.0911720395088,
+                "mean": 615.0911720395088,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 615.0911720395088,
+                "iqr": 0.0,
+                "q1": 615.0911720395088,
+                "q3": 615.0911720395088,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 615.0911720395088,
+                "hd15iqr": 615.0911720395088,
+                "ops": 0.0016257752435044988,
+                "total": 615.0911720395088,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 18.244548039510846,
+                "max": 18.375720830634236,
+                "mean": 18.312131161491077,
+                "stddev": 0.06567751542153955,
+                "rounds": 3,
+                "median": 18.316124614328146,
+                "iqr": 0.09837959334254265,
+                "q1": 18.26244218321517,
+                "q3": 18.360821776557714,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 18.244548039510846,
+                "hd15iqr": 18.375720830634236,
+                "ops": 0.05460860842362896,
+                "total": 54.93639348447323,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_grad_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 30.697130125015974,
+                "max": 30.711910048499703,
+                "mean": 30.70534764789045,
+                "stddev": 0.00752768934207856,
+                "rounds": 3,
+                "median": 30.70700277015567,
+                "iqr": 0.011084942612797022,
+                "q1": 30.699598286300898,
+                "q3": 30.710683228913695,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 30.697130125015974,
+                "hd15iqr": 30.711910048499703,
+                "ops": 0.03256761693329022,
+                "total": 92.11604294367135,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_hessian_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 667.9882875829935,
+                "max": 667.9882875829935,
+                "mean": 667.9882875829935,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 667.9882875829935,
+                "iqr": 0.0,
+                "q1": 667.9882875829935,
+                "q3": 667.9882875829935,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 667.9882875829935,
+                "hd15iqr": 667.9882875829935,
+                "ops": 0.0014970322363260838,
+                "total": 667.9882875829935,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 460.72668202780187,
+                "max": 461.77398146130145,
+                "mean": 461.4145879279822,
+                "stddev": 0.5959440470695604,
+                "rounds": 3,
+                "median": 461.7431002948433,
+                "iqr": 0.785474575124681,
+                "q1": 460.98078659456223,
+                "q3": 461.7662611696869,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 460.72668202780187,
+                "hd15iqr": 461.77398146130145,
+                "ops": 0.0021672483405662944,
+                "total": 1384.2437637839466,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_grad_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 552.0836905632168,
+                "max": 553.4436832498759,
+                "mean": 552.8364644367248,
+                "stddev": 0.6915813282891417,
+                "rounds": 3,
+                "median": 552.9820194970816,
+                "iqr": 1.0199945149943233,
+                "q1": 552.308272796683,
+                "q3": 553.3282673116773,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 552.0836905632168,
+                "hd15iqr": 553.4436832498759,
+                "ops": 0.0018088531859396832,
+                "total": 1658.5093933101743,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 1.6017291732132435,
+                "max": 1.647629827260971,
+                "mean": 1.6208390643199284,
+                "stddev": 0.02389486042236203,
+                "rounds": 3,
+                "median": 1.613158192485571,
+                "iqr": 0.03442549053579569,
+                "q1": 1.6045864280313253,
+                "q3": 1.639011918567121,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 1.6017291732132435,
+                "hd15iqr": 1.647629827260971,
+                "ops": 0.6169643994973554,
+                "total": 4.8625171929597855,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 2.1184212770313025,
+                "max": 2.20925628952682,
+                "mean": 2.15202548665305,
+                "stddev": 0.04981377124137081,
+                "rounds": 3,
+                "median": 2.1283988934010267,
+                "iqr": 0.0681262593716383,
+                "q1": 2.1209156811237335,
+                "q3": 2.189041940495372,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.1184212770313025,
+                "hd15iqr": 2.20925628952682,
+                "ops": 0.46467851157063006,
+                "total": 6.456076459959149,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 16.1142161693424,
+                "max": 16.1142161693424,
+                "mean": 16.1142161693424,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 16.1142161693424,
+                "iqr": 0.0,
+                "q1": 16.1142161693424,
+                "q3": 16.1142161693424,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 16.1142161693424,
+                "hd15iqr": 16.1142161693424,
+                "ops": 0.06205700541007504,
+                "total": 16.1142161693424,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_631gs_large",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_large",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 55.4929311927408,
+                "max": 56.77203128859401,
+                "mean": 56.066467080265284,
+                "stddev": 0.6496905970719544,
+                "rounds": 3,
+                "median": 55.934438759461045,
+                "iqr": 0.9593250718899071,
+                "q1": 55.60330808442086,
+                "q3": 56.56263315631077,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 55.4929311927408,
+                "hd15iqr": 56.77203128859401,
+                "ops": 0.01783597312397784,
+                "total": 168.19940124079585,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_631gs_grad_large",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_grad_large",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 70.14288471080363,
+                "max": 70.61111964285374,
+                "mean": 70.3403081515183,
+                "stddev": 0.24259089508559126,
+                "rounds": 3,
+                "median": 70.26692010089755,
+                "iqr": 0.3511761990375817,
+                "q1": 70.17389355832711,
+                "q3": 70.52506975736469,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 70.14288471080363,
+                "hd15iqr": 70.61111964285374,
+                "ops": 0.014216599646477592,
+                "total": 211.02092445455492,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 2.51676319912076,
+                "max": 2.569052016362548,
+                "mean": 2.540054644147555,
+                "stddev": 0.02660729798277223,
+                "rounds": 3,
+                "median": 2.5343487169593573,
+                "iqr": 0.03921661293134093,
+                "q1": 2.5211595785804093,
+                "q3": 2.56037619151175,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.51676319912076,
+                "hd15iqr": 2.569052016362548,
+                "ops": 0.393692317723976,
+                "total": 7.620163932442665,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 3,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 3.7774324007332325,
+                "max": 3.8614633549004793,
+                "mean": 3.8227184594919286,
+                "stddev": 0.04239564161614309,
+                "rounds": 3,
+                "median": 3.8292596228420734,
+                "iqr": 0.06302321562543511,
+                "q1": 3.7903892062604427,
+                "q3": 3.853412421885878,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 3.7774324007332325,
+                "hd15iqr": 3.8614633549004793,
+                "ops": 0.26159394436097405,
+                "total": 11.468155378475785,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 122.75680537335575,
+                "max": 122.75680537335575,
+                "mean": 122.75680537335575,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 122.75680537335575,
+                "iqr": 0.0,
+                "q1": 122.75680537335575,
+                "q3": 122.75680537335575,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 122.75680537335575,
+                "hd15iqr": 122.75680537335575,
+                "ops": 0.00814618787902287,
+                "total": 122.75680537335575,
+                "iterations": 1
+            }
+        }
+    ],
+    "datetime": "2025-01-06T03:31:22.391433+00:00",
+    "version": "5.1.0"
+}
\ No newline at end of file
diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json
new file mode 100644
index 00000000..7bfabd8a
--- /dev/null
+++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json
@@ -0,0 +1,418 @@
+{
+    "machine_info": {
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker",
+        "processor": "",
+        "machine": "x86_64",
+        "python_compiler": "GCC 10.2.1 20210110",
+        "python_implementation": "CPython",
+        "python_implementation_version": "3.9.2",
+        "python_version": "3.9.2",
+        "python_build": [
+            "default",
+            "Feb 28 2021 17:03:44"
+        ],
+        "release": "5.4.143.bsk.7-amd64",
+        "system": "Linux",
+        "cpu": {
+            "python_version": "3.9.2.final.0 (64 bit)",
+            "cpuinfo_version": [
+                9,
+                0,
+                0
+            ],
+            "cpuinfo_version_string": "9.0.0",
+            "arch": "X86_64",
+            "bits": 64,
+            "count": 96,
+            "arch_string_raw": "x86_64",
+            "vendor_id_raw": "GenuineIntel",
+            "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz",
+            "hz_advertised_friendly": "2.4000 GHz",
+            "hz_actual_friendly": "3.1000 GHz",
+            "hz_advertised": [
+                2400000000,
+                0
+            ],
+            "hz_actual": [
+                3100012000,
+                0
+            ],
+            "stepping": 7,
+            "model": 85,
+            "family": 6,
+            "flags": [
+                "3dnowprefetch",
+                "abm",
+                "acpi",
+                "adx",
+                "aes",
+                "aperfmperf",
+                "apic",
+                "arat",
+                "arch_capabilities",
+                "arch_perfmon",
+                "art",
+                "avx",
+                "avx2",
+                "avx512_vnni",
+                "avx512bw",
+                "avx512cd",
+                "avx512dq",
+                "avx512f",
+                "avx512vl",
+                "avx512vnni",
+                "bmi1",
+                "bmi2",
+                "bts",
+                "cat_l3",
+                "cdp_l3",
+                "clflush",
+                "clflushopt",
+                "clwb",
+                "cmov",
+                "constant_tsc",
+                "cpuid",
+                "cpuid_fault",
+                "cqm",
+                "cqm_llc",
+                "cqm_mbm_local",
+                "cqm_mbm_total",
+                "cqm_occup_llc",
+                "cx16",
+                "cx8",
+                "dca",
+                "de",
+                "ds_cpl",
+                "dtes64",
+                "dtherm",
+                "dts",
+                "epb",
+                "ept",
+                "ept_ad",
+                "erms",
+                "est",
+                "f16c",
+                "flexpriority",
+                "flush_l1d",
+                "fma",
+                "fpu",
+                "fsgsbase",
+                "fxsr",
+                "ht",
+                "hwp",
+                "hwp_act_window",
+                "hwp_epp",
+                "hwp_pkg_req",
+                "ibpb",
+                "ibrs",
+                "ibrs_enhanced",
+                "ida",
+                "intel_ppin",
+                "intel_pt",
+                "invpcid",
+                "invpcid_single",
+                "lahf_lm",
+                "lm",
+                "mba",
+                "mca",
+                "mce",
+                "md_clear",
+                "mmx",
+                "movbe",
+                "mpx",
+                "msr",
+                "mtrr",
+                "nonstop_tsc",
+                "nopl",
+                "nx",
+                "ospke",
+                "osxsave",
+                "pae",
+                "pat",
+                "pbe",
+                "pcid",
+                "pclmulqdq",
+                "pdcm",
+                "pdpe1gb",
+                "pebs",
+                "pge",
+                "pku",
+                "pln",
+                "pni",
+                "popcnt",
+                "pqe",
+                "pqm",
+                "pse",
+                "pse36",
+                "pts",
+                "rdrand",
+                "rdrnd",
+                "rdseed",
+                "rdt_a",
+                "rdtscp",
+                "rep_good",
+                "sdbg",
+                "sep",
+                "smap",
+                "smep",
+                "smx",
+                "ss",
+                "ssbd",
+                "sse",
+                "sse2",
+                "sse4_1",
+                "sse4_2",
+                "ssse3",
+                "stibp",
+                "syscall",
+                "tm",
+                "tm2",
+                "tpr_shadow",
+                "tsc",
+                "tsc_adjust",
+                "tsc_deadline_timer",
+                "tscdeadline",
+                "vme",
+                "vmx",
+                "vnmi",
+                "vpid",
+                "x2apic",
+                "xgetbv1",
+                "xsave",
+                "xsavec",
+                "xsaveopt",
+                "xsaves",
+                "xtopology",
+                "xtpr"
+            ],
+            "l3_cache_size": 37486592,
+            "l2_cache_size": 50331648,
+            "l1_data_cache_size": "1.5 MiB",
+            "l1_instruction_cache_size": "1.5 MiB",
+            "l2_cache_line_size": 256,
+            "l2_cache_associativity": 6
+        }
+    },
+    "commit_info": {
+        "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13",
+        "time": "2025-01-05T23:21:10+00:00",
+        "author_time": "2025-01-05T23:21:10+00:00",
+        "dirty": false,
+        "project": "gpu4pyscf",
+        "branch": "benchmark_ci"
+    },
+    "benchmarks": [
+        {
+            "group": null,
+            "name": "test_df_ub3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 6.552961312234402,
+                "max": 6.817228589206934,
+                "mean": 6.699132799791793,
+                "stddev": 0.10053109169956066,
+                "rounds": 6,
+                "median": 6.730765865184367,
+                "iqr": 0.15081804990768433,
+                "q1": 6.606128558516502,
+                "q3": 6.756946608424187,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 6.552961312234402,
+                "hd15iqr": 6.817228589206934,
+                "ops": 0.14927305218237794,
+                "total": 40.19479679875076,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_ub3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 13.294025084003806,
+                "max": 14.571726197376847,
+                "mean": 13.735415458368758,
+                "stddev": 0.5932420341119666,
+                "rounds": 6,
+                "median": 13.415598810650408,
+                "iqr": 1.1223390139639378,
+                "q1": 13.296602416783571,
+                "q3": 14.418941430747509,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 13.294025084003806,
+                "hd15iqr": 14.571726197376847,
+                "ops": 0.07280449601476865,
+                "total": 82.41249275021255,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_ub3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 93.588756557554,
+                "max": 93.588756557554,
+                "mean": 93.588756557554,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 93.588756557554,
+                "iqr": 0.0,
+                "q1": 93.588756557554,
+                "q3": 93.588756557554,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 93.588756557554,
+                "hd15iqr": 93.588756557554,
+                "ops": 0.01068504419529319,
+                "total": 93.588756557554,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 6.713842295110226,
+                "max": 7.0260709673166275,
+                "mean": 6.852823034239312,
+                "stddev": 0.11983568503202911,
+                "rounds": 6,
+                "median": 6.869919722899795,
+                "iqr": 0.19665820337831974,
+                "q1": 6.720263646915555,
+                "q3": 6.916921850293875,
+                "iqr_outliers": 0,
+                "stddev_outliers": 3,
+                "outliers": "3;0",
+                "ld15iqr": 6.713842295110226,
+                "hd15iqr": 7.0260709673166275,
+                "ops": 0.14592526247994722,
+                "total": 41.11693820543587,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 7.483015248551965,
+                "max": 7.855705849826336,
+                "mean": 7.595327176774542,
+                "stddev": 0.14647552264068445,
+                "rounds": 6,
+                "median": 7.529973562806845,
+                "iqr": 0.19051661528646946,
+                "q1": 7.491389110684395,
+                "q3": 7.681905725970864,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 7.483015248551965,
+                "hd15iqr": 7.855705849826336,
+                "ops": 0.13165989781952533,
+                "total": 45.57196306064725,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 61.551909405738115,
+                "max": 61.551909405738115,
+                "mean": 61.551909405738115,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 61.551909405738115,
+                "iqr": 0.0,
+                "q1": 61.551909405738115,
+                "q3": 61.551909405738115,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 61.551909405738115,
+                "hd15iqr": 61.551909405738115,
+                "ops": 0.016246449698387032,
+                "total": 61.551909405738115,
+                "iterations": 1
+            }
+        }
+    ],
+    "datetime": "2025-01-06T03:46:22.404689+00:00",
+    "version": "5.1.0"
+}
\ No newline at end of file
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
new file mode 100644
index 00000000..c367ac90
--- /dev/null
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -0,0 +1,289 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import pyscf
+import pytest
+from gpu4pyscf.dft import rks
+CUDA_VISIBLE_DEVICES=0  # NOTE(review): plain Python name, never read in this module; it does not set os.environ['CUDA_VISIBLE_DEVICES'] - confirm intent
+# Any task taking more than 1000s will be marked as 'slow'
+
+# How to run
+# 1. run test only
+# pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20
+
+# 2. benchmark less expensive tasks
+# pytest test_benchmark_rks.py -v -m "not slow"
+
+# 3. benchmark all the tests
+# pytest test_benchmark_rks.py -v
+
+# 4. save benchmark results
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100
+
+# 5. compare benchmark results, fail if performance regresses by more than 10%
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=benchmark_results/
+
+current_folder = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this test module
+small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')  # XYZ file passed as `atom` by the tests without a size suffix
+medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz')  # XYZ file passed as `atom` by the *_medium tests
+large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz')  # XYZ file passed as `atom` by the *_large tests
+
+def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    return mf.kernel()
+
+def run_rb3lyp_grad(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.kernel()
+    g = mf.nuc_grad_method().kernel()
+    return g
+
+def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.conv_tol_cpscf = 1e-6
+    mf.kernel()
+    hobj = mf.Hessian()
+    if with_df:
+        hobj.auxbasis_response = 2
+    h = hobj.kernel()
+    return h
+
+#######
+# DF
+#######
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_rb3lyp_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.7587394873290885, atol=1e-4, rtol=1e-16)
+
+################
+# Direct SCF
+################
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_rb3lyp_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16)
+
+####################
+# Medium molecule
+####################
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_medium(benchmark):
+    e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp medium')
+    assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_grad_medium(benchmark):
+    g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad medium')
+    assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_rb3lyp_hessian_medium(benchmark):
+    h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian medium')
+    assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16)
+
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_rb3lyp_medium(benchmark):
+    e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp medium')
+    assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_rb3lyp_grad_medium(benchmark):
+    g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad medium')
+    assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16)
+@pytest.mark.slow
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_rb3lyp_hessian_medium(benchmark):
+    h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian medium')
+    assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4, rtol=1e-16)
+
+####################
+# large molecule
+####################
+@pytest.mark.high_memory
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp large')
+    assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16)
+@pytest.mark.high_memory
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad large')
+    assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16)
+@pytest.mark.high_memory
+@pytest.mark.slow
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_rb3lyp_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian large')
+    assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16)
+@pytest.mark.slow
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp large')
+    assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16)
+@pytest.mark.slow
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad large')
+    assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5, rtol=1e-16)
+
+# Hessian for large molecule with large basis set is too slow
+'''
+@pytest.mark.slow
+@pytest.mark.benchmark
+def test_rb3lyp_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian large')
+    print(np.linalg.norm(h))
+'''
+
+#####################
+# Small basis set
+#####################
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_631gs(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs')
+    assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_631gs_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs grad')
+    assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_rb3lyp_631gs_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs hessian')
+    assert np.isclose(np.linalg.norm(h), 3.9071846157996553, atol=1e-4, rtol=1e-16)
+
+#########################################
+# Small basis set for large molecule
+#########################################
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp_631gs_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
+    print('testing rb3lyp 631gs large')
+    assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_rb3lyp_631gs_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False)
+    print('testing df rb3lyp 631gs grad large')
+    assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16)
+@pytest.mark.slow
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_rb3lyp_631gs_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False)
+    print('testing df rb3lyp 631gs hessian large')
+    assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4, rtol=1e-16)
+
+###################
+# Solvent model
+###################
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_631gs_solvent(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent')
+    assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_rb3lyp_631gs_solvent_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent grad')
+    assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent hessian')
+    assert np.isclose(np.linalg.norm(h), 3.8991230592666737, atol=1e-4, rtol=1e-16)
+
+# No need to test d3bj generally
+'''
+# b3lyp d3bj
+@pytest.mark.benchmark
+def test_df_rb3lyp_631gs_d3bj(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent')
+    assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7)
+@pytest.mark.benchmark
+def test_df_rb3lyp_631gs_d3bj_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent grad')
+    assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5)
+@pytest.mark.benchmark
+def test_df_rb3lyp_631gs_d3bj_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent hessian')
+    assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4)
+'''
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
new file mode 100644
index 00000000..236a433b
--- /dev/null
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -0,0 +1,100 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import pyscf
+import pytest
+from gpu4pyscf.dft import uks
+
+current_folder = os.path.dirname(os.path.abspath(__file__))
+small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
+
+def _build_ub3lyp(atom, basis, with_df, with_solvent):
+    """Build the UKS/B3LYP object shared by the three benchmark runners.
+
+    atom and basis are forwarded to pyscf.M (the tests in this file pass the
+    path of an .xyz file as atom). with_df switches on density fitting and
+    with_solvent attaches PCM with the 'IEF-PCM' method. The (99,590) atom
+    grid and the 1e-10 SCF tolerance are applied in every case. The SCF itself
+    is not run here; callers invoke kernel() on the returned object.
+    """
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = uks.UKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    return mf
+
+def run_ub3lyp(atom, basis, with_df, with_solvent):
+    """Run the SCF and return whatever mf.kernel() returns."""
+    return _build_ub3lyp(atom, basis, with_df, with_solvent).kernel()
+
+def run_ub3lyp_grad(atom, basis, with_df, with_solvent):
+    """Run the SCF, then return the result of the nuclear-gradient kernel."""
+    mf = _build_ub3lyp(atom, basis, with_df, with_solvent)
+    mf.kernel()
+    return mf.nuc_grad_method().kernel()
+
+def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
+    """Run the SCF, then return the result of the Hessian kernel.
+
+    A CPSCF tolerance of 1e-6 is set before the SCF, and density-fitted runs
+    set auxbasis_response to 2 on the Hessian object.
+    """
+    mf = _build_ub3lyp(atom, basis, with_df, with_solvent)
+    mf.conv_tol_cpscf = 1e-6
+    mf.kernel()
+    hobj = mf.Hessian()
+    if with_df:
+        hobj.auxbasis_response = 2
+    return hobj.kernel()
+
+##########
+# UKS
+##########
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_ub3lyp(benchmark):  # density-fitted, def2-tzvpp, no solvent: energy
+    energy = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp')
+    assert np.isclose(np.linalg.norm(energy), 684.9998712035856, rtol=1e-16, atol=1e-7)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_df_ub3lyp_grad(benchmark):  # density-fitted, def2-tzvpp, no solvent: gradient
+    grad = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp grad')
+    assert np.isclose(np.linalg.norm(grad), 0.17435842214665462, rtol=1e-16, atol=1e-5)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_df_ub3lyp_hessian(benchmark):  # density-fitted, def2-tzvpp, no solvent: Hessian
+    hess = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp hessian')
+    assert np.isclose(np.linalg.norm(hess), 3.758810345806532, rtol=1e-16, atol=1e-4)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_ub3lyp(benchmark):  # no density fitting, 6-31gs, no solvent: energy
+    energy = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False)
+    print('testing ub3lyp')
+    assert np.isclose(np.linalg.norm(energy), 684.6643858622429, rtol=1e-16, atol=1e-7)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+def test_ub3lyp_grad(benchmark):  # no density fitting, 6-31gs, no solvent: gradient
+    grad = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False)
+    print('testing ub3lyp grad')
+    assert np.isclose(np.linalg.norm(grad), 0.17540045665419984, rtol=1e-16, atol=1e-5)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
+def test_ub3lyp_hessian(benchmark):  # no density fitting, 6-31gs, no solvent: Hessian
+    hess = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False)
+    print('testing ub3lyp hessian')
+    assert np.isclose(np.linalg.norm(hess), 3.907289414559395, rtol=1e-16, atol=1e-4)
diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py
deleted file mode 100644
index d6f09839..00000000
--- a/gpu4pyscf/tests/test_dft.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-import pyscf
-import pytest
-import cupy
-from gpu4pyscf.dft import rks, uks
-
-def setUpModule():
-    global mol
-    atom = '''
-C                 -0.07551087    1.68127663   -0.10745193
-O                  1.33621755    1.87147409   -0.39326987
-C                  1.67074668    2.95729545    0.49387976
-C                  0.41740763    3.77281969    0.78495878
-C                 -0.60481480    3.07572636    0.28906224
-H                 -0.19316298    1.01922455    0.72486113
-O                  0.35092043    5.03413298    1.45545728
-H                  0.42961487    5.74279041    0.81264173
-O                 -1.95331750    3.53349874    0.15912025
-H                 -2.55333895    2.78846397    0.23972698
-O                  2.81976302    3.20110148    0.94542226
-C                 -0.81772499    1.09230218   -1.32146482
-H                 -0.70955636    1.74951833   -2.15888136
-C                 -2.31163857    0.93420736   -0.98260166
-H                 -2.72575463    1.89080093   -0.74107186
-H                 -2.41980721    0.27699120   -0.14518512
-O                 -0.26428017   -0.18613595   -1.64425697
-H                 -0.72695910   -0.55328886   -2.40104423
-O                 -3.00083741    0.38730252   -2.10989934
-H                 -3.93210821    0.28874990   -1.89865997
-'''
-
-    mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
-
-def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
-
-class KnownValues(unittest.TestCase):
-    @pytest.mark.smoke
-    def test_b3lyp_with_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-    @pytest.mark.smoke
-    def test_b3lyp_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS(self):
-        print('------- DFUKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965349493) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4
-
-    @pytest.mark.smoke
-    def test_RKS(self):
-        print('-------- RKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_UKS(self):
-        print('-------- UKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFRKS_with_SMD(self):
-        print('----- DFRKS with SMD -----')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0578838805443) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS_with_SMD(self):
-        print('------- DFUKS with SMD ---------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.05788388063) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4
-
-if __name__ == "__main__":
-    print("Full Smoke Tests")
-    unittest.main()
diff --git a/setup.py b/setup.py
index edbe56c1..c0aa6f5c 100755
--- a/setup.py
+++ b/setup.py
@@ -134,7 +134,7 @@ def initialize_with_default_plat_name(self):
     ],
     cmdclass={'build_py': CMakeBuildPy},
     install_requires=[
-        'pyscf~=2.7.0',
+        'pyscf~=2.8.0',
         'pyscf-dispersion',
         f'cupy-cuda{CUDA_VERSION}>=13.0', # Due to expm in cupyx.scipy.linalg and cutensor 2.0
         'geometric',