diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 93346e36..1bd1312d 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -4,7 +4,7 @@
 GB = 1024*1024*1024
 # such as A100-80G
 if props['totalGlobalMem'] >= 64 * GB:
-    min_ao_blksize = 256
+    min_ao_blksize = 128
     min_grid_blksize = 128*128
     ao_aligned = 32
     grid_aligned = 128
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 11890d28..771060af 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,5 +1,5 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.16'
+__version__ = '0.6.17'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 14294843..f7e3217e 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -223,20 +223,21 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
         nj = j1 - j0
         if sr_only:
             # TODO: in-place implementation or short-range kernel
-            ints_slices = cupy.empty([naoaux, nj, ni], order='C')
+            ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
             for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                 k0 = intopt.sph_aux_loc[cp_kl_id]
                 k1 = intopt.sph_aux_loc[cp_kl_id+1]
                 int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1])
             if omega is not None:
-                ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C')
+                ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C')
                 for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                     k0 = intopt.sph_aux_loc[cp_kl_id]
                     k1 = intopt.sph_aux_loc[cp_kl_id+1]
                     int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega)
                 ints_slices -= ints_slices_lr
         else:
-            ints_slices = cupy.empty([naoaux, nj, ni], order='C')
+            # Initialization is required due to cutensor operations later
+            ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
             for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
                 k0 = intopt.sph_aux_loc[cp_kl_id]
                 k1 = intopt.sph_aux_loc[cp_kl_id+1]
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index ce29c904..f7b531ec 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -64,7 +64,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     mo_occ = cupy.asarray(mf_grad.base.mo_occ)
     sph_ao_idx = intopt.sph_ao_idx
     dm = take_last2d(dm0, sph_ao_idx)
-    orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0]))
+    orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
     orbo = orbo[sph_ao_idx, :]
     nocc = orbo.shape[-1]
 
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 6a82e42d..42eea65c 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -376,8 +376,10 @@ def cart2sph(t, axis=0, ang=1, out=None):
     t_cart = t.reshape([i0*nli, li_size[0], i3])
     if(out is not None):
         out = out.reshape([i0*nli, li_size[1], i3])
-    t_sph = contract('min,ip->mpn', t_cart, c2s, out=out)
-    return t_sph.reshape(out_shape)
+        out[:] = cupy.einsum('min,ip->mpn', t_cart, c2s)
+    else:
+        out = cupy.einsum('min,ip->mpn', t_cart, c2s)
+    return out.reshape(out_shape)
 
 # a copy with modification from
 # https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index 99dd194f..a54c1ffe 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -34,7 +34,7 @@
 except ImportError:
     cutensor = None
     CUTENSOR_ALGO_DEFAULT = None
-    
+
 def _create_mode_with_cache(mode):
     integer_mode = []
     for x in mode:
diff --git a/gpu4pyscf/lib/dftd3.py b/gpu4pyscf/lib/dftd3.py
index 85e38eae..cda54158 100644
--- a/gpu4pyscf/lib/dftd3.py
+++ b/gpu4pyscf/lib/dftd3.py
@@ -30,14 +30,19 @@
     "d3op":   libdftd3.dftd3_load_optimizedpower_damping #OptimizedPowerDampingParam,
 }
 
-libdftd3.dftd3_new_error.restype                   = ctypes.c_void_p
-libdftd3.dftd3_new_structure.restype               = ctypes.c_void_p
-libdftd3.dftd3_load_optimizedpower_damping.restype = ctypes.c_void_p
-libdftd3.dftd3_load_mzero_damping.restype          = ctypes.c_void_p
-libdftd3.dftd3_load_mrational_damping.restype      = ctypes.c_void_p
-libdftd3.dftd3_load_zero_damping.restype           = ctypes.c_void_p
-libdftd3.dftd3_load_rational_damping.restype       = ctypes.c_void_p
-libdftd3.dftd3_new_d3_model.restype                = ctypes.c_void_p
+class _d3_restype(ctypes.Structure):
+    pass
+
+_d3_p = ctypes.POINTER(_d3_restype)
+
+libdftd3.dftd3_new_error.restype                   = _d3_p
+libdftd3.dftd3_new_structure.restype               = _d3_p
+libdftd3.dftd3_load_optimizedpower_damping.restype = _d3_p
+libdftd3.dftd3_load_mzero_damping.restype          = _d3_p
+libdftd3.dftd3_load_mrational_damping.restype      = _d3_p
+libdftd3.dftd3_load_zero_damping.restype           = _d3_p
+libdftd3.dftd3_load_rational_damping.restype       = _d3_p
+libdftd3.dftd3_new_d3_model.restype                = _d3_p
 
 class DFTD3Dispersion(lib.StreamObject):
     def __init__(self, mol, xc, version='d3bj', atm=False):
@@ -64,15 +69,14 @@ def __init__(self, mol, xc, version='d3bj', atm=False):
             err,
             ctypes.create_string_buffer(xc.encode(), size=50),
             ctypes.c_bool(atm))
-        libdftd3.dftd3_delete_error(err)
+        libdftd3.dftd3_delete_error(ctypes.byref(err))
 
     def __del__(self):
         err = libdftd3.dftd3_new_error()
-        param = ctypes.cast(self._param, ctypes.c_void_p)
-        libdftd3.dftd3_delete_param(ctypes.byref(param))
-        libdftd3.dftd3_delete_structure(err, self._mol)
-        libdftd3.dftd3_delete_model(err, self._disp)
-        libdftd3.dftd3_delete_error(err)
+        libdftd3.dftd3_delete_param(ctypes.byref(self._param))
+        libdftd3.dftd3_delete_structure(err, ctypes.byref(self._mol))
+        libdftd3.dftd3_delete_model(err, ctypes.byref(self._disp))
+        libdftd3.dftd3_delete_error(ctypes.byref(err))
 
     def get_dispersion(self, grad=False):
         res = {}
@@ -102,5 +106,5 @@ def get_dispersion(self, grad=False):
             res.update(gradient=_gradient)
         if _sigma is not None:
             res.update(virial=_sigma)
-        libdftd3.dftd3_delete_error(err)
+        libdftd3.dftd3_delete_error(ctypes.byref(err))
         return res
\ No newline at end of file
diff --git a/gpu4pyscf/lib/dftd4.py b/gpu4pyscf/lib/dftd4.py
index 47defa15..61ad2bc2 100644
--- a/gpu4pyscf/lib/dftd4.py
+++ b/gpu4pyscf/lib/dftd4.py
@@ -20,10 +20,15 @@
 
 libdftd4 = np.ctypeslib.load_library('libdftd4',  os.path.abspath(os.path.join(__file__, '..', 'deps', 'lib')))
 
-libdftd4.dftd4_new_error.restype             = ctypes.c_void_p
-libdftd4.dftd4_new_structure.restype         = ctypes.c_void_p
-libdftd4.dftd4_new_d4_model.restype          = ctypes.c_void_p
-libdftd4.dftd4_load_rational_damping.restype = ctypes.c_void_p
+class _d4_restype(ctypes.Structure):
+    pass
+
+_d4_p = ctypes.POINTER(_d4_restype)
+
+libdftd4.dftd4_new_error.restype             = _d4_p
+libdftd4.dftd4_new_structure.restype         = _d4_p
+libdftd4.dftd4_new_d4_model.restype          = _d4_p
+libdftd4.dftd4_load_rational_damping.restype = _d4_p
 
 class DFTD4Dispersion(lib.StreamObject):
     def __init__(self, mol, xc, atm=False):
@@ -52,15 +57,14 @@ def __init__(self, mol, xc, atm=False):
             err,
             ctypes.create_string_buffer(xc.encode(), size=50),
             ctypes.c_bool(atm))
-        libdftd4.dftd4_delete_error(err)
+        libdftd4.dftd4_delete_error(ctypes.byref(err))
 
     def __del__(self):
         err = libdftd4.dftd4_new_error()
-        param = ctypes.cast(self._param, ctypes.c_void_p)
-        libdftd4.dftd4_delete_param(ctypes.byref(param))
-        libdftd4.dftd4_delete_structure(err, self._mol)
-        libdftd4.dftd4_delete_model(err, self._disp)
-        libdftd4.dftd4_delete_error(err)
+        libdftd4.dftd4_delete_param(ctypes.byref(self._param))
+        libdftd4.dftd4_delete_structure(err, ctypes.byref(self._mol))
+        libdftd4.dftd4_delete_model(err, ctypes.byref(self._disp))
+        libdftd4.dftd4_delete_error(ctypes.byref(err))
 
     def get_dispersion(self, grad=False):
         res = {}
@@ -90,5 +94,5 @@ def get_dispersion(self, grad=False):
             res.update(gradient=_gradient)
         if _sigma is not None:
             res.update(virial=_sigma)
-        libdftd4.dftd4_delete_error(err)
+        libdftd4.dftd4_delete_error(ctypes.byref(err))
         return res
\ No newline at end of file
diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py
index 3851acc0..5421ad39 100644
--- a/gpu4pyscf/qmmm/chelpg.py
+++ b/gpu4pyscf/qmmm/chelpg.py
@@ -230,10 +230,10 @@ def build(self, cutoff=1e-14, group_size=None,
         ncptype = len(log_qs)
 
         self.bpcache = ctypes.POINTER(BasisProdCache)()
-        if diag_block_with_triu:
-            scale_shellpair_diag = 1.
-        else:
-            scale_shellpair_diag = 0.5
+        #if diag_block_with_triu:
+        scale_shellpair_diag = 1.
+        #else:
+        #    scale_shellpair_diag = 0.5
         libgint.GINTinit_basis_prod(
             ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
             ao_loc.ctypes.data_as(ctypes.c_void_p),
diff --git a/setup.py b/setup.py
index 8ea88dd2..86cfe5d9 100755
--- a/setup.py
+++ b/setup.py
@@ -125,7 +125,7 @@ def initialize_with_default_plat_name(self):
     cmdclass={'build_py': CMakeBuildPy},
     install_requires=[
         'pyscf>=2.4.0',
-        f'cupy-cuda{CUDA_VERSION}>=12.0',
+        f'cupy-cuda{CUDA_VERSION}>=12.3',
         'geometric',
         f'gpu4pyscf-libxc-cuda{CUDA_VERSION}',
     ]