diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 93346e36..1bd1312d 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -4,7 +4,7 @@ GB = 1024*1024*1024 # such as A100-80G if props['totalGlobalMem'] >= 64 * GB: - min_ao_blksize = 256 + min_ao_blksize = 128 min_grid_blksize = 128*128 ao_aligned = 32 grid_aligned = 128 diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 11890d28..771060af 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1,5 +1,5 @@ from . import lib, grad, hessian, solvent, scf, dft -__version__ = '0.6.16' +__version__ = '0.6.17' # monkey patch libxc reference due to a bug in nvcc from pyscf.dft import libxc diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 14294843..f7e3217e 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -223,20 +223,21 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): nj = j1 - j0 if sr_only: # TODO: in-place implementation or short-range kernel - ints_slices = cupy.empty([naoaux, nj, ni], order='C') + ints_slices = cupy.zeros([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1]) if omega is not None: - ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C') + ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega) ints_slices -= ints_slices_lr else: - ints_slices = cupy.empty([naoaux, nj, ni], order='C') + # Initialization is required due to cutensor operations later + ints_slices = cupy.zeros([naoaux, nj, ni], order='C') for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index ce29c904..f7b531ec 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -64,7 +64,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_occ = cupy.asarray(mf_grad.base.mo_occ) sph_ao_idx = intopt.sph_ao_idx dm = take_last2d(dm0, sph_ao_idx) - orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0])) + orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5 orbo = orbo[sph_ao_idx, :] nocc = orbo.shape[-1] diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 6a82e42d..42eea65c 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -376,8 +376,10 @@ def cart2sph(t, axis=0, ang=1, out=None): t_cart = t.reshape([i0*nli, li_size[0], i3]) if(out is not None): out = out.reshape([i0*nli, li_size[1], i3]) - t_sph = contract('min,ip->mpn', t_cart, c2s, out=out) - return t_sph.reshape(out_shape) + out[:] = cupy.einsum('min,ip->mpn', t_cart, c2s) + else: + out = cupy.einsum('min,ip->mpn', t_cart, c2s) + return out.reshape(out_shape) # a copy with modification from # https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536 diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index 99dd194f..a54c1ffe 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -34,7 +34,7 @@ except ImportError: cutensor = None CUTENSOR_ALGO_DEFAULT = None - + def _create_mode_with_cache(mode): integer_mode = [] for x in mode: diff --git a/gpu4pyscf/lib/dftd3.py b/gpu4pyscf/lib/dftd3.py index 85e38eae..cda54158 100644 --- a/gpu4pyscf/lib/dftd3.py +++ b/gpu4pyscf/lib/dftd3.py @@ -30,14 +30,19 @@ "d3op": libdftd3.dftd3_load_optimizedpower_damping #OptimizedPowerDampingParam, } -libdftd3.dftd3_new_error.restype = ctypes.c_void_p -libdftd3.dftd3_new_structure.restype = ctypes.c_void_p -libdftd3.dftd3_load_optimizedpower_damping.restype = ctypes.c_void_p -libdftd3.dftd3_load_mzero_damping.restype = ctypes.c_void_p -libdftd3.dftd3_load_mrational_damping.restype = ctypes.c_void_p -libdftd3.dftd3_load_zero_damping.restype = ctypes.c_void_p -libdftd3.dftd3_load_rational_damping.restype = ctypes.c_void_p -libdftd3.dftd3_new_d3_model.restype = ctypes.c_void_p +class _d3_restype(ctypes.Structure): + pass + +_d3_p = ctypes.POINTER(_d3_restype) + +libdftd3.dftd3_new_error.restype = _d3_p +libdftd3.dftd3_new_structure.restype = _d3_p +libdftd3.dftd3_load_optimizedpower_damping.restype = _d3_p +libdftd3.dftd3_load_mzero_damping.restype = _d3_p +libdftd3.dftd3_load_mrational_damping.restype = _d3_p +libdftd3.dftd3_load_zero_damping.restype = _d3_p +libdftd3.dftd3_load_rational_damping.restype = _d3_p +libdftd3.dftd3_new_d3_model.restype = _d3_p class DFTD3Dispersion(lib.StreamObject): def __init__(self, mol, xc, version='d3bj', atm=False): @@ -64,15 +69,14 @@ def __init__(self, mol, xc, version='d3bj', atm=False): err, ctypes.create_string_buffer(xc.encode(), size=50), ctypes.c_bool(atm)) - libdftd3.dftd3_delete_error(err) + libdftd3.dftd3_delete_error(ctypes.byref(err)) def __del__(self): err = libdftd3.dftd3_new_error() - param = ctypes.cast(self._param, ctypes.c_void_p) - libdftd3.dftd3_delete_param(ctypes.byref(param)) - libdftd3.dftd3_delete_structure(err, self._mol) - libdftd3.dftd3_delete_model(err, self._disp) - libdftd3.dftd3_delete_error(err) + libdftd3.dftd3_delete_param(ctypes.byref(self._param)) + libdftd3.dftd3_delete_structure(err, ctypes.byref(self._mol)) + libdftd3.dftd3_delete_model(err, ctypes.byref(self._disp)) + libdftd3.dftd3_delete_error(ctypes.byref(err)) def get_dispersion(self, grad=False): res = {} @@ -102,5 +106,5 @@ def get_dispersion(self, grad=False): res.update(gradient=_gradient) if _sigma is not None: res.update(virial=_sigma) - libdftd3.dftd3_delete_error(err) + libdftd3.dftd3_delete_error(ctypes.byref(err)) return res \ No newline at end of file diff --git a/gpu4pyscf/lib/dftd4.py b/gpu4pyscf/lib/dftd4.py index 47defa15..61ad2bc2 100644 --- a/gpu4pyscf/lib/dftd4.py +++ b/gpu4pyscf/lib/dftd4.py @@ -20,10 +20,15 @@ libdftd4 = np.ctypeslib.load_library('libdftd4', os.path.abspath(os.path.join(__file__, '..', 'deps', 'lib'))) -libdftd4.dftd4_new_error.restype = ctypes.c_void_p -libdftd4.dftd4_new_structure.restype = ctypes.c_void_p -libdftd4.dftd4_new_d4_model.restype = ctypes.c_void_p -libdftd4.dftd4_load_rational_damping.restype = ctypes.c_void_p +class _d4_restype(ctypes.Structure): + pass + +_d4_p = ctypes.POINTER(_d4_restype) + +libdftd4.dftd4_new_error.restype = _d4_p +libdftd4.dftd4_new_structure.restype = _d4_p +libdftd4.dftd4_new_d4_model.restype = _d4_p +libdftd4.dftd4_load_rational_damping.restype = _d4_p class DFTD4Dispersion(lib.StreamObject): def __init__(self, mol, xc, atm=False): @@ -52,15 +57,14 @@ def __init__(self, mol, xc, atm=False): err, ctypes.create_string_buffer(xc.encode(), size=50), ctypes.c_bool(atm)) - libdftd4.dftd4_delete_error(err) + libdftd4.dftd4_delete_error(ctypes.byref(err)) def __del__(self): err = libdftd4.dftd4_new_error() - param = ctypes.cast(self._param, ctypes.c_void_p) - libdftd4.dftd4_delete_param(ctypes.byref(param)) - libdftd4.dftd4_delete_structure(err, self._mol) - libdftd4.dftd4_delete_model(err, self._disp) - libdftd4.dftd4_delete_error(err) + libdftd4.dftd4_delete_param(ctypes.byref(self._param)) + libdftd4.dftd4_delete_structure(err, ctypes.byref(self._mol)) + libdftd4.dftd4_delete_model(err, ctypes.byref(self._disp)) + libdftd4.dftd4_delete_error(ctypes.byref(err)) def get_dispersion(self, grad=False): res = {} @@ -90,5 +94,5 @@ def get_dispersion(self, grad=False): res.update(gradient=_gradient) if _sigma is not None: res.update(virial=_sigma) - libdftd4.dftd4_delete_error(err) + libdftd4.dftd4_delete_error(ctypes.byref(err)) return res \ No newline at end of file diff --git a/gpu4pyscf/qmmm/chelpg.py b/gpu4pyscf/qmmm/chelpg.py index 3851acc0..5421ad39 100644 --- a/gpu4pyscf/qmmm/chelpg.py +++ b/gpu4pyscf/qmmm/chelpg.py @@ -230,10 +230,10 @@ def build(self, cutoff=1e-14, group_size=None, ncptype = len(log_qs) self.bpcache = ctypes.POINTER(BasisProdCache)() - if diag_block_with_triu: - scale_shellpair_diag = 1. - else: - scale_shellpair_diag = 0.5 + #if diag_block_with_triu: + scale_shellpair_diag = 1. + #else: + # scale_shellpair_diag = 0.5 libgint.GINTinit_basis_prod( ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag), ao_loc.ctypes.data_as(ctypes.c_void_p), diff --git a/setup.py b/setup.py index 8ea88dd2..86cfe5d9 100755 --- a/setup.py +++ b/setup.py @@ -125,7 +125,7 @@ def initialize_with_default_plat_name(self): cmdclass={'build_py': CMakeBuildPy}, install_requires=[ 'pyscf>=2.4.0', - f'cupy-cuda{CUDA_VERSION}>=12.0', + f'cupy-cuda{CUDA_VERSION}>=12.3', 'geometric', f'gpu4pyscf-libxc-cuda{CUDA_VERSION}', ]