Skip to content

Commit

Permalink
fixed memory leak in dftd3, debug for A800
Browse files Browse the repository at this point in the history
  • Loading branch information
wxj6000 committed Jan 18, 2024
1 parent beb05b9 commit e0e1dd6
Show file tree
Hide file tree
Showing 10 changed files with 51 additions and 40 deletions.
2 changes: 1 addition & 1 deletion gpu4pyscf/__config__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
GB = 1024*1024*1024
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
min_ao_blksize = 256
min_ao_blksize = 128
min_grid_blksize = 128*128
ao_aligned = 32
grid_aligned = 128
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from . import lib, grad, hessian, solvent, scf, dft
__version__ = '0.6.16'
__version__ = '0.6.17'

# monkey patch libxc reference due to a bug in nvcc
from pyscf.dft import libxc
Expand Down
7 changes: 4 additions & 3 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,20 +223,21 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
nj = j1 - j0
if sr_only:
# TODO: in-place implementation or short-range kernel
ints_slices = cupy.empty([naoaux, nj, ni], order='C')
ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1])
if omega is not None:
ints_slices_lr = cupy.empty([naoaux, nj, ni], order='C')
ints_slices_lr = cupy.zeros([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega)
ints_slices -= ints_slices_lr
else:
ints_slices = cupy.empty([naoaux, nj, ni], order='C')
# Initialization is required due to cutensor operations later
ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
k1 = intopt.sph_aux_loc[cp_kl_id+1]
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/df/grad/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
mo_occ = cupy.asarray(mf_grad.base.mo_occ)
sph_ao_idx = intopt.sph_ao_idx
dm = take_last2d(dm0, sph_ao_idx)
orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0]))
orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
orbo = orbo[sph_ao_idx, :]
nocc = orbo.shape[-1]

Expand Down
6 changes: 4 additions & 2 deletions gpu4pyscf/lib/cupy_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,10 @@ def cart2sph(t, axis=0, ang=1, out=None):
t_cart = t.reshape([i0*nli, li_size[0], i3])
if(out is not None):
out = out.reshape([i0*nli, li_size[1], i3])
t_sph = contract('min,ip->mpn', t_cart, c2s, out=out)
return t_sph.reshape(out_shape)
out[:] = cupy.einsum('min,ip->mpn', t_cart, c2s)
else:
out = cupy.einsum('min,ip->mpn', t_cart, c2s)
return out.reshape(out_shape)

# a copy with modification from
# https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/lib/cutensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
except ImportError:
cutensor = None
CUTENSOR_ALGO_DEFAULT = None

def _create_mode_with_cache(mode):
integer_mode = []
for x in mode:
Expand Down
34 changes: 19 additions & 15 deletions gpu4pyscf/lib/dftd3.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,19 @@
"d3op": libdftd3.dftd3_load_optimizedpower_damping #OptimizedPowerDampingParam,
}

libdftd3.dftd3_new_error.restype = ctypes.c_void_p
libdftd3.dftd3_new_structure.restype = ctypes.c_void_p
libdftd3.dftd3_load_optimizedpower_damping.restype = ctypes.c_void_p
libdftd3.dftd3_load_mzero_damping.restype = ctypes.c_void_p
libdftd3.dftd3_load_mrational_damping.restype = ctypes.c_void_p
libdftd3.dftd3_load_zero_damping.restype = ctypes.c_void_p
libdftd3.dftd3_load_rational_damping.restype = ctypes.c_void_p
libdftd3.dftd3_new_d3_model.restype = ctypes.c_void_p
class _d3_restype(ctypes.Structure):
pass

_d3_p = ctypes.POINTER(_d3_restype)

libdftd3.dftd3_new_error.restype = _d3_p
libdftd3.dftd3_new_structure.restype = _d3_p
libdftd3.dftd3_load_optimizedpower_damping.restype = _d3_p
libdftd3.dftd3_load_mzero_damping.restype = _d3_p
libdftd3.dftd3_load_mrational_damping.restype = _d3_p
libdftd3.dftd3_load_zero_damping.restype = _d3_p
libdftd3.dftd3_load_rational_damping.restype = _d3_p
libdftd3.dftd3_new_d3_model.restype = _d3_p

class DFTD3Dispersion(lib.StreamObject):
def __init__(self, mol, xc, version='d3bj', atm=False):
Expand All @@ -64,15 +69,14 @@ def __init__(self, mol, xc, version='d3bj', atm=False):
err,
ctypes.create_string_buffer(xc.encode(), size=50),
ctypes.c_bool(atm))
libdftd3.dftd3_delete_error(err)
libdftd3.dftd3_delete_error(ctypes.byref(err))

def __del__(self):
err = libdftd3.dftd3_new_error()
param = ctypes.cast(self._param, ctypes.c_void_p)
libdftd3.dftd3_delete_param(ctypes.byref(param))
libdftd3.dftd3_delete_structure(err, self._mol)
libdftd3.dftd3_delete_model(err, self._disp)
libdftd3.dftd3_delete_error(err)
libdftd3.dftd3_delete_param(ctypes.byref(self._param))
libdftd3.dftd3_delete_structure(err, ctypes.byref(self._mol))
libdftd3.dftd3_delete_model(err, ctypes.byref(self._disp))
libdftd3.dftd3_delete_error(ctypes.byref(err))

def get_dispersion(self, grad=False):
res = {}
Expand Down Expand Up @@ -102,5 +106,5 @@ def get_dispersion(self, grad=False):
res.update(gradient=_gradient)
if _sigma is not None:
res.update(virial=_sigma)
libdftd3.dftd3_delete_error(err)
libdftd3.dftd3_delete_error(ctypes.byref(err))
return res
26 changes: 15 additions & 11 deletions gpu4pyscf/lib/dftd4.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@

libdftd4 = np.ctypeslib.load_library('libdftd4', os.path.abspath(os.path.join(__file__, '..', 'deps', 'lib')))

libdftd4.dftd4_new_error.restype = ctypes.c_void_p
libdftd4.dftd4_new_structure.restype = ctypes.c_void_p
libdftd4.dftd4_new_d4_model.restype = ctypes.c_void_p
libdftd4.dftd4_load_rational_damping.restype = ctypes.c_void_p
class _d4_restype(ctypes.Structure):
pass

_d4_p = ctypes.POINTER(_d4_restype)

libdftd4.dftd4_new_error.restype = _d4_p
libdftd4.dftd4_new_structure.restype = _d4_p
libdftd4.dftd4_new_d4_model.restype = _d4_p
libdftd4.dftd4_load_rational_damping.restype = _d4_p

class DFTD4Dispersion(lib.StreamObject):
def __init__(self, mol, xc, atm=False):
Expand Down Expand Up @@ -52,15 +57,14 @@ def __init__(self, mol, xc, atm=False):
err,
ctypes.create_string_buffer(xc.encode(), size=50),
ctypes.c_bool(atm))
libdftd4.dftd4_delete_error(err)
libdftd4.dftd4_delete_error(ctypes.byref(err))

def __del__(self):
err = libdftd4.dftd4_new_error()
param = ctypes.cast(self._param, ctypes.c_void_p)
libdftd4.dftd4_delete_param(ctypes.byref(param))
libdftd4.dftd4_delete_structure(err, self._mol)
libdftd4.dftd4_delete_model(err, self._disp)
libdftd4.dftd4_delete_error(err)
libdftd4.dftd4_delete_param(ctypes.byref(self._param))
libdftd4.dftd4_delete_structure(err, ctypes.byref(self._mol))
libdftd4.dftd4_delete_model(err, ctypes.byref(self._disp))
libdftd4.dftd4_delete_error(ctypes.byref(err))

def get_dispersion(self, grad=False):
res = {}
Expand Down Expand Up @@ -90,5 +94,5 @@ def get_dispersion(self, grad=False):
res.update(gradient=_gradient)
if _sigma is not None:
res.update(virial=_sigma)
libdftd4.dftd4_delete_error(err)
libdftd4.dftd4_delete_error(ctypes.byref(err))
return res
8 changes: 4 additions & 4 deletions gpu4pyscf/qmmm/chelpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,10 @@ def build(self, cutoff=1e-14, group_size=None,
ncptype = len(log_qs)

self.bpcache = ctypes.POINTER(BasisProdCache)()
if diag_block_with_triu:
scale_shellpair_diag = 1.
else:
scale_shellpair_diag = 0.5
#if diag_block_with_triu:
scale_shellpair_diag = 1.
#else:
# scale_shellpair_diag = 0.5
libgint.GINTinit_basis_prod(
ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
ao_loc.ctypes.data_as(ctypes.c_void_p),
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def initialize_with_default_plat_name(self):
cmdclass={'build_py': CMakeBuildPy},
install_requires=[
'pyscf>=2.4.0',
f'cupy-cuda{CUDA_VERSION}>=12.0',
f'cupy-cuda{CUDA_VERSION}>=12.3',
'geometric',
f'gpu4pyscf-libxc-cuda{CUDA_VERSION}',
]
Expand Down

0 comments on commit e0e1dd6

Please sign in to comment.