Merge branch 'master' into int1e_1st_derivative
henryw7 committed Dec 12, 2024
2 parents db718d0 + 010ca2d commit 213331e
Showing 99 changed files with 51,328 additions and 54,685 deletions.
11 changes: 6 additions & 5 deletions README.md
@@ -53,7 +53,8 @@ Features
 - MP2/DF-MP2 and CCSD (experimental);
 - Polarizability, IR, and NMR shielding (experimental);
 - QM/MM with PBC;
-- CHELPG, ESP, and RESP atomic charge
+- CHELPG, ESP, and RESP atomic charge;
+- Multi-GPU for density fitting (experimental)
 
 Limitations
 --------
@@ -134,22 +135,22 @@ References
 ---------
 ```
 @misc{li2024introducting,
-    title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework}, 
+    title={Introducing GPU-acceleration into the Python-based Simulations of Chemistry Framework},
     author={Rui Li and Qiming Sun and Xing Zhang and Garnet Kin-Lic Chan},
     year={2024},
     eprint={2407.09700},
     archivePrefix={arXiv},
     primaryClass={physics.comp-ph},
-    url={https://arxiv.org/abs/2407.09700}, 
+    url={https://arxiv.org/abs/2407.09700},
 }
 @misc{wu2024enhancing,
-    title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework}, 
+    title={Enhancing GPU-acceleration in the Python-based Simulations of Chemistry Framework},
     author={Xiaojie Wu and Qiming Sun and Zhichen Pu and Tianze Zheng and Wenzhi Ma and Wen Yan and Xia Yu and Zhengxiao Wu and Mian Huo and Xiang Li and Weiluo Ren and Sheng Gong and Yumin Zhang and Weihao Gao},
     year={2024},
     eprint={2404.09452},
     archivePrefix={arXiv},
     primaryClass={physics.comp-ph},
-    url={https://arxiv.org/abs/2404.09452}, 
+    url={https://arxiv.org/abs/2404.09452},
 }
 ```
4 changes: 2 additions & 2 deletions examples/00-h2o.py
@@ -36,12 +36,12 @@
     atom=atom,                         # water molecule
     basis='def2-tzvpp',                # basis set
     output='./pyscf.log',              # save log file
-    verbose=6                          # control the level of print info 
+    verbose=6                          # control the level of print info
 )
 
 mf_GPU = rks.RKS(                      # restricted Kohn-Sham DFT
     mol,                               # pyscf.gto.object
-    xc='b3lyp'                         # xc functionals, such as pbe0, wb97m-v, tpss, 
+    xc='b3lyp'                         # xc functionals, such as pbe0, wb97m-v, tpss,
 ).density_fit()                        # density fitting
 
 mf_GPU.grids.atom_grid = (99,590)      # (99,590) lebedev grids, (75,302) is often enough
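The hunk above only touches trailing whitespace in the comments. For orientation, a script built this way typically finishes by running the SCF; a minimal sketch of such a continuation (not the file's verbatim tail):

```python
# Sketch only; the actual file continues below the fold.
e_dft = mf_GPU.kernel()        # run the density-fitted DFT calculation on GPU
print(f'Total energy = {e_dft:.10f} Hartree')
```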
8 changes: 8 additions & 0 deletions gpu4pyscf/__config__.py
@@ -24,3 +24,11 @@
 mem_fraction = 0.9
 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
 
+# Check whether P2P data transfer is available
+_p2p_access = True
+if _num_devices > 1:
+    for src in range(_num_devices):
+        for dst in range(_num_devices):
+            if src != dst:
+                can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
+                _p2p_access &= can_access_peer
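The probe above only records whether every device pair supports peer-to-peer access; it does not enable it. For illustration, a sketch of how peer access could be switched on with CuPy's runtime bindings. `enable_peer_access` is a hypothetical helper, not part of this commit:

```python
import cupy

def enable_peer_access(num_devices):
    """Enable direct P2P copies between all visible GPU pairs (sketch)."""
    for src in range(num_devices):
        with cupy.cuda.Device(src):
            for dst in range(num_devices):
                if src != dst and cupy.cuda.runtime.deviceCanAccessPeer(src, dst):
                    try:
                        cupy.cuda.runtime.deviceEnablePeerAccess(dst)
                    except cupy.cuda.runtime.CUDARuntimeError:
                        pass  # peer access was already enabled for this pair
```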
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.1.0'
+__version__ = '1.2.0'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc
32 changes: 14 additions & 18 deletions gpu4pyscf/df/df.py
@@ -21,7 +21,7 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph
+from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -123,7 +123,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
         if key in self._rsh_df:
             rsh_df = self._rsh_df[key]
         else:
-            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
+            rsh_df = self._rsh_df[key] = self.copy().reset()
             logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
 
         return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
             yield buf2, buf.T
             if isinstance(cderi_sparse, np.ndarray):
                 cupy.cuda.Device().synchronize()
 
             if buf_prefetch is not None:
                 buf = buf_prefetch
 
     def reset(self, mol=None):
         '''Reset mol and clean up relevant attributes for scanner mode'''
         if mol is not None:
@@ -208,13 +208,14 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     npairs = len(intopt.cderi_row)
     log = logger.new_logger(mol, mol.verbose)
 
-    # if the matrix exceeds the limit, store CDERI in CPU memory
+    # TODO: better estimate of memory consumption for each device
+    # Available memory on Device 0.
     avail_mem = get_avail_mem()
 
     if use_gpu_memory:
-        # If GPU memory is not enough
-        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem
+        # CDERI will be equally distributed to the devices
+        # Other devices usually have more memory available than Device 0
+        # CDERI will use up to 40% of the available memory
+        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
 
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
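The new threshold credits the aggregate memory of all devices rather than Device 0 alone. A worked example with hypothetical sizes shows how the check flips the storage decision:

```python
# Hypothetical sizes, for illustration only
naux = 4000                      # auxiliary basis functions
npairs = 3_000_000               # significant AO pairs
cderi_bytes = naux * npairs * 8  # float64 CDERI: 96 GB
avail_mem = 40e9                 # free bytes reported on Device 0

# Old, single-device check: 96 GB < 16 GB -> False, spill CDERI to CPU memory
old_fits = cderi_bytes < 0.4 * avail_mem
# New check with 4 GPUs: 96 GB < 64 GB -> still False here, but a system
# half this size (48 GB) would now stay distributed across the GPUs
new_fits = cderi_bytes < 0.4 * avail_mem * 4
```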
@@ -244,9 +245,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         cd_low_f = cupy.array(cd_low, order='F', copy=False)
         cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
 
-    for gpu_id in range(_num_devices):
-        cupy.cuda.Device(gpu_id).synchronize()
-
+    cupy.cuda.get_current_stream().synchronize()
     futures = []
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
@@ -258,9 +257,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         for future in futures:
             future.result()
 
-    for device_id in range(_num_devices):
-        cupy.cuda.Device(device_id).synchronize()
-
     if not use_gpu_memory:
         cupy.cuda.Device().synchronize()
 
@@ -344,14 +340,14 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
         # if CDERI is saved on CPU
         ij0 = pairs_loc[cp_ij_id]
         ij1 = pairs_loc[cp_ij_id+1]
-        if isinstance(_cderi, np.ndarray):
+        if isinstance(_cderi[0], np.ndarray):
             for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                 for i in range(p0,p1):
-                    cderi_block[i].get(out=_cderi[slice_id][i,ij0:ij1])
+                    cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
         else:
             # Copy data to other Devices
             for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-
+                #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
+                p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
     t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
     return
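`p2p_transfer` replaces the plain slice assignment so the copy still works when source and destination live on different GPUs. A minimal sketch of what such a helper could look like (the real implementation is in `gpu4pyscf.lib.cupy_helper`; this version assumes contiguous buffers):

```python
import cupy
from gpu4pyscf.__config__ import _p2p_access  # probed at import, see __config__.py

def p2p_transfer_sketch(dst, src):
    """Copy CuPy array src into dst, possibly across GPUs (sketch)."""
    if dst.device.id == src.device.id:
        dst[:] = src                           # same device: ordinary copy
    elif _p2p_access:
        # direct device-to-device copy; assumes contiguous buffers
        dst.data.copy_from_device(src.data, src.nbytes)
    else:
        dst[:] = cupy.asarray(src.get())       # stage through host memory
    return dst
```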
23 changes: 11 additions & 12 deletions gpu4pyscf/df/df_jk.py
@@ -47,7 +47,7 @@ def build_df():
         if key in mf.with_df._rsh_df:
             rsh_df = mf.with_df._rsh_df[key]
         else:
-            rsh_df = mf.with_df._rsh_df[key] = copy.copy(mf.with_df).reset()
+            rsh_df = mf.with_df._rsh_df[key] = mf.with_df.copy().reset()
         rsh_df.build(omega=omega)
         return
 
@@ -101,7 +101,7 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
         mf.with_df = with_df
     elif getattr(mf.with_df, 'auxbasis', None) != auxbasis:
         #logger.warn(mf, 'DF might have been initialized twice.')
-        mf = copy.copy(mf)
+        mf = mf.copy()
         mf.with_df = with_df
     mf.only_dfj = only_dfj
     return mf
@@ -298,8 +298,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
                 rhok = rhok.reshape([-1,nao])
                 vk[i] += cupy.dot(rhok.T, rhok)
             rhok = None
-        cupy.cuda.get_current_stream().synchronize()
 
     if with_j:
         vj = cupy.zeros(dms_shape)
         vj[:,rows,cols] = vj_packed
@@ -390,13 +389,12 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
         else:
             dm_sparse *= 2
         dm_sparse[:, intopt.cderi_diag] *= .5
-
-        vj_sparse = cupy.zeros_like(dm_sparse)
 
     if with_k:
         vk = cupy.zeros_like(dms)
 
     nset = dms.shape[0]
+    if with_j:
+        vj_sparse = cupy.zeros_like(dm_sparse)
     blksize = dfobj.get_blksize()
     for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
         if with_j:
@@ -406,7 +404,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
             for k in range(nset):
                 rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
                 #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
-                vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao])) 
+                vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
     if with_j:
         vj = cupy.zeros(dms_shape)
         vj[:,rows,cols] = vj_sparse
@@ -445,6 +443,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     intopt = dfobj.intopt
     dms = intopt.sort_orbitals(dms, axis=[1,2])
 
+    cupy.cuda.get_current_stream().synchronize()
     if getattr(dms_tag, 'mo_coeff', None) is not None:
         mo_occ = dms_tag.mo_occ
         mo_coeff = dms_tag.mo_coeff
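The synchronize call added above closes a race: work queued on the main thread's stream must finish before the per-device worker threads start reading shared buffers. The dispatch pattern this commit standardizes on looks roughly like this (helper name assumed for illustration; the real code is inlined in `gpu4pyscf/df/df_jk.py`):

```python
from concurrent.futures import ThreadPoolExecutor
import cupy

def run_on_all_devices(task, num_devices, *args, **kwargs):
    # Flush work queued on the current stream before workers touch shared data
    cupy.cuda.get_current_stream().synchronize()
    futures = []
    with ThreadPoolExecutor(max_workers=num_devices) as executor:
        for device_id in range(num_devices):
            futures.append(executor.submit(task, *args, device_id=device_id, **kwargs))
    return [f.result() for f in futures]
```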
@@ -498,13 +497,13 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     vj = vk = None
     if with_j:
         vj = [future.result()[0] for future in futures]
-        vj = reduce_to_device(vj)
+        vj = reduce_to_device(vj, inplace=True)
         vj = intopt.unsort_orbitals(vj, axis=[1,2])
         vj = vj.reshape(out_shape)
 
     if with_k:
         vk = [future.result()[1] for future in futures]
-        vk = reduce_to_device(vk)
+        vk = reduce_to_device(vk, inplace=True)
         vk = intopt.unsort_orbitals(vk, axis=[1,2])
         vk = vk.reshape(out_shape)
 
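`reduce_to_device` sums the per-GPU partial J/K matrices onto a single device; the new `inplace=True` flag presumably reuses the first buffer as the accumulator instead of allocating a fresh one. A sketch under that assumption (the real helper is `gpu4pyscf.lib.cupy_helper.reduce_to_device`):

```python
import cupy

def reduce_to_device_sketch(arrays, inplace=False):
    """Sum CuPy arrays from several GPUs onto the device of arrays[0] (sketch)."""
    with arrays[0].device:
        out = arrays[0] if inplace else arrays[0].copy()
        for a in arrays[1:]:
            # stage through host memory; P2P copies could avoid this round-trip
            out += cupy.asarray(a.get())
        return out
```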
@@ -529,7 +528,7 @@ def _get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True,
         if key in dfobj._rsh_df:
             rsh_df = dfobj._rsh_df[key]
         else:
-            rsh_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset()
             logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
 
     with rsh_df.mol.with_range_coulomb(omega):
7 changes: 4 additions & 3 deletions gpu4pyscf/df/grad/jk.py
@@ -15,7 +15,7 @@
 
 from concurrent.futures import ThreadPoolExecutor
 import cupy
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, concatenate
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 
@@ -58,6 +58,7 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
     ''' Calculate rhoj and rhok on a multi-GPU system
     '''
     futures = []
+    cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
@@ -74,8 +75,8 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
 
     rhoj = rhok = None
     if with_j:
-        rhoj = cupy.concatenate(rhoj_total)
+        rhoj = concatenate(rhoj_total)
     if with_k:
-        rhok = cupy.concatenate(rhok_total)
+        rhok = concatenate(rhok_total)
 
     return rhoj, rhok
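Swapping `cupy.concatenate` for the library's own `concatenate` matters because the per-device chunks of rhoj/rhok may live on different GPUs. A sketch of a device-aware concatenate, assuming host staging as the fallback (the real helper is `gpu4pyscf.lib.cupy_helper.concatenate`):

```python
import cupy

def concatenate_sketch(arrays):
    """Concatenate CuPy arrays that may live on different GPUs (sketch)."""
    current = cupy.cuda.Device().id
    moved = [a if a.device.id == current else cupy.asarray(a.get())
             for a in arrays]
    return cupy.concatenate(moved)
```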
4 changes: 2 additions & 2 deletions gpu4pyscf/df/grad/rhf.py
@@ -71,7 +71,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
             with_df = mf_grad.base.with_df._rsh_df[key]
         else:
             dfobj = mf_grad.base.with_df
-            with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
 
     auxmol = with_df.auxmol
     if not hasattr(with_df, 'intopt') or with_df._cderi is None:
@@ -282,4 +282,4 @@ def extra_force(self, atom_id, envs):
         else:
             return 0
 
-Grad = Gradients
+Grad = Gradients
2 changes: 1 addition & 1 deletion gpu4pyscf/df/grad/uhf.py
@@ -51,7 +51,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             with_df = mf_grad.base.with_df._rsh_df[key]
         else:
             dfobj = mf_grad.base.with_df
-            with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            with_df = dfobj._rsh_df[key] = dfobj.copy().reset()
 
     auxmol = with_df.auxmol
     if not hasattr(with_df, 'intopt') or with_df._cderi is None:
(Diff truncated; the remaining 90 changed files are not shown.)