
Commit

small fixes
wxj6000 committed Jan 5, 2025
1 parent e5ddd9a commit e85afa6
Showing 6 changed files with 45 additions and 68 deletions.
16 changes: 16 additions & 0 deletions benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -123,3 +123,19 @@ def cupy_asarray_contiguous(a, b):
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

assert np.linalg.norm(a.get() - b.get()) < 1e-10


+print('----------- Benchmark reduction across devices ------ ')
+from gpu4pyscf.lib.cupy_helper import reduce_to_device
+_num_devices = cp.cuda.runtime.getDeviceCount()
+a_dist = []
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a = cp.random.rand(512,512,512)
+    a_dist.append(a)
+
+perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9
+print('Reduce to device', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
4 changes: 2 additions & 2 deletions examples/dft_driver.py
@@ -34,10 +34,10 @@
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer
-mol.verbose = 6
+mol.verbose = 4

mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 6
+mf_df.verbose = 4

if args.solvent:
mf_df = mf_df.PCM()
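The per-step timer lines come from log.timer_debug1, which only prints at PySCF's DEBUG1 level, so verbose = 4 (INFO) silences them, as the comment above says. The mapping, as pyscf.lib.logger defines it:

    from pyscf.lib import logger

    # 0=QUIET, ..., 4=INFO, 5=DEBUG, 6=DEBUG1; timer_debug1 needs verbose >= 6
    mol.verbose = logger.INFO      # == 4, quiet timers
    # mol.verbose = logger.DEBUG1  # == 6, re-enables per-step timer output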
3 changes: 2 additions & 1 deletion gpu4pyscf/df/grad/rhf.py
@@ -151,7 +151,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
cart2sph = intopt.cart2sph
orbo_cart = cart2sph @ orbo
dm_cart = cart2sph @ dm @ cart2sph.T


+    with_df._cderi = None # release GPU memory
vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
with_j=with_j, with_k=with_k, omega=omega)
# NOTE: vj and vk are still in cartesian
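Clearing with_df._cderi before the vjk contraction drops the last reference to the cached three-center integrals, so CuPy's memory pool can recycle those blocks inside get_grad_vjk. A minimal sketch of the pattern; the free_all_blocks call is an optional extra step shown as an assumption, not something this commit does:

    import cupy as cp

    def release_cached_cderi(with_df):
        # Drop the last reference; the pool can now recycle the blocks.
        with_df._cderi = None
        # Optionally hand cached, unused blocks back to the CUDA driver.
        cp.get_default_memory_pool().free_all_blocks()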
54 changes: 1 addition & 53 deletions gpu4pyscf/df/grad/uhf.py
@@ -165,59 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
orbo_cart = orbo
dm = orbo = None

"""
vj = vk = rhoj_tmp = rhok_tmp = None
vjaux = vkaux = None
naux_cart = intopt._sorted_auxmol.nao
if with_j:
vj = cupy.zeros((3,nao_cart), order='C')
vjaux = cupy.zeros((3,naux_cart))
if with_k:
vk = cupy.zeros((3,nao_cart), order='C')
vkaux = cupy.zeros((3,naux_cart))
cupy.get_default_memory_pool().free_all_blocks()
t1 = log.init_timer()
for cp_kl_id in range(len(intopt.aux_log_qs)):
k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
assert k1-k0 <= block_size
if with_j:
rhoj_tmp = rhoj_cart[k0:k1]
if with_k:
rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
'''
if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
'''
'''
# outcore implementation
int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf)
size = 3*(k1-k0)*nao_cart*nao_cart
int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart)
vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1])
vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip)
int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf)
rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart)
vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1])
vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
'''
vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
if with_j: vj += vj_tmp
if with_k: vk += vk_tmp
vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
if with_j: vjaux[:, k0:k1] = vj_tmp
if with_k: vkaux[:, k0:k1] = vk_tmp
rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
"""
with_df._cderi = None # release GPU memory
vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
with_j=with_j, with_k=with_k, omega=omega)

7 changes: 3 additions & 4 deletions gpu4pyscf/df/hessian/rhf.py
@@ -121,7 +121,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
int2c = cupy.asarray(int2c, order='C')
int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
solve_j2c = _gen_metric_solver(int2c)

+    # int3c contributions
wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
rhoj0_P = rhok0_P__ = None
@@ -172,10 +172,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
mem_avail = get_avail_mem()
nocc = mocc.shape[1]
slice_size = naux*nocc*9 # largest slice of intermediate variables
-    blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED
+    blksize = int(mem_avail*0.2/8/slice_size)
     log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
-    if blksize < ALIGNED:
-        raise RuntimeError('Not enough memory for intermediate variables')
+    assert blksize > 0
if hessobj.auxbasis_response:
hk_ao_aux = cupy.zeros([nao,naux,3,3])
for i0, i1 in lib.prange(0,nao,blksize):
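The replacement budgets about 20% of the free device memory, at 8 bytes per float64, for intermediates of slice_size = naux*nocc*9 entries per AO row, and no longer rounds to ALIGNED multiples. Rough arithmetic under assumed sizes (mem_avail, naux, nocc are illustrative numbers only):

    GB = 1024**3
    mem_avail = 16 * GB               # free device memory, as get_avail_mem might report
    naux, nocc = 4000, 200
    slice_size = naux * nocc * 9      # float64 entries per AO in the block
    blksize = int(mem_avail * 0.2 / 8 / slice_size)
    assert blksize > 0                # 59 aux AOs per block for these numbers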
29 changes: 21 additions & 8 deletions gpu4pyscf/dft/numint.py
@@ -414,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
     ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

weights = cupy.empty([ngrids_local])
if xctype == 'LDA':
@@ -425,16 +427,18 @@
rho_tot = cupy.empty([nset,4,ngrids_local])
else:
rho_tot = cupy.empty([nset,5,ngrids_local])

p0 = p1 = 0
for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
max_memory=None,
grid_range=(grid_start, grid_end)):
p1 = p0 + weight.size
weights[p0:p1] = weight
for i in range(nset):
-            if mo_coeff is None:
-                rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
+            # If AO is sparse enough, use density matrix to calculate rho
+            if mo_coeff is None or len(idx) < mo_occ.sum():
+                dms_mask = dms[i][idx[:,None],idx]
+                rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
xctype=xctype, hermi=hermi, with_lapl=with_lapl)
else:
assert hermi == 1
@@ -443,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
None, xctype, with_lapl)
p0 = p1
t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
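The reasoning behind the new branch: evaluating rho from the density matrix costs on the order of len(idx)**2 work per grid point, while the occupied-MO path costs about len(idx)*nocc, so once the screened AO set is small the density-matrix route wins; mo_occ.sum() (the electron count) serves as the cutoff. A sketch of the decision with a hypothetical helper, not a function in the source:

    def pick_rho_path(n_ao_mask, mo_coeff, mo_occ):
        # DM path ~ n_ao_mask**2 per grid point; MO path ~ n_ao_mask * nocc.
        if mo_coeff is None or n_ao_mask < mo_occ.sum():
            return 'density-matrix'
        return 'occupied-mo'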

+    # libxc calls are still running on default stream
nelec = cupy.zeros(nset)
excsum = cupy.zeros(nset)
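The grid partitioning above, repeated in the UKS and fxc tasks below, rounds the per-device share up to a multiple of MIN_BLK_SIZE and clamps each range's end to the global grid count, so trailing devices never read past the grid. A standalone sketch; the min_blk_size value and the extra clamp on the start are assumptions:

    def partition_grids(ngrids_glob, num_devices, min_blk_size=128):
        # Even per-device share, rounded up to min_blk_size; the min()
        # clamp keeps the last devices inside the grid (possibly empty).
        per_dev = (ngrids_glob + num_devices - 1) // num_devices
        per_dev = (per_dev + min_blk_size - 1) // min_blk_size * min_blk_size
        return [(min(d * per_dev, ngrids_glob),
                 min((d + 1) * per_dev, ngrids_glob))
                for d in range(num_devices)]

    # partition_grids(1000, 4) -> [(0, 256), (256, 512), (512, 768), (768, 1000)]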
@@ -814,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
max_memory=None,
@@ -1016,8 +1023,11 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

p0 = p1 = grid_start
t1 = t0 = log.init_timer()
@@ -1165,8 +1175,11 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

p0 = p1 = grid_start
t1 = t0 = log.init_timer()
