Skip to content

Commit

Permalink
Benchmark in nightly build (#295)
Browse files Browse the repository at this point in the history
* refactor hessian class

* fixed bug in df.hessian.uhf

* update license

* format code

* support h function in hessian.jk

* unit test

* optimize df hessian memory usage

* more accurate memory estimate for hessian

* _gen_jk -> _get_jk_ip

* with_j and with_k for hessian

* memory estimate

* tested on 095 molecule

* improve make_h1 in df.hessian

* bugfix

* use sorted_mol

* update nightly build

* assert hermi==1

* typo in uhf.hessian

* inject gen_response into soscf

* update tests for nightly build

* disable benchmark for ci

* install pytest-benchmark

* change the file names of benchmark tests

* disable benchmark for ci

* test dir

* save changes

* add copy_array

* assert chunk_shape

* improve hcore derivatives

* cupy copy -> copy_array

* optimize multi-GPU

* bugfix for single gpu

* update benchmark script

* np.isclose

* bugfix

* auxbasis_response

* add benchmark results

* split nightly benchmark

* optimize df.hessian memory

* small fixes

* bugfix in df.hessian

* bugfix

* add benchmark data

* remove comments

* resolve comments

* group_size in hessian

* resolve possible memory leak

* bugfix

* bugfix
  • Loading branch information
wxj6000 authored Jan 7, 2025
1 parent e55a70e commit 49f2f56
Show file tree
Hide file tree
Showing 35 changed files with 2,910 additions and 774 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/nightly_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:
jobs:
build:

runs-on: self-hosted
runs-on: [self-hosted, Linux, X64, v100]

steps:
- uses: actions/checkout@v3
Expand All @@ -23,6 +23,7 @@ jobs:
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
pip3 install pytest-benchmark
pip3 install pyscf --upgrade
pip3 install numpy --upgrade
pip3 install scipy --upgrade
Expand All @@ -35,8 +36,13 @@ jobs:
export PATH=${CUDA_HOME}/bin:${PATH}
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
sh build.sh
- name: Smoke Test
- name: Test RKS
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest --durations=0
pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
- name: Test UKS
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
6 changes: 4 additions & 2 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
run: |
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install pytest-benchmark
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
pip3 install pyscf --upgrade
pip3 install git+https://github.com/pyscf/properties --upgrade
Expand All @@ -38,7 +39,7 @@ jobs:
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
multi-gpu:
runs-on: [self-hosted, Linux, X64, 2T4]
Expand All @@ -48,6 +49,7 @@ jobs:
run: |
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip
pip3 install pytest-benchmark
pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
pip3 install pyscf --upgrade
pip3 install git+https://github.com/pyscf/properties --upgrade
Expand All @@ -65,4 +67,4 @@ jobs:
run: |
echo $GITHUB_WORKSPACE
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
**/build
**/launch_logs
**/deps
**/.benchmarks
core
**tmp*
*.egg-info/
Expand Down
141 changes: 141 additions & 0 deletions benchmarks/cupy_helper/benchmark_memory_copy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import cupy as cp
from cupyx import profiler
from gpu4pyscf.lib.cupy_helper import copy_array

'''
Benchmark different ways of transferring data from pinned memory to device
'''

# Host array, backed by a pinned-memory allocation of 512^3 float64 values.
host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8)
big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array)
big_host_data = big_host_data.reshape(512,512,512)
# The freshly allocated buffer is not initialized, so assign the random data
# instead of accumulating onto whatever bytes happen to be in the buffer
# (which could decode to NaN/Inf and poison later comparisons).
big_host_data[:] = np.random.rand(512,512,512)

# Device array with the same shape and dtype as the host array.
big_device_data = cp.empty_like(big_host_data)

# Create views on both arrays. Slicing the second axis makes both views
# non-contiguous, which is the case the transfer benchmarks exercise.
host_view = big_host_data[:, 128:]  # Non-contiguous view on the host
device_view = big_device_data[:, 128:]  # Non-contiguous view on the device

print("Host View Shape:", host_view.shape)
print("Device View Shape:", device_view.shape)

# copy_array(src, dst) copies its first argument into its second, so the call
# below moves data from the pinned host view into the device view.
print("------ Benchmark host to device transfer ----------")
size = host_view.nbytes
perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3)
t_kernel = perf_custom.gpu_times.mean()
bandwidth = size / t_kernel / 1e9
print('Using custom function', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

def cupy_copy(c, out):
    """Baseline host-to-device copy: upload ``c`` with CuPy and assign into ``out``."""
    out[:] = cp.asarray(c)
    return out
perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
bandwidth = size / t_kernel / 1e9
print('Using cupy function', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

# Reverse direction: the device view is the source, the host view the target.
print("------- Benchmark device to host transfer ---------")
size = host_view.nbytes
perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3)
t_kernel = perf_custom.gpu_times.mean()
bandwidth = size / t_kernel / 1e9
print('Using custom function', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

def cupy_copy(c, out):
    """Baseline device-to-host copy: download ``c`` with ``.get()`` and assign into ``out``."""
    out[:] = c.get()
    return out
perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
bandwidth = size / t_kernel / 1e9
print('Using cupy function', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

print("-------- Benchmark device to device transfer (non-contiguous) ---------")

# One source array on device 0 and one destination array on device 1.
# NOTE(review): device 1 is hard-coded, so this section presumably requires
# at least two visible GPUs - confirm before running on a single-GPU host.
with cp.cuda.Device(0):
    a = cp.random.rand(512, 512, 512)
with cp.cuda.Device(1):
    b = cp.random.rand(512, 512, 512)

# Slicing the second axis yields non-contiguous views of each array.
device0_view = a[:, 128:]
device1_view = b[:, 128:]

d2d_args = (device0_view, device1_view)
d2d_nbytes = device0_view.nbytes

# Custom helper: copy the device-0 view into the device-1 view.
perf_custom = profiler.benchmark(copy_array, d2d_args, n_repeat=20, n_warmup=3)
elapsed = perf_custom.gpu_times.mean()
print('Using custom function', elapsed)
print(f"Effective Bandwidth: {d2d_nbytes / elapsed / 1e9:.2f} GB/s")

# After the copy both views must hold the same values.
mismatch = device0_view.get() - device1_view.get()
assert np.linalg.norm(mismatch) < 1e-10

def cupy_copy(c, out):
    """Baseline copy that stages ``c`` through the host before writing ``out``."""
    with cp.cuda.Device(out.device):
        staged = cp.asarray(c.get())
        out[:] = staged
    return out

perf_baseline = profiler.benchmark(cupy_copy, d2d_args, n_repeat=20, n_warmup=3)
elapsed = perf_baseline.gpu_times.mean()
print('Using cupy function', elapsed)
print(f"Effective Bandwidth: {d2d_nbytes / elapsed / 1e9:.2f} GB/s")

print("-------- Benchmark device to device transfer (contiguous) ---------")
# These benchmarks move the full contiguous array ``a``, so the bandwidth must
# be computed from a.nbytes; the sliced view device0_view is smaller and would
# understate the achieved bandwidth.
perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
bandwidth = a.nbytes / t_kernel / 1e9
print('Using custom function', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

def cupy_copy_contiguous(a, b):
    """Baseline copy: assign the contiguous array ``a`` into ``b`` in place."""
    b[:] = a
perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
bandwidth = a.nbytes / t_kernel / 1e9
print('Cupy copy contiguous array', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

def cupy_asarray_contiguous(a, b):
    """Baseline: time ``cp.asarray(a)`` under the device context of ``b``.

    The result is bound to the local name ``b`` only; the caller's array is
    not modified by this function.
    """
    with cp.cuda.Device(b.device):
        b = cp.asarray(a)
perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
bandwidth = a.nbytes / t_kernel / 1e9
print('Cupy set contiguous array', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

# ``b`` was filled from ``a`` by the in-place copies above.
assert np.linalg.norm(a.get() - b.get()) < 1e-10


print('----------- Benchmark reduction across devices ------ ')
from gpu4pyscf.lib.cupy_helper import reduce_to_device
_num_devices = cp.cuda.runtime.getDeviceCount()

# Allocate one random 512^3 array on every visible device.
a_dist = []
for device_id in range(_num_devices):
    with cp.cuda.Device(device_id):
        a = cp.random.rand(512,512,512)
        a_dist.append(a)

perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
t_kernel = perf_cupy.gpu_times.mean()
# Bandwidth is reported against the total bytes held across all devices.
bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9
# Label the timing with the function actually measured in this section.
print('Using reduce_to_device', t_kernel)
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
1 change: 0 additions & 1 deletion examples/dft_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
parser.add_argument("--solvent", type=str, default='')
args = parser.parse_args()

lib.num_threads(16)
start_time = time.time()
bas = args.basis
mol = pyscf.M(
Expand Down
43 changes: 26 additions & 17 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
from cupyx.scipy.linalg import solve_triangular
from pyscf import lib
from pyscf.df import df, addons, incore
from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem,
cart2sph, p2p_transfer, copy_array)
from gpu4pyscf.df import int3c2e, df_jk
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
Expand Down Expand Up @@ -142,8 +143,7 @@ def get_blksize(self, extra=0, nao=None):
log = logger.new_logger(self.mol, self.mol.verbose)
device_id = cupy.cuda.Device().id
log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}")
if blksize < ALIGNED:
raise RuntimeError("Not enough GPU memory")
assert blksize > 0
return blksize

def loop(self, blksize=None, unpack=True):
Expand Down Expand Up @@ -226,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
log.debug("Saving CDERI on CPU")

_cderi = {}
blksize = (naux + _num_devices - 1) // _num_devices
for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
aux_blksize = (naux + _num_devices - 1) // _num_devices
aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
for device_id in range(_num_devices):
p0 = min(aux_blksize*device_id, naux)
p1 = min(aux_blksize*(device_id+1), naux)
#for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
if use_gpu_memory:
with cupy.cuda.Device(device_id), _streams[device_id]:
_cderi[device_id] = cupy.empty([p1-p0, npairs])
log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}")
log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
else:
mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8)
cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem)
Expand All @@ -253,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
with ThreadPoolExecutor(max_workers=_num_devices) as executor:
for device_id in range(_num_devices):
task_list = task_list_per_device[device_id]
future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
omega=omega, sr_only=sr_only, device_id=device_id)
futures.append(future)

Expand All @@ -265,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,

return _cderi

def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
omega=None, sr_only=False, device_id=0):
''' Execute CDERI tasks on one device
'''
nq = len(intopt.log_qs)
Expand All @@ -274,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
naoaux = cd_low.shape[0]
npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
pairs_loc = np.append(0, np.cumsum(npairs))
blksize = (naux + _num_devices - 1) // _num_devices
with cupy.cuda.Device(device_id), _streams[device_id]:
assert isinstance(mol.verbose, int)
log = logger.new_logger(mol, mol.verbose)
Expand Down Expand Up @@ -345,13 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
ij0 = pairs_loc[cp_ij_id]
ij1 = pairs_loc[cp_ij_id+1]
if isinstance(_cderi[0], np.ndarray):
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
for i in range(p0,p1):
cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
elif _num_devices > 1:
# Multi-GPU case, copy data to other Devices
for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
# Making a copy for contiguous data transfer
tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
with cupy.cuda.Device(dev_id):
tmp = copy_array(tmp)
_cderi[dev_id][:,ij0:ij1] = tmp
else:
# Copy data to other Devices
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
#_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
_cderi[0][:,ij0:ij1] = cderi_block
t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
return
Loading

0 comments on commit 49f2f56

Please sign in to comment.