Benchmark in nightly build (#295)

* refactor hessian class * fixed bug in df.hessian.uhf * update license * format code * support h function in hessian.jk * unit test * optimize df hessian memory usage * more accurate memory estimate for hessian * _gen_jk -> _get_jk_ip * with_j and with_k for hessian * memory estimate * tested on 095 molecule * improve make_h1 in df.hessian * bugfix * use sorted_mol * update nightly build * assert hermi==1 * typo in uhf.hessian * inject gen_response into soscf * update tests for nightly build * disable benchmark for ci * install pytest-benchmark * change the file names of benchmark tests * disable benchmark for ci * test dir * save changes * add copy_array * assert chunk_shape * improve hcore derivatives * cupy copy -> copy_array * optimize multi-GPU * bugfix for single gpu * update benchmark script * np.isclose * bugfix * auxbasis_response * add benchmark results * split nightly benchmark * optimize df.hessian memory * small fixes * bugfix in df.hessian * bugfix * add benchmark data * remove comments * resolve comments * group_size in hessian * resolve possible memory leak * bugfix * bugfix
pyscf · Jan 7, 2025 · 49f2f56 · 49f2f56
1 parent e55a70e
commit 49f2f56
Show file tree

Hide file tree

Showing 35 changed files with 2,910 additions and 774 deletions.
diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux, X64, v100]
 
     steps:
     - uses: actions/checkout@v3
@@ -23,6 +23,7 @@ jobs:
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pytest-benchmark
         pip3 install pyscf --upgrade
         pip3 install numpy --upgrade
         pip3 install scipy --upgrade
@@ -35,8 +36,13 @@ jobs:
         export PATH=${CUDA_HOME}/bin:${PATH}
         export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
         sh build.sh
-    - name: Smoke Test
+    - name: Test RKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest --durations=0
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test UKS
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
@@ -21,6 +21,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -38,7 +39,7 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
@@ -48,6 +49,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -65,4 +67,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@
 **/build
 **/launch_logs
 **/deps
+**/.benchmarks
 core
 **tmp*
 *.egg-info/

diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -0,0 +1,141 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import cupy as cp
+from cupyx import profiler
+from gpu4pyscf.lib.cupy_helper import copy_array
+
+'''
+Benchmark different ways of transferring data between pinned host memory and devices
+'''
+
+# Host array backed by pinned (page-locked) memory
+host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8)
+big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array)
+big_host_data = big_host_data.reshape(512,512,512)
+# Assign instead of accumulating: nothing has written to the pinned buffer yet,
+# so "+=" would add the random numbers onto whatever the allocation contains.
+big_host_data[:] = np.random.rand(512,512,512)
+
+# Device array
+big_device_data = cp.empty_like(big_host_data)
+
+# Create views on both arrays
+host_view = big_host_data[:, 128:]  # Non-contiguous view on the host
+device_view = big_device_data[:, 128:]  # Non-contiguous view on the device
+
+print("Host View Shape:", host_view.shape)
+print("Device View Shape:", device_view.shape)
+
+# copy_array is called as copy_array(source, destination); here the source is
+# the pinned host view and the destination is the device view.
+print("------ Benchmark host to device transfer ----------")
+size = host_view.nbytes
+perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    # Reference implementation: upload the host array with cp.asarray and
+    # assign it into the (non-contiguous) device view.
+    out[:] = cp.asarray(c)
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+# Same comparison in the opposite direction: device view is the source and
+# the pinned host view is the destination.
+print("------- Benchmark device to host transfer ---------")
+size = host_view.nbytes
+perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    # Reference implementation: download with ndarray.get() and assign the
+    # result into the host view.
+    out[:] = c.get()
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (non-contiguous) ---------")
+
+# Source array is created on device 0 and the destination on device 1, so this
+# section needs at least two devices. Both views are sliced along the second axis.
+with cp.cuda.Device(0):
+    a = cp.random.rand(512, 512, 512)
+    device0_view = a[:, 128:]
+with cp.cuda.Device(1):
+    b = cp.random.rand(512, 512, 512)
+    device1_view = b[:, 128:]
+
+perf_custom = profiler.benchmark(
+    copy_array, (device0_view, device1_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+# After the timed copies the destination view must match the source view.
+view_diff = device0_view.get() - device1_view.get()
+assert np.linalg.norm(view_diff) < 1e-10
+
+def cupy_copy(c, out):
+    # Reference implementation: stage the data through the host and upload it
+    # while the destination's device is the current device.
+    with cp.cuda.Device(out.device):
+        staged = c.get()
+        out[:] = cp.asarray(staged)
+    return out
+
+perf_cupy = profiler.benchmark(
+    cupy_copy, (device0_view, device1_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (contiguous) ---------")
+# The full arrays a and b are transferred in this section, so the effective
+# bandwidth is based on a.nbytes, not on the size of the smaller strided view.
+perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy_contiguous(a, b):
+    # Reference implementation: plain slice assignment between the two arrays.
+    b[:] = a
+perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9
+print('Cupy copy contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_asarray_contiguous(a, b):
+    # NOTE: the result is bound to the local name b only; the caller's array
+    # is not written. b is used here just to select the target device.
+    with cp.cuda.Device(b.device):
+        b = cp.asarray(a) 
+perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a.nbytes / t_kernel / 1e9
+print('Cupy set contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+# The copies timed above must have left b identical to a.
+assert np.linalg.norm(a.get() - b.get()) < 1e-10
+
+
+print('----------- Benchmark reduction across devices ------ ')
+from gpu4pyscf.lib.cupy_helper import reduce_to_device
+_num_devices = cp.cuda.runtime.getDeviceCount()
+# One random array per visible device; the list is the input of the reduction.
+a_dist = []
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a = cp.random.rand(512,512,512)
+        a_dist.append(a)
+
+perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+# Bandwidth counts the bytes of every per-device array taking part in the reduction.
+bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9
+print('Using reduce_to_device', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
@@ -27,7 +27,6 @@
 parser.add_argument("--solvent",      type=str,  default='')
 args = parser.parse_args()
 
-lib.num_threads(16)
 start_time = time.time()
 bas = args.basis
 mol = pyscf.M(

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -20,7 +20,8 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
+from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, 
+                                       cart2sph, p2p_transfer, copy_array)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -142,8 +143,7 @@ def get_blksize(self, extra=0, nao=None):
         log = logger.new_logger(self.mol, self.mol.verbose)
         device_id = cupy.cuda.Device().id
         log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}")
-        if blksize < ALIGNED:
-            raise RuntimeError("Not enough GPU memory")
+        assert blksize > 0
         return blksize
 
     def loop(self, blksize=None, unpack=True):
@@ -226,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         log.debug("Saving CDERI on CPU")
 
     _cderi = {}
-    blksize = (naux + _num_devices - 1) // _num_devices
-    for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
+    aux_blksize = (naux + _num_devices - 1) // _num_devices
+    aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
+    for device_id in range(_num_devices):
+        p0 = min(aux_blksize*device_id, naux)
+        p1 = min(aux_blksize*(device_id+1), naux)
+        #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
         if use_gpu_memory:
             with cupy.cuda.Device(device_id), _streams[device_id]:
                 _cderi[device_id] = cupy.empty([p1-p0, npairs])
-            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}")
+            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
         else:
             mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8)
             cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem)
@@ -253,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             task_list = task_list_per_device[device_id]
-            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
+            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
             futures.append(future)
 
@@ -265,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     return _cderi
 
-def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
+def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, 
+                omega=None, sr_only=False, device_id=0):
     ''' Execute CDERI tasks on one device
     '''
     nq = len(intopt.log_qs)
@@ -274,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
     naoaux = cd_low.shape[0]
     npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
     pairs_loc = np.append(0, np.cumsum(npairs))
-    blksize = (naux + _num_devices - 1) // _num_devices
     with cupy.cuda.Device(device_id), _streams[device_id]:
         assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
@@ -345,13 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             ij0 = pairs_loc[cp_ij_id]
             ij1 = pairs_loc[cp_ij_id+1]
             if isinstance(_cderi[0], np.ndarray):
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    for i in range(p0,p1):
-                        cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
+            elif _num_devices > 1:
+                # Multi-GPU case, copy data to other Devices
+                for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
+                    # Making a copy for contiguous data transfer
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    with cupy.cuda.Device(dev_id):
+                        tmp = copy_array(tmp)
+                        _cderi[dev_id][:,ij0:ij1] = tmp
             else:
-                # Copy data to other Devices
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
-            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
+                _cderi[0][:,ij0:ij1] = cderi_block
+            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return