use cutensor 2.0 (#90)

* use cutensor 2.0 * indent & readme * remove comment * remove dftd3; bump up to 0.7.0
pyscf · Jan 30, 2024 · cbaa083 · cbaa083
1 parent e5215d2
commit cbaa083
Show file tree

Hide file tree

Showing 5 changed files with 85 additions and 96 deletions.
diff --git a/README.md b/README.md
@@ -17,9 +17,9 @@ Choose the proper package based on your CUDA environment.
 
 ```cuTensor``` is **highly recommended** for accelerating tensor contractions.
 
-For **CUDA 11.x**, ```python -m cupyx.tools.install_library --cuda 11.x --library cutensor```
+For **CUDA 11.x**, ```pip3 install cutensor-cu11```
 
-For **CUDA 12.x**, ```python -m cupyx.tools.install_library --cuda 12.x --library cutensor```
+For **CUDA 12.x**, ```pip3 install cutensor-cu12```
 
 Compilation
 --------

diff --git a/benchmarks/cupy_helper/benchmark_cutensor.py b/benchmarks/cupy_helper/benchmark_cutensor.py
@@ -0,0 +1,33 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+import cupy
+from cupyx import profiler
+from gpu4pyscf.lib.cutensor import contract
+
+print('benchmarking tensor contraction')
+a = cupy.random.random([512,512,512])
+b = cupy.random.random([512,512])
+perf = profiler.benchmark(contract, ('ijk,lk->ijl', a, b), n_repeat=20, n_warmup=3)
+flops = 2*np.prod(a.shape) * b.shape[0]
+print(flops/perf.gpu_times.mean()/1024**3, 'GFLOPS')
+
+print('benchmarking tensor contraction with stride')
+a0 = a[64:480,:,64:480]
+b0 = b[:,64:480]
+perf = profiler.benchmark(contract, ('ijk,lk->ijl', a0, b0), n_repeat=20, n_warmup=3)
+flops = 2*np.prod(a0.shape) * b0.shape[0]
+print(flops/perf.gpu_times.mean()/1024**3, 'GFLOPS')
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
@@ -1,6 +1,6 @@
 from . import lib, grad, hessian, solvent, scf, dft
 
-__version__ = '0.6.17'
+__version__ = '0.7.0'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc

diff --git a/gpu4pyscf/df/tests/test_jk.py b/gpu4pyscf/df/tests/test_jk.py
@@ -17,7 +17,6 @@
 import numpy as np
 import cupy
 import pyscf
-import dftd3.pyscf as disp
 from pyscf import lib, df
 from pyscf.geomopt.geometric_solver import optimize
 from gpu4pyscf import scf

diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
@@ -21,88 +21,47 @@
     import cupy_backends.cuda.libs.cutensor  # NOQA
     from cupyx import cutensor
     from cupy_backends.cuda.libs import cutensor as cutensor_backend
-    from cupy_backends.cuda.libs.cutensor import Handle
-
-    _handle = Handle()
-    _modes = {}
-    _contraction_descriptors = {}
-    _contraction_plans = {}
-    _contraction_finds = {}
-
-    cutensor_backend.init(_handle)
     CUTENSOR_ALGO_DEFAULT = cutensor_backend.ALGO_DEFAULT
+    _tensor_descriptors = {}
 except ImportError:
     cutensor = None
     CUTENSOR_ALGO_DEFAULT = None
 
-def _create_mode_with_cache(mode):
-    integer_mode = []
-    for x in mode:
-        if isinstance(x, int):
-            integer_mode.append(x)
-        elif isinstance(x, str):
-            integer_mode.append(ord(x))
-        else:
-            raise TypeError('Cannot create tensor mode: {}'.format(type(x)))
-    key = tuple(integer_mode)
-
-    if key in _modes:
-        mode = _modes[key]
-    else:
+def _auto_create_mode(array, mode):
+    if not isinstance(mode, cutensor.Mode):
         mode = cutensor.create_mode(*mode)
-        _modes[key] = mode
+    if array.ndim != mode.ndim:
+        raise ValueError(
+            'ndim mismatch: {} != {}'.format(array.ndim, mode.ndim))
     return mode
 
-def create_contraction_descriptor(handle,
-                                  a, desc_a, mode_a,
-                                  b, desc_b, mode_b,
-                                  c, desc_c, mode_c):
-    alignment_req_A = cutensor_backend.getAlignmentRequirement(handle, a.data.ptr, desc_a)
-    alignment_req_B = cutensor_backend.getAlignmentRequirement(handle, b.data.ptr, desc_b)
-    alignment_req_C = cutensor_backend.getAlignmentRequirement(handle, c.data.ptr, desc_c)
-
-    key = (handle.ptr, cutensor_backend.COMPUTE_64F,
-           desc_a.ptr, mode_a.data, alignment_req_A,
-           desc_b.ptr, mode_b.data, alignment_req_B,
-           desc_c.ptr, mode_c.data, alignment_req_C)
-
-    if key in _contraction_descriptors:
-        desc = _contraction_descriptors[key]
-        return desc
-
-    desc = cutensor_backend.ContractionDescriptor()
-    cutensor_backend.initContractionDescriptor(
-        handle,
-        desc,
-        desc_a, mode_a.data, alignment_req_A,
-        desc_b, mode_b.data, alignment_req_B,
-        desc_c, mode_c.data, alignment_req_C,
-        desc_c, mode_c.data, alignment_req_C,
-        cutensor_backend.COMPUTE_64F)
-    _contraction_descriptors[key] = desc
-    return desc
-
-def create_contraction_find(handle, algo=CUTENSOR_ALGO_DEFAULT):
-    key = (handle.ptr, algo)
-    if key in _contraction_finds:
-        find = _contraction_finds[key]
-    else:
-        find = cutensor_backend.ContractionFind()
-        cutensor_backend.initContractionFind(handle, find, algo)
-        _contraction_finds[key] = find
-    return find
-
-def create_contraction_plan(handle, desc, find, ws_size):
-    key = (handle.ptr, desc.ptr, find.ptr, ws_size)
-    if key in _contraction_plans:
-        plan = _contraction_plans[key]
-    else:
-        plan = cutensor_backend.ContractionPlan()
-        cutensor_backend.initContractionPlan(handle, plan, desc, find, ws_size)
-        _contraction_plans[key] = plan
-    return plan
+def _create_tensor_descriptor(a):
+    handle = cutensor._get_handle()
+    key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides))
+    # hard coded
+    alignment_req = 8
+    if key not in _tensor_descriptors:
+        num_modes = a.ndim
+        extent = np.array(a.shape, dtype=np.int64)
+        stride = np.array(a.strides, dtype=np.int64) // a.itemsize
+        cutensor_dtype = cutensor._get_cutensor_dtype(a.dtype)
+        _tensor_descriptors[key] = cutensor.TensorDescriptor(
+            handle.ptr, num_modes, extent.ctypes.data, stride.ctypes.data,
+            cutensor_dtype, alignment_req=alignment_req)
+    return _tensor_descriptors[key]
+
+def contraction(
+    pattern, a, b, alpha, beta,
+    out=None,
+    op_a=cutensor_backend.OP_IDENTITY,
+    op_b=cutensor_backend.OP_IDENTITY,
+    op_c=cutensor_backend.OP_IDENTITY,
+    algo=cutensor_backend.ALGO_DEFAULT,
+    jit_mode=cutensor_backend.JIT_MODE_NONE,
+    compute_desc=0,
+    ws_pref=cutensor_backend.WORKSPACE_RECOMMENDED
+):
 
-def contraction(pattern, a, b, alpha, beta, out=None):
     pattern = pattern.replace(" ", "")
     str_a, rest = pattern.split(',')
     str_b, str_c = rest.split('->')
@@ -119,29 +78,27 @@ def contraction(pattern, a, b, alpha, beta, out=None):
     else:
         c = cupy.empty([shape[k] for k in str_c], order='C')
 
-    desc_a = cutensor.create_tensor_descriptor(a)
-    desc_b = cutensor.create_tensor_descriptor(b)
-    desc_c = cutensor.create_tensor_descriptor(c)
-
-    mode_a = _create_mode_with_cache(mode_a)
-    mode_b = _create_mode_with_cache(mode_b)
-    mode_c = _create_mode_with_cache(mode_c)
-
+    desc_a = _create_tensor_descriptor(a)
+    desc_b = _create_tensor_descriptor(b)
+    desc_c = _create_tensor_descriptor(c)
+
+    mode_a = _auto_create_mode(a, mode_a)
+    mode_b = _auto_create_mode(b, mode_b)
+    mode_c = _auto_create_mode(c, mode_c)
+    operator = cutensor.create_contraction(
+        desc_a, mode_a, op_a, desc_b, mode_b, op_b, desc_c, mode_c, op_c,
+        compute_desc)
+    plan_pref = cutensor.create_plan_preference(algo=algo, jit_mode=jit_mode)
+    ws_size = cutensor_backend.estimateWorkspaceSize(
+        cutensor._get_handle().ptr, operator.ptr, plan_pref.ptr, ws_pref)
+    plan = cutensor.create_plan(operator, plan_pref, ws_limit=ws_size)
+    ws = cupy.empty(ws_size, dtype=np.int8)
     out = c
-    desc = create_contraction_descriptor(_handle, a, desc_a, mode_a, b, desc_b, mode_b, c, desc_c, mode_c)
-    find = create_contraction_find(_handle)
-    ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_RECOMMENDED)
-    try:
-        ws = cupy.empty(ws_size, dtype=np.int8)
-    except Exception:
-        ws_size = cutensor_backend.contractionGetWorkspaceSize(_handle, desc, find, cutensor_backend.WORKSPACE_MIN)
-        ws = cupy.empty(ws_size, dtype=np.int8)
-
-    plan = create_contraction_plan(_handle, desc, find, ws_size)
+
     alpha = np.asarray(alpha)
     beta = np.asarray(beta)
 
-    cutensor_backend.contraction(_handle, plan,
+    cutensor_backend.contract(cutensor._get_handle().ptr, plan.ptr,
                              alpha.ctypes.data, a.data.ptr, b.data.ptr,
                              beta.ctypes.data, c.data.ptr, out.data.ptr,
                              ws.data.ptr, ws_size)