refactor solvent models (#139)

* refactor solvent models * flake * resolve conflict * fixed bugs due to refactor * fixed typos * fixed one more type * add more unit test for SMD model * more unit test for SMD gradient and Hessian * added to_cpu * to_cpu && to_gpu for solvent * switch to legacy SMD model * pricise unit test for solvent models * remove .mod * address comments in PySCF * bug fix * move python SMD code to experiments * move SMD functionality to SMD module from pcm module
pyscf · May 13, 2024 · 9c1feba · 9c1feba
1 parent ab7085e
commit 9c1feba
Show file tree

Hide file tree

Showing 31 changed files with 4,730 additions and 896 deletions.
diff --git a/.flake8 b/.flake8
@@ -15,7 +15,7 @@ ignore =
         E701, E731, E741, E275,
         F401, C901, W391, W503, W504, W291, W292, W293
 
-exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg, .coveragerc
+exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg, .coveragerc, mnsol.F
 max-line-length = 160
 
 per-file-ignores =

diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,7 @@ output/
 *.swp
 *~
 *.log
+*.mod
 *.h5
 **/qchem_input.in
 .coverage

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -45,8 +45,8 @@ adding features to the library for PySCF functions running on GPU devices.
 * While examples or documentation are not mandatory, it is highly recommended to
   include examples of how to invoke the new module.
 
-* CUDA compute capability 70 (sm_70) is required. Please avoid using features
-  that are only available on CUDA compute capability 80 or newer. The CUDA code
+* CUDA compute capability 60 (sm_60) is required. Please avoid using features
+  that are only available on CUDA compute capability 70 or newer. The CUDA code
   should be compiled and run using CUDA 11 and CUDA 12 toolkits.
 
 Thank you for your contributions!
diff --git a/benchmarks/cupy_helper/benchmark_cart2sph.py b/benchmarks/cupy_helper/benchmark_cart2sph.py
@@ -49,4 +49,26 @@
 t_cutensor = perf_cutensor.gpu_times.mean()
 print('kernel:', t_kernel)
 print('cutensor:', t_cutensor)
-print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
+print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
+
+print('benchmarking cart2sph when ang=5')
+a = cupy.random.random([512,21*128,512])
+b = cupy.random.random([512,11*128,512])
+perf_kernel = profiler.benchmark(cart2sph, (a,1,5,b), n_repeat=20, n_warmup=3)
+perf_cutensor = profiler.benchmark(cart2sph_cutensor, (a,1,5), n_repeat=20, n_warmup=3)
+t_kernel = perf_kernel.gpu_times.mean()
+t_cutensor = perf_cutensor.gpu_times.mean()
+print('kernel:', t_kernel)
+print('cutensor:', t_cutensor)
+print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
+
+print('benchmarking cart2sph when ang=6')
+a = cupy.random.random([512,28*128,512])
+b = cupy.random.random([512,13*128,512])
+perf_kernel = profiler.benchmark(cart2sph, (a,1,6,b), n_repeat=20, n_warmup=3)
+perf_cutensor = profiler.benchmark(cart2sph_cutensor, (a,1,6), n_repeat=20, n_warmup=3)
+t_kernel = perf_kernel.gpu_times.mean()
+t_cutensor = perf_cutensor.gpu_times.mean()
+print('kernel:', t_kernel)
+print('cutensor:', t_cutensor)
+print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
@@ -16,10 +16,9 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 cmake_minimum_required (VERSION 3.19 FATAL_ERROR) # 3.19 is required by cutlass
-project (gpu4pyscf C CXX CUDA)
+project (gpu4pyscf C CXX CUDA Fortran)
 
 set(CMAKE_C_STANDARD "99")
-
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 

diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
@@ -395,7 +395,7 @@ def hermi_triu(mat, hermi=1, inplace=True):
 
 def cart2sph_cutensor(t, axis=0, ang=1, out=None):
     '''
-    transform 'axis' of a tensor from cartesian basis into spherical basis
+    transform 'axis' of a tensor from cartesian basis into spherical basis with cutensor
     '''
     if(ang <= 1):
         if(out is not None): out[:] = t

diff --git a/gpu4pyscf/lib/cupy_helper/cart2sph.cu b/gpu4pyscf/lib/cupy_helper/cart2sph.cu
@@ -111,6 +111,166 @@ static void _cart2sph_ang4(double *cart, double *sph, int stride, int count){
     sph[sph_offset+8*stride] = 0.625835735449176134 * (g0 + g10) - 3.755014412695056800 * g3;
 }
 
+__global__
+static void _cart2sph_ang5(double *cart, double *sph, int stride, int count){
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= count){
+        return;
+    }
+    int i = idx / stride;
+    int j = idx % stride;
+    int sph_offset = 11 * stride * i + j;
+    int cart_offset = 21 * stride * i + j;
+    double g0 = cart[cart_offset+0*stride];
+    double g1 = cart[cart_offset+1*stride];
+    double g2 = cart[cart_offset+2*stride];
+    double g3 = cart[cart_offset+3*stride];
+    double g4 = cart[cart_offset+4*stride];
+    double g5 = cart[cart_offset+5*stride];
+    double g6 = cart[cart_offset+6*stride];
+    double g7 = cart[cart_offset+7*stride];
+    double g8 = cart[cart_offset+8*stride];
+    double g9 = cart[cart_offset+9*stride];
+    double g10 = cart[cart_offset+10*stride];
+    double g11 = cart[cart_offset+11*stride];
+    double g12 = cart[cart_offset+12*stride];
+    double g13 = cart[cart_offset+13*stride];
+    double g14 = cart[cart_offset+14*stride];
+    double g15 = cart[cart_offset+15*stride];
+    double g16 = cart[cart_offset+16*stride];
+    double g17 = cart[cart_offset+17*stride];
+    double g18 = cart[cart_offset+18*stride];
+    double g19 = cart[cart_offset+19*stride];
+    double g20 = cart[cart_offset+20*stride];
+    sph[sph_offset+0*stride] = 3.2819102842008507 * (g1 - 2.0 * g6) + 0.6563820568401701 * g15;
+    sph[sph_offset+1*stride] = 8.3026492595241645 * (g4 - g11);
+    sph[sph_offset+2*stride] = -1.4677148983057511 * g1 + 11.7417191864460086 * g8 + 0.4892382994352504 * (g15 - 2.0 * g6) + -3.9139063954820030 * g17;
+    sph[sph_offset+3*stride] = -4.7935367849733241 * (g4 + g11) + 9.5870735699466483 * g13;
+    sph[sph_offset+4*stride] = 0.4529466511956969 * (g1 + g15 + 2.0 * g6) + -5.4353598143483630 * (g8 + g17) + 3.6235732095655755 * g19;
+    sph[sph_offset+5*stride] = 1.7542548368013540 * (g2 + g16) + 3.5085096736027079 * g7 + -4.6780128981369442 * (g9 + g18) + 0.9356025796273888 * g20;
+    sph[sph_offset+6*stride] = 0.4529466511956969 * (g0 + g10 + 2.0 * g3) + -5.4353598143483630 * (g5 + g12) + 3.6235732095655755 * g14;
+    sph[sph_offset+7*stride] = -2.3967683924866621 * (g2 - g16) + 4.7935367849733241 * (g9 - g18);
+    sph[sph_offset+8*stride] = -0.4892382994352504 * (g0 - 2.0 * g3) + 3.9139063954820030 * g5 + 1.4677148983057511 * g10 + -11.7417191864460086 * g12;
+    sph[sph_offset+9*stride] = 2.0756623148810411 * (g2 + g16) + -12.4539738892862477 * g7;
+    sph[sph_offset+10*stride] = 0.6563820568401701 * g0 + 3.2819102842008507 * (g10 - 2.0*g3);
+}
+
+__global__
+static void _cart2sph_ang6(double *cart, double *sph, int stride, int count){
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= count){
+        return;
+    }
+    int i = idx / stride;
+    int j = idx % stride;
+    int sph_offset = 13 * stride * i + j;
+    int cart_offset = 28 * stride * i + j;
+    double g0 = cart[cart_offset+0*stride];
+    double g1 = cart[cart_offset+1*stride];
+    double g2 = cart[cart_offset+2*stride];
+    double g3 = cart[cart_offset+3*stride];
+    double g4 = cart[cart_offset+4*stride];
+    double g5 = cart[cart_offset+5*stride];
+    double g6 = cart[cart_offset+6*stride];
+    double g7 = cart[cart_offset+7*stride];
+    double g8 = cart[cart_offset+8*stride];
+    double g9 = cart[cart_offset+9*stride];
+    double g10 = cart[cart_offset+10*stride];
+    double g11 = cart[cart_offset+11*stride];
+    double g12 = cart[cart_offset+12*stride];
+    double g13 = cart[cart_offset+13*stride];
+    double g14 = cart[cart_offset+14*stride];
+    double g15 = cart[cart_offset+15*stride];
+    double g16 = cart[cart_offset+16*stride];
+    double g17 = cart[cart_offset+17*stride];
+    double g18 = cart[cart_offset+18*stride];
+    double g19 = cart[cart_offset+19*stride];
+    double g20 = cart[cart_offset+20*stride];
+    double g21 = cart[cart_offset+21*stride];
+    double g22 = cart[cart_offset+22*stride];
+    double g23 = cart[cart_offset+23*stride];
+    double g24 = cart[cart_offset+24*stride];
+    double g25 = cart[cart_offset+25*stride];
+    double g26 = cart[cart_offset+26*stride];
+    double g27 = cart[cart_offset+27*stride];
+    sph[sph_offset+0*stride] = 4.0991046311514863 * (g1 + g15) + -13.6636821038382887 * g6;
+    sph[sph_offset+1*stride] = 11.8330958111587634 * g4 + -23.6661916223175268 * g11 + 2.3666191622317525 * g22;
+    sph[sph_offset+2*stride] = -2.0182596029148963 * (g1 - g15) + 20.1825960291489679 * (g8 - g17);
+    sph[sph_offset+3*stride] = -8.2908473356343109 * g4 + -5.5272315570895412 * g11 + 22.1089262283581647 * g13 + 2.7636157785447706 * g22 + -7.3696420761193888 * g24;
+    sph[sph_offset+4*stride] = 0.9212052595149236 * (g1 + g15 + 2.0 * g6) + -14.7392841522387776 * (g8 + g17 - g19);
+    sph[sph_offset+5*stride] = 2.9131068125936568 * (g4 + g22) + 5.8262136251873136 * g11 + -11.6524272503746271 * (g13 + g24) + 4.6609709001498505 * g26;
+    sph[sph_offset+6*stride] = -0.3178460113381421 * (g0 + g21 + 3.0*g3 + 3.0*g10) + 5.7212282040865583 * (g5 + g23) + 11.4424564081731166 * g12 + -7.6283042721154111 * (g14 + g25) + 1.0171072362820548 * g27;
+    sph[sph_offset+7*stride] = 2.9131068125936568 * (g2 + g16) + 5.8262136251873136 * g7 + -11.6524272503746271 * (g9 + g18) + 4.6609709001498505 * g20;
+    sph[sph_offset+8*stride] = 0.4606026297574618 * (g0 - g10) + 0.4606026297574618 * (g3 - g21) + -7.3696420761193888 * (g5 - g14 - g23 + g25);
+    sph[sph_offset+9*stride] = -2.7636157785447706 * (g2 - 2.0 * g7) + 7.3696420761193888 * g9 + 8.2908473356343109 * g16 + -22.1089262283581647 * g18;
+    sph[sph_offset+10*stride] = -0.5045649007287241 * (g0 + g21) + 2.5228245036436201 * (g3 + g10) + 5.0456490072872420 * (g5 + g23) + -30.2738940437234518 * g12;
+    sph[sph_offset+11*stride] = 2.3666191622317525 * g2 + 11.8330958111587634 * (g16 - 2.0 * g7);
+    sph[sph_offset+12*stride] = 0.6831841051919144 * (g0 - g21) + -10.2477615778787161 * (g3 - g10);
+}
+
+__global__
+static void _cart2sph_ang7(double *cart, double *sph, int stride, int count){
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= count){
+        return;
+    }
+    int i = idx / stride;
+    int j = idx % stride;
+    int sph_offset = 15 * stride * i + j;
+    int cart_offset =36 * stride * i + j;
+    double g0 = cart[cart_offset+0*stride];
+    double g1 = cart[cart_offset+1*stride];
+    double g2 = cart[cart_offset+2*stride];
+    double g3 = cart[cart_offset+3*stride];
+    double g4 = cart[cart_offset+4*stride];
+    double g5 = cart[cart_offset+5*stride];
+    double g6 = cart[cart_offset+6*stride];
+    double g7 = cart[cart_offset+7*stride];
+    double g8 = cart[cart_offset+8*stride];
+    double g9 = cart[cart_offset+9*stride];
+    double g10 = cart[cart_offset+10*stride];
+    double g11 = cart[cart_offset+11*stride];
+    double g12 = cart[cart_offset+12*stride];
+    double g13 = cart[cart_offset+13*stride];
+    double g14 = cart[cart_offset+14*stride];
+    double g15 = cart[cart_offset+15*stride];
+    double g16 = cart[cart_offset+16*stride];
+    double g17 = cart[cart_offset+17*stride];
+    double g18 = cart[cart_offset+18*stride];
+    double g19 = cart[cart_offset+19*stride];
+    double g20 = cart[cart_offset+20*stride];
+    double g21 = cart[cart_offset+21*stride];
+    double g22 = cart[cart_offset+22*stride];
+    double g23 = cart[cart_offset+23*stride];
+    double g24 = cart[cart_offset+24*stride];
+    double g25 = cart[cart_offset+25*stride];
+    double g26 = cart[cart_offset+26*stride];
+    double g27 = cart[cart_offset+27*stride];
+    double g28 = cart[cart_offset+28*stride];
+    double g29 = cart[cart_offset+29*stride];
+    double g30 = cart[cart_offset+30*stride];
+    double g31 = cart[cart_offset+31*stride];
+    double g32 = cart[cart_offset+32*stride];
+    double g33 = cart[cart_offset+33*stride];
+    double g34 = cart[cart_offset+34*stride];
+    double g35 = cart[cart_offset+35*stride];
+    sph[sph_offset+0*stride] = 4.9501391276721742 * g1 + -24.7506956383608703 * g6 + 14.8504173830165218 * g15 + -0.7071627325245963 * g28;
+    sph[sph_offset+1*stride] = 15.8757639708114002 * (g4 + g22) + -52.9192132360380043 * g11;
+    sph[sph_offset+2*stride] = -2.5945778936013020 * (g1 - g6) + 31.1349347232156219 * g8 + 4.6702402084823440 * g15 + -62.2698694464312439 * g17 + -0.5189155787202604 * g28 + 6.2269869446431247 * g30;
+    sph[sph_offset+3*stride] = -12.4539738892862495 * (g4 - g22) + 41.5132462976208316 * (g13 - g24);
+    sph[sph_offset+4*stride] = 1.4081304047606462 * g1 + 2.3468840079344107 * g6 + -28.1626080952129243 * g8 + 0.4693768015868821 * (g15 - g28) + -18.7750720634752817 * g17 + 37.5501441269505705 * g19 + 9.3875360317376408 * g30 + -12.5167147089835229 * g32;
+    sph[sph_offset+5*stride] = 6.6379903866747414 * (g4 + g22) + 13.2759807733494828 * g11 + -35.4026153955986160 * (g13 + g24) + 21.2415692373591725 * g26;
+    sph[sph_offset+6*stride] = -0.4516580379125866 * (g1 + g28) + -1.3549741137377600 * (g6 + g15) + 10.8397929099020782 * (g8 + g30) + 21.6795858198041564 * (g17 - g19 - g32) + 5.7812228852811094 * g34;
+    sph[sph_offset+7*stride] = -2.3899496919201728 * (g2 + g29) + -7.1698490757605189 * (g7 + g16) + 14.3396981515210360 * (g9 + g31) + 28.6793963030420720 * g18 + -11.4717585212168292 * (g20 + g33) + 1.0925484305920790 * g35;
+    sph[sph_offset+8*stride] = -0.4516580379125866 * (g0 + g21) + -1.3549741137377600 * (g3 + g10) + 10.8397929099020782 * (g5 + g23) + 21.6795858198041564 * g12 + -21.6795858198041564 * (g14 + g25) + 5.7812228852811094 * g27;
+    sph[sph_offset+9*stride] = 3.3189951933373707 * (g2 + g7 - g16 - g29) + -17.7013076977993080 * (g9 - g31) + 10.6207846186795862 * (g20 - g33);
+    sph[sph_offset+10*stride] = 0.4693768015868821 * (g0 - g3) + -9.3875360317376408 * g5 + -2.3468840079344107 * g10 + 18.7750720634752817 * g12 + 12.5167147089835229 * g14 + -1.4081304047606462 * g21 + 28.1626080952129243 * g23 + -37.5501441269505705 * g25;
+    sph[sph_offset+11*stride] = -3.1134934723215624 * (g2 + g29) + 15.5674673616078110 * (g7 + g16) + 10.3783115744052079 * (g9 + g31) + -62.2698694464312439 * g18;
+    sph[sph_offset+12*stride] = -0.5189155787202604 * g0 + 4.6702402084823440 * g3 + 6.2269869446431247 * g5 + 2.5945778936013020 * (g10 - g21) + -62.2698694464312439 * g12 + 31.1349347232156219 * g23;
+    sph[sph_offset+13*stride] = 2.6459606618019000 * (g2 - g29) + -39.6894099270284997 * (g7 - g16);
+    sph[sph_offset+14*stride] = 0.7071627325245963 * g0 + -14.8504173830165218 * g3 + 24.7506956383608703 * g10 + -4.9501391276721742 * g21;
+}
+
 extern "C" {
 __host__
 int cart2sph(cudaStream_t stream, double *cart_gto, double *sph_gto, int stride, int count, int ang)
@@ -123,8 +283,11 @@ int cart2sph(cudaStream_t stream, double *cart_gto, double *sph_gto, int stride,
         case 2: _cart2sph_ang2 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
         case 3: _cart2sph_ang3 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
         case 4: _cart2sph_ang4 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
+        case 5: _cart2sph_ang5 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
+        case 6: _cart2sph_ang6 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
+        case 7: _cart2sph_ang7 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
         default:
-            fprintf(stderr, "Ang > 4 is not supported!");
+            fprintf(stderr, "Ang > 7 is not supported!\n");
             return 1;
     }
 

diff --git a/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_pass1.cu b/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_pass1.cu
@@ -52,7 +52,8 @@ static int GINTrun_tasks_int3c2e_pass1_j(JKMatrix *jk, BasisProdOffsets *offsets
                 case 0b0000: GINTint3c2e_pass1_j_kernel0000<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
                 case 0b0010: GINTint3c2e_pass1_j_kernel0010<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
                 case 0b1000: GINTint3c2e_pass1_j_kernel1000<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
-                default: fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
+                default: fprintf(stderr, "rys roots 1 type_ijkl %d\n", type_ijkl);
+                return 1;
                 }
             break;
         case 2: GINTint3c2e_pass1_j_kernel<2, GSIZE2_INT3C> <<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
@@ -78,7 +79,7 @@ static int GINTrun_tasks_int3c2e_pass1_j(JKMatrix *jk, BasisProdOffsets *offsets
 
 extern "C" { __host__
 int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
-                 double *dm, double *rhoj, 
+                 double *dm, double *rhoj,
                  int nao, int naux, int n_dm,
                  int *bins_locs_ij, int *bins_locs_kl,
                  int ncp_ij, int ncp_kl)
@@ -98,22 +99,22 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
     jk.ao_offsets_j = 0;
     jk.ao_offsets_k = nao + 1;
     jk.ao_offsets_l = nao;
-    
+
     int *bas_pairs_locs = bpcache->bas_pairs_locs;
     int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
     cudaStream_t streams[MAX_STREAMS];
     for (n = 0; n < MAX_STREAMS; n++){
         checkCudaErrors(cudaStreamCreate(&streams[n]));
     }
-    
+
     int *idx = (int *)malloc(sizeof(int) * TOT_NF * 3);
-    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2)); 
+    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2));
     GINTinit_index1d_xyz(idx, l_locs);
     checkCudaErrors(cudaMemcpyToSymbol(c_idx, idx, sizeof(int) * TOT_NF*3));
     checkCudaErrors(cudaMemcpyToSymbol(c_l_locs, l_locs, sizeof(int) * (GPU_LMAX + 2)));
     free(idx);
     free(l_locs);
-    
+
     for (int cp_ij_id = 0; cp_ij_id < ncp_ij; cp_ij_id++){
         for (int k = 0; k < ncp_kl; k++, n++){
             int n_stream = n % MAX_STREAMS;
@@ -132,7 +133,7 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
             int ntasks_kl = bins_locs_kl[k+1] - bins_locs_kl[k];
             if (ntasks_kl <= 0) continue;
             if (ntasks_ij <= 0) continue;
-            
+
             BasisProdOffsets offsets;
             offsets.ntasks_ij = ntasks_ij;
             offsets.ntasks_kl = ntasks_kl;
@@ -151,7 +152,7 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
         checkCudaErrors(cudaStreamSynchronize(streams[n]));
         checkCudaErrors(cudaStreamDestroy(streams[n]));
     }
-    
+
     return 0;
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,6 +16,7 @@ output/ @@
     *.swp
     *~
     *.log
+    *.mod
     *.h5
     **/qchem_input.in
     .coverage
@@ Expand Down @@