Templated CUDA Kernel #105

Open · wants to merge 8 commits into main (changes from all commits)
12 changes: 8 additions & 4 deletions benchmarks/pytorch/benchmark.py
@@ -293,22 +293,24 @@ def loss_fn(xyz_tensor):
        args.t,
        args.normalized,
        device="cpu",
-        dtype=torch.float64,
+        dtype=torch.float32,
        compare=args.compare,
        verbose=args.verbose,
        warmup=args.warmup,
    )

    sphericart_benchmark(
        args.l,
        args.s,
        args.t,
        args.normalized,
        device="cpu",
-        dtype=torch.float32,
+        dtype=torch.float64,
        compare=args.compare,
        verbose=args.verbose,
        warmup=args.warmup,
    )


    if torch.cuda.is_available() and args.gpu:
        sphericart_benchmark(
@@ -317,19 +319,21 @@ def loss_fn(xyz_tensor):
            args.t,
            args.normalized,
            device="cuda",
-            dtype=torch.float64,
+            dtype=torch.float32,
            compare=args.compare,
            verbose=args.verbose,
            warmup=args.warmup,
        )

        sphericart_benchmark(
            args.l,
            args.s,
            args.t,
            args.normalized,
            device="cuda",
-            dtype=torch.float32,
+            dtype=torch.float64,
            compare=args.compare,
            verbose=args.verbose,
            warmup=args.warmup,
        )

91 changes: 70 additions & 21 deletions examples/cuda/example.cu
@@ -1,8 +1,9 @@
/** @file example.cu
 *  @brief Usage example for the CUDA C++ API
 */

#include "sphericart.hpp"
#include "sphericart_cuda.hpp"
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cuda.h>
@@ -11,6 +12,7 @@
#include <vector>

using namespace std;
+using namespace sphericart;
using namespace sphericart::cuda;

/* host macro that checks for errors in CUDA calls, and prints the file + line
@@ -27,52 +29,99 @@ using namespace sphericart::cuda;
} \
} while (0)

-int main() {
+template <class scalar_t> void timing() {
/* ===== set up the calculation ===== */

    // hard-coded parameters for the example
-    size_t n_samples = 10000;
-    size_t l_max = 10;
+    size_t n_samples = 100000;
+    size_t l_max = 32;

    // initializes samples
-    auto xyz = std::vector<double>(n_samples * 3, 0.0);
+    auto xyz = std::vector<scalar_t>(n_samples * 3, 0.0);
    for (size_t i = 0; i < n_samples * 3; ++i) {
-        xyz[i] = (double)rand() / (double)RAND_MAX * 2.0 - 1.0;
+        xyz[i] = (scalar_t)rand() / (scalar_t)RAND_MAX * 2.0 - 1.0;
    }

    // to avoid unnecessary allocations, calculators can use pre-allocated
    // memory; one can also provide uninitialized vectors that will be
    // automatically reshaped
-    auto sph = std::vector<double>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
+    auto sph =
+        std::vector<scalar_t>(n_samples * (l_max + 1) * (l_max + 1), 0.0);

+    auto sph_cpu =
+        std::vector<scalar_t>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
    auto dsph =
-        std::vector<double>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
-    auto ddsph =
-        std::vector<double>(n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);
+        std::vector<scalar_t>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
+    auto ddsph = std::vector<scalar_t>(
+        n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);

    /* ===== API calls ===== */

    // internal buffers and numerical factors are initialized at construction
-    sphericart::cuda::SphericalHarmonics<double> calculator_cuda(l_max);
+    sphericart::cuda::SphericalHarmonics<scalar_t> calculator_cuda(l_max);

-    double *xyz_cuda;
-    CUDA_CHECK(cudaMalloc(&xyz_cuda, n_samples * 3 * sizeof(double)));
-    CUDA_CHECK(cudaMemcpy(xyz_cuda, xyz.data(), n_samples * 3 * sizeof(double),
+    scalar_t *xyz_cuda;
+    CUDA_CHECK(cudaMalloc(&xyz_cuda, n_samples * 3 * sizeof(scalar_t)));
+    CUDA_CHECK(cudaMemcpy(xyz_cuda, xyz.data(),
+                          n_samples * 3 * sizeof(scalar_t),
                          cudaMemcpyHostToDevice));
-    double *sph_cuda;
+    scalar_t *sph_cuda;
    CUDA_CHECK(cudaMalloc(&sph_cuda, n_samples * (l_max + 1) * (l_max + 1) *
-                                         sizeof(double)));
+                                         sizeof(scalar_t)));

+    scalar_t *dsph_cuda;
+    CUDA_CHECK(cudaMalloc(&dsph_cuda, 3 * n_samples * (l_max + 1) *
+                                          (l_max + 1) * sizeof(scalar_t)));
+    for (int i = 0; i < 5; i++) {
+        calculator_cuda.compute_with_gradients(xyz_cuda, n_samples, sph_cuda,
+                                               dsph_cuda);
+        // calculator_cuda.compute(xyz_cuda, n_samples, sph_cuda); // no
+        // gradients
+    }

std::cout << "-----------------" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

calculator_cuda.compute_with_gradients(xyz_cuda, n_samples, sph_cuda,
dsph_cuda);

calculator_cuda.compute(xyz_cuda, n_samples, false, false,
sph_cuda); // no gradients */
// calculator_cuda.compute(xyz_cuda, n_samples, sph_cuda); // no gradients

// Record the end time
auto end = std::chrono::high_resolution_clock::now();

// Calculate the duration
auto duration =
std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

// Print the duration in microseconds
std::cout << "Time taken by function: " << duration.count()
<< " nanoseconds" << std::endl;
std::cout << "" << ((double)duration.count()) / ((double)n_samples)
<< " ns/sample" << std::endl;
// */
    CUDA_CHECK(
        cudaMemcpy(sph.data(), sph_cuda,
-                   n_samples * (l_max + 1) * (l_max + 1) * sizeof(double),
+                   n_samples * (l_max + 1) * (l_max + 1) * sizeof(scalar_t),
                   cudaMemcpyDeviceToHost));

-    for (int i = 0; i < 4; i++) {
-        std::cout << sph[i] << std::endl;
+    auto calculator = sphericart::SphericalHarmonics<scalar_t>(l_max);

+    calculator.compute(xyz, sph_cpu);

+    /*for (int i = 0; i < n_samples; i++) {
+        std::cout << "sample: " << xyz[i * 3 + 0] << " " << xyz[i * 3 + 1]
+                  << " " << xyz[i * 3 + 2] << std::endl;
    }

+    for (int i = 0; i < n_samples * (l_max + 1) * (l_max + 1); i++) {
+        std::cout << sph[i] << " " << sph_cpu[i] << std::endl;
+    }*/
+}

+int main() {
+    timing<double>();

    return 0;
}
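Since timing() is now templated on the scalar type, the float path can be exercised from the same main(); a minimal sketch (this diff only instantiates the double version, the float call is hypothetical):

int main() {
    timing<double>(); // as in this diff
    timing<float>();  // hypothetical: also time single precision
    return 0;
}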
6 changes: 3 additions & 3 deletions setup.py
@@ -97,9 +97,9 @@ def run(self):
        # add a random uuid to the file url to prevent pip from using a cached
        # wheel for sphericart-torch, and force it to re-build from scratch
        uuid_ = uuid.uuid4()
-        extras_require[
-            "torch"
-        ] = f"sphericart-torch @ file://{SPHERICART_TORCH}?{uuid_}"
+        extras_require["torch"] = (
+            f"sphericart-torch @ file://{SPHERICART_TORCH}?{uuid_}"
+        )
    else:
        # installing wheel/sdist
        extras_require["torch"] = "sphericart-torch"
40 changes: 38 additions & 2 deletions sphericart-torch/src/autograd.cpp
@@ -222,8 +222,44 @@ torch::autograd::variable_list SphericalHarmonicsAutograd::forward(
            {1, 1, 1, 1},
            torch::TensorOptions().dtype(xyz.dtype()).device(xyz.device()));
    }

+    if (xyz.dtype() == c10::kDouble) {

+        if (requires_grad && requires_hessian) {
+            calculator.calculator_cuda_double_.compute_with_hessians(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                dsph.data_ptr<double>(), ddsph.data_ptr<double>(), stream);
+        } else if (requires_grad) {
+            calculator.calculator_cuda_double_.compute_with_gradients(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                dsph.data_ptr<double>(), stream);
+        } else {
+            calculator.calculator_cuda_double_.compute(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                stream);
+        }

+    } else if (xyz.dtype() == c10::kFloat) {

+        if (requires_grad && requires_hessian) {
+            calculator.calculator_cuda_float_.compute_with_hessians(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                dsph.data_ptr<float>(), ddsph.data_ptr<float>(), stream);
+        } else if (requires_grad) {
+            calculator.calculator_cuda_float_.compute_with_gradients(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                dsph.data_ptr<float>(), stream);
+        } else {
+            calculator.calculator_cuda_float_.compute(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                stream);
+        }

+    } else {
+        throw std::runtime_error(
+            "this code only runs on c10::kDouble and c10::kFloat tensors");
+    }

+    /*if (xyz.dtype() == c10::kDouble) {
        calculator.calculator_cuda_double_.compute(
            xyz.data_ptr<double>(), xyz.size(0), requires_grad,
            requires_hessian, sph.data_ptr<double>(),
@@ -237,7 +273,7 @@ torch::autograd::variable_list SphericalHarmonicsAutograd::forward(
    } else {
        throw std::runtime_error(
            "this code only runs on c10::kDouble and c10::kFloat tensors");
-    }
+    } */

    if (!requires_grad) {
        dsph = torch::Tensor();
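The kDouble and kFloat branches above are line-for-line identical up to the scalar type, so the dispatch could be written once as a template. A hedged sketch, not part of this diff: get_calculator<T>() is an assumed accessor that would return the matching calculator_cuda_double_ / calculator_cuda_float_ member.

// Hypothetical refactor folding the per-dtype branches into one template.
// get_calculator<T>() is an assumption, not part of this PR.
template <typename scalar_t, typename Holder>
void compute_dispatch(Holder &calculator, torch::Tensor xyz, torch::Tensor sph,
                      torch::Tensor dsph, torch::Tensor ddsph,
                      bool requires_grad, bool requires_hessian, void *stream) {
    auto &calc = calculator.template get_calculator<scalar_t>();
    if (requires_grad && requires_hessian) {
        calc.compute_with_hessians(xyz.data_ptr<scalar_t>(), xyz.size(0),
                                   sph.data_ptr<scalar_t>(),
                                   dsph.data_ptr<scalar_t>(),
                                   ddsph.data_ptr<scalar_t>(), stream);
    } else if (requires_grad) {
        calc.compute_with_gradients(xyz.data_ptr<scalar_t>(), xyz.size(0),
                                    sph.data_ptr<scalar_t>(),
                                    dsph.data_ptr<scalar_t>(), stream);
    } else {
        calc.compute(xyz.data_ptr<scalar_t>(), xyz.size(0),
                     sph.data_ptr<scalar_t>(), stream);
    }
}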
2 changes: 2 additions & 0 deletions sphericart/CMakeLists.txt
@@ -69,6 +69,7 @@ add_library(sphericart ${COMMON_SOURCES})

if (CMAKE_CUDA_COMPILER AND SPHERICART_ENABLE_CUDA AND SPHERICART_ARCH_NATIVE)
    set_target_properties(sphericart PROPERTIES CUDA_ARCHITECTURES native)
+    set(CMAKE_CUDA_STANDARD 17)
endif()


@@ -78,6 +79,7 @@ set_target_properties(sphericart PROPERTIES
    POSITION_INDEPENDENT_CODE ON
)


# we need to compile sphericart with C++17 for if constexpr
target_compile_features(sphericart PRIVATE cxx_std_17)

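The new CMAKE_CUDA_STANDARD 17 line matters because, as the comment above notes, the templated kernels rely on if constexpr: branches on compile-time flags are discarded during instantiation, so the values-only kernel carries no gradient code. A minimal illustrative kernel, not code from this repository:

// Illustrative only: why templated CUDA kernels want C++17.
template <typename scalar_t, bool kGradients>
__global__ void sph_demo(const scalar_t *xyz, scalar_t *sph, scalar_t *dsph,
                         int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;
    sph[i] = xyz[3 * i] + xyz[3 * i + 1] + xyz[3 * i + 2]; // placeholder math
    if constexpr (kGradients) {
        // compiled only into the <scalar_t, true> instantiation
        dsph[3 * i + 0] = scalar_t(1);
        dsph[3 * i + 1] = scalar_t(1);
        dsph[3 * i + 2] = scalar_t(1);
    }
}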
33 changes: 21 additions & 12 deletions sphericart/include/cuda_base.hpp
@@ -41,9 +41,6 @@ namespace cuda {
 * @param GRID_DIM_X
 *     The size of the threadblock in the x dimension. Used to parallelize
 *     over the sample dimension
- * @param GRID_DIM_Y
- *     The size of the threadblock in the y dimension. Used only to improve
- *     memory throughput on reads and writes.
 * @param xyz_requires_grad
 *     Boolean representing whether or not the input XYZ requires grad -
 *     required for torch.
@@ -64,14 +61,28 @@ namespace cuda {
 *     Pointer to a cudaStream_t or nullptr. If this is nullptr, the kernel
 *     launch will be performed on the default stream.
 */
-template <typename scalar_t>
-void spherical_harmonics_cuda_base(
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics(const scalar_t *__restrict__ xyz, const int nedges,
+                         const scalar_t *__restrict__ prefactors,
+                         const int nprefactors, const int64_t l_max,
+                         const bool normalize, scalar_t *__restrict__ sph,
+                         void *cuda_stream);
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics_with_gradients(
    const scalar_t *__restrict__ xyz, const int nedges,
    const scalar_t *__restrict__ prefactors, const int nprefactors,
+    const int64_t l_max, const bool normalize, scalar_t *__restrict__ sph,
+    scalar_t *__restrict__ dsph, void *cuda_stream);
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics_with_hessians(
+    const scalar_t *__restrict__ xyz, const int nedges,
+    const scalar_t *__restrict__ prefactors, const int nprefactors,
-    const int64_t l_max, const bool normalize, const int64_t GRID_DIM_X,
-    const int64_t GRID_DIM_Y, const bool gradients, const bool hessian,
-    scalar_t *__restrict__ sph, scalar_t *__restrict__ dsph,
-    scalar_t *__restrict__ ddsph, void *cuda_stream = nullptr);
+    const int64_t l_max, const bool normalize, scalar_t *__restrict__ sph,
+    scalar_t *__restrict__ dsph, scalar_t *__restrict__ ddsph,
+    void *cuda_stream);

template <typename scalar_t>
void spherical_harmonics_backward_cuda_base(
@@ -92,8 +103,6 @@ void spherical_harmonics_backward_cuda_base(
 *     The maximum degree of the spherical harmonics to be calculated.
 * @param GRID_DIM_X
 *     The size of the threadblock in the x dimension.
- * @param GRID_DIM_Y
- *     The size of the threadblock in the y dimension.
 * @param requires_grad
 *     Boolean representing if we need first-order derivatives.
 * @param requires_hessian
@@ -102,7 +111,7 @@ void spherical_harmonics_backward_cuda_base(
 *     the current size of the shared memory allocation.
 */
int adjust_shared_memory(size_t element_size, int64_t l_max, int64_t GRID_DIM_X,
-                         int64_t GRID_DIM_Y, bool requires_grad,
+                         bool requires_grad,
                         bool requires_hessian,
                         int64_t current_shared_mem_alloc);

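With GRID_DIM_X now a template parameter, the block size is fixed at compile time rather than passed as a runtime argument. A hedged host-side sketch of a call to the new launcher (buffer setup follows examples/cuda/example.cu; launch_sph and the 32-thread choice are illustrative, not code from this diff):

#include "cuda_base.hpp"

// Device buffers are assumed to be cudaMalloc'd, with xyz_cuda and
// prefactors_cuda already filled.
void launch_sph(const float *xyz_cuda, int nedges,
                const float *prefactors_cuda, int nprefactors, int64_t l_max,
                float *sph_cuda) {
    constexpr int kGridDimX = 32; // matches CUDA_GRID_DIM_X_ in this diff
    sphericart::cuda::spherical_harmonics<float, kGridDimX>(
        xyz_cuda, nedges, prefactors_cuda, nprefactors, l_max,
        /*normalize=*/true, sph_cuda, /*cuda_stream=*/nullptr);
}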
17 changes: 12 additions & 5 deletions sphericart/include/sphericart_cuda.hpp
@@ -73,9 +73,14 @@ template <typename T> class SphericalHarmonics {
 *     nullptr, the kernel launch will be performed on the default stream.
 */

-    void compute(const T *xyz, size_t nsamples, bool compute_with_gradients,
-                 bool compute_with_hessian, T *sph, T *dsph = nullptr,
-                 T *ddsph = nullptr, void *cuda_stream = nullptr);
+    void compute(const T *xyz, size_t nsamples, T *sph,
+                 void *cuda_stream = nullptr);
+
+    void compute_with_gradients(const T *xyz, size_t nsamples, T *sph, T *dsph,
+                                void *cuda_stream = nullptr);
+
+    void compute_with_hessians(const T *xyz, size_t nsamples, T *sph, T *dsph,
+                               T *ddsph, void *cuda_stream = nullptr);

  private:
    size_t l_max; // maximum l value computed by this class
@@ -84,12 +89,14 @@ template <typename T> class SphericalHarmonics {
    T *prefactors_cpu;  // host prefactors buffer
    T *prefactors_cuda; // storage space for prefactors

-    int64_t CUDA_GRID_DIM_X_ = 8;
-    int64_t CUDA_GRID_DIM_Y_ = 8;
+    int64_t CUDA_GRID_DIM_X_ = 32;

    bool cached_compute_with_gradients = false;
    bool cached_compute_with_hessian = false;
    int64_t _current_shared_mem_allocation = 0;

+    void update_cache_and_smem(bool compute_with_gradients,
+                               bool compute_with_hessian);
};

} // namespace cuda
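The split API makes each capability explicit at the call site instead of passing boolean flags. A minimal host-side sketch of the three entry points (signatures are taken from the header above; demo and the buffer names are illustrative, and the device buffers are assumed to be allocated as in examples/cuda/example.cu):

#include "sphericart_cuda.hpp"

void demo(const double *xyz_cuda, size_t n_samples, size_t l_max,
          double *sph_cuda, double *dsph_cuda, double *ddsph_cuda) {
    sphericart::cuda::SphericalHarmonics<double> calc(l_max);

    // values only
    calc.compute(xyz_cuda, n_samples, sph_cuda);

    // values + first derivatives
    calc.compute_with_gradients(xyz_cuda, n_samples, sph_cuda, dsph_cuda);

    // values + first and second derivatives
    calc.compute_with_hessians(xyz_cuda, n_samples, sph_cuda, dsph_cuda,
                               ddsph_cuda);
}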