Templated CUDA Kernel #105

Open · wants to merge 8 commits into main (changes from all commits)
12 changes: 8 additions & 4 deletions benchmarks/pytorch/benchmark.py
@@ -293,22 +293,24 @@ def loss_fn(xyz_tensor):
        args.t,
        args.normalized,
        device="cpu",
-        dtype=torch.float64,
+        dtype=torch.float32,
        compare=args.compare,
        verbose=args.verbose,
        warmup=args.warmup,
    )

    sphericart_benchmark(
        args.l,
        args.s,
        args.t,
        args.normalized,
        device="cpu",
-        dtype=torch.float32,
+        dtype=torch.float64,
        compare=args.compare,
        verbose=args.verbose,
        warmup=args.warmup,
    )


    if torch.cuda.is_available() and args.gpu:
        sphericart_benchmark(
@@ -317,19 +319,21 @@ def loss_fn(xyz_tensor):
            args.t,
            args.normalized,
            device="cuda",
-            dtype=torch.float64,
+            dtype=torch.float32,
            compare=args.compare,
            verbose=args.verbose,
            warmup=args.warmup,
        )

        sphericart_benchmark(
            args.l,
            args.s,
            args.t,
            args.normalized,
            device="cuda",
-            dtype=torch.float32,
+            dtype=torch.float64,
            compare=args.compare,
            verbose=args.verbose,
            warmup=args.warmup,
        )

91 changes: 70 additions & 21 deletions examples/cuda/example.cu
@@ -1,8 +1,9 @@
/** @file example.cu
 *  @brief Usage example for the CUDA C++ API
 */

#include "sphericart.hpp"
#include "sphericart_cuda.hpp"
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cuda.h>
@@ -11,6 +12,7 @@
#include <vector>

using namespace std;
+using namespace sphericart;
using namespace sphericart::cuda;

/* host macro that checks for errors in CUDA calls, and prints the file + line
@@ -27,52 +29,99 @@ using namespace sphericart::cuda;
} \
} while (0)

-int main() {
+template <class scalar_t> void timing() {
/* ===== set up the calculation ===== */

    // hard-coded parameters for the example
-    size_t n_samples = 10000;
-    size_t l_max = 10;
+    size_t n_samples = 100000;
+    size_t l_max = 32;

    // initializes samples
-    auto xyz = std::vector<double>(n_samples * 3, 0.0);
+    auto xyz = std::vector<scalar_t>(n_samples * 3, 0.0);
    for (size_t i = 0; i < n_samples * 3; ++i) {
-        xyz[i] = (double)rand() / (double)RAND_MAX * 2.0 - 1.0;
+        xyz[i] = (scalar_t)rand() / (scalar_t)RAND_MAX * 2.0 - 1.0;
    }

    // to avoid unnecessary allocations, calculators can use pre-allocated
    // memory; one can also provide uninitialized vectors that will be
    // automatically reshaped
-    auto sph = std::vector<double>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
+    auto sph =
+        std::vector<scalar_t>(n_samples * (l_max + 1) * (l_max + 1), 0.0);

+    auto sph_cpu =
+        std::vector<scalar_t>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
    auto dsph =
-        std::vector<double>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
-    auto ddsph =
-        std::vector<double>(n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);
+        std::vector<scalar_t>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
+    auto ddsph = std::vector<scalar_t>(
+        n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);

    /* ===== API calls ===== */

    // internal buffers and numerical factors are initialized at construction
-    sphericart::cuda::SphericalHarmonics<double> calculator_cuda(l_max);
+    sphericart::cuda::SphericalHarmonics<scalar_t> calculator_cuda(l_max);

-    double *xyz_cuda;
-    CUDA_CHECK(cudaMalloc(&xyz_cuda, n_samples * 3 * sizeof(double)));
-    CUDA_CHECK(cudaMemcpy(xyz_cuda, xyz.data(), n_samples * 3 * sizeof(double),
+    scalar_t *xyz_cuda;
+    CUDA_CHECK(cudaMalloc(&xyz_cuda, n_samples * 3 * sizeof(scalar_t)));
+    CUDA_CHECK(cudaMemcpy(xyz_cuda, xyz.data(),
+                          n_samples * 3 * sizeof(scalar_t),
                          cudaMemcpyHostToDevice));
-    double *sph_cuda;
+    scalar_t *sph_cuda;
    CUDA_CHECK(cudaMalloc(&sph_cuda, n_samples * (l_max + 1) * (l_max + 1) *
-                                         sizeof(double)));
+                                         sizeof(scalar_t)));

+    scalar_t *dsph_cuda;
+    CUDA_CHECK(cudaMalloc(&dsph_cuda, 3 * n_samples * (l_max + 1) *
+                                          (l_max + 1) * sizeof(scalar_t)));
+    for (int i = 0; i < 5; i++) {
+        calculator_cuda.compute_with_gradients(xyz_cuda, n_samples, sph_cuda,
+                                               dsph_cuda);
+        // calculator_cuda.compute(xyz_cuda, n_samples, sph_cuda); // no
+        // gradients
+    }

std::cout << "-----------------" << std::endl;

auto start = std::chrono::high_resolution_clock::now();

calculator_cuda.compute_with_gradients(xyz_cuda, n_samples, sph_cuda,
dsph_cuda);

calculator_cuda.compute(xyz_cuda, n_samples, false, false,
sph_cuda); // no gradients */
// calculator_cuda.compute(xyz_cuda, n_samples, sph_cuda); // no gradients

// Record the end time
auto end = std::chrono::high_resolution_clock::now();

// Calculate the duration
auto duration =
std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

// Print the duration in microseconds
std::cout << "Time taken by function: " << duration.count()
<< " nanoseconds" << std::endl;
std::cout << "" << ((double)duration.count()) / ((double)n_samples)
<< " ns/sample" << std::endl;
// */
    CUDA_CHECK(
        cudaMemcpy(sph.data(), sph_cuda,
-                   n_samples * (l_max + 1) * (l_max + 1) * sizeof(double),
+                   n_samples * (l_max + 1) * (l_max + 1) * sizeof(scalar_t),
                   cudaMemcpyDeviceToHost));

-    for (int i = 0; i < 4; i++) {
-        std::cout << sph[i] << std::endl;
+    auto calculator = sphericart::SphericalHarmonics<scalar_t>(l_max);

+    calculator.compute(xyz, sph_cpu);

+    /*for (int i = 0; i < n_samples; i++) {
+        std::cout << "sample: " << xyz[i * 3 + 0] << " " << xyz[i * 3 + 1]
+                  << " " << xyz[i * 3 + 2] << std::endl;
    }

+    for (int i = 0; i < n_samples * (l_max + 1) * (l_max + 1); i++) {
+        std::cout << sph[i] << " " << sph_cpu[i] << std::endl;
+    }*/
+}

+int main() {
+    timing<double>();

    return 0;
}
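Since timing() is now templated on the scalar type, the float path can be exercised from the same main(); a minimal sketch (this diff only instantiates the double version, the float call is hypothetical):

int main() {
    timing<double>(); // as in this diff
    timing<float>();  // hypothetical: also time single precision
    return 0;
}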
6 changes: 3 additions & 3 deletions setup.py
@@ -97,9 +97,9 @@ def run(self):
        # add a random uuid to the file url to prevent pip from using a cached
        # wheel for sphericart-torch, and force it to re-build from scratch
        uuid_ = uuid.uuid4()
-        extras_require[
-            "torch"
-        ] = f"sphericart-torch @ file://{SPHERICART_TORCH}?{uuid_}"
+        extras_require["torch"] = (
+            f"sphericart-torch @ file://{SPHERICART_TORCH}?{uuid_}"
+        )
    else:
        # installing wheel/sdist
        extras_require["torch"] = "sphericart-torch"
40 changes: 38 additions & 2 deletions sphericart-torch/src/autograd.cpp
@@ -222,8 +222,44 @@ torch::autograd::variable_list SphericalHarmonicsAutograd::forward(
            {1, 1, 1, 1},
            torch::TensorOptions().dtype(xyz.dtype()).device(xyz.device()));
    }

+    if (xyz.dtype() == c10::kDouble) {

+        if (requires_grad && requires_hessian) {
+            calculator.calculator_cuda_double_.compute_with_hessians(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                dsph.data_ptr<double>(), ddsph.data_ptr<double>(), stream);
+        } else if (requires_grad) {
+            calculator.calculator_cuda_double_.compute_with_gradients(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                dsph.data_ptr<double>(), stream);
+        } else {
+            calculator.calculator_cuda_double_.compute(
+                xyz.data_ptr<double>(), xyz.size(0), sph.data_ptr<double>(),
+                stream);
+        }

+    } else if (xyz.dtype() == c10::kFloat) {

+        if (requires_grad && requires_hessian) {
+            calculator.calculator_cuda_float_.compute_with_hessians(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                dsph.data_ptr<float>(), ddsph.data_ptr<float>(), stream);
+        } else if (requires_grad) {
+            calculator.calculator_cuda_float_.compute_with_gradients(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                dsph.data_ptr<float>(), stream);
+        } else {
+            calculator.calculator_cuda_float_.compute(
+                xyz.data_ptr<float>(), xyz.size(0), sph.data_ptr<float>(),
+                stream);
+        }

+    } else {
+        throw std::runtime_error(
+            "this code only runs on c10::kDouble and c10::kFloat tensors");
+    }

+    /*if (xyz.dtype() == c10::kDouble) {
        calculator.calculator_cuda_double_.compute(
            xyz.data_ptr<double>(), xyz.size(0), requires_grad,
            requires_hessian, sph.data_ptr<double>(),
@@ -237,7 +273,7 @@ torch::autograd::variable_list SphericalHarmonicsAutograd::forward(
    } else {
        throw std::runtime_error(
            "this code only runs on c10::kDouble and c10::kFloat tensors");
-    }
+    } */

    if (!requires_grad) {
        dsph = torch::Tensor();
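The kDouble and kFloat branches above are line-for-line identical up to the scalar type, so the dispatch could be written once as a template. A hedged sketch, not part of this diff: get_calculator<T>() is an assumed accessor that would return the matching calculator_cuda_double_ / calculator_cuda_float_ member.

// Hypothetical refactor folding the per-dtype branches into one template.
// get_calculator<T>() is an assumption, not part of this PR.
template <typename scalar_t, typename Holder>
void compute_dispatch(Holder &calculator, torch::Tensor xyz, torch::Tensor sph,
                      torch::Tensor dsph, torch::Tensor ddsph,
                      bool requires_grad, bool requires_hessian, void *stream) {
    auto &calc = calculator.template get_calculator<scalar_t>();
    if (requires_grad && requires_hessian) {
        calc.compute_with_hessians(xyz.data_ptr<scalar_t>(), xyz.size(0),
                                   sph.data_ptr<scalar_t>(),
                                   dsph.data_ptr<scalar_t>(),
                                   ddsph.data_ptr<scalar_t>(), stream);
    } else if (requires_grad) {
        calc.compute_with_gradients(xyz.data_ptr<scalar_t>(), xyz.size(0),
                                    sph.data_ptr<scalar_t>(),
                                    dsph.data_ptr<scalar_t>(), stream);
    } else {
        calc.compute(xyz.data_ptr<scalar_t>(), xyz.size(0),
                     sph.data_ptr<scalar_t>(), stream);
    }
}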
2 changes: 2 additions & 0 deletions sphericart/CMakeLists.txt
@@ -69,6 +69,7 @@ add_library(sphericart ${COMMON_SOURCES})

if (CMAKE_CUDA_COMPILER AND SPHERICART_ENABLE_CUDA AND SPHERICART_ARCH_NATIVE)
    set_target_properties(sphericart PROPERTIES CUDA_ARCHITECTURES native)
+    set(CMAKE_CUDA_STANDARD 17)
endif()


@@ -78,6 +79,7 @@ set_target_properties(sphericart PROPERTIES
    POSITION_INDEPENDENT_CODE ON
)


# we need to compile sphericart with C++17 for if constexpr
target_compile_features(sphericart PRIVATE cxx_std_17)

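The new CMAKE_CUDA_STANDARD 17 line matters because, as the comment above notes, the templated kernels rely on if constexpr: branches on compile-time flags are discarded during instantiation, so the values-only kernel carries no gradient code. A minimal illustrative kernel, not code from this repository:

// Illustrative only: why templated CUDA kernels want C++17.
template <typename scalar_t, bool kGradients>
__global__ void sph_demo(const scalar_t *xyz, scalar_t *sph, scalar_t *dsph,
                         int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;
    sph[i] = xyz[3 * i] + xyz[3 * i + 1] + xyz[3 * i + 2]; // placeholder math
    if constexpr (kGradients) {
        // compiled only into the <scalar_t, true> instantiation
        dsph[3 * i + 0] = scalar_t(1);
        dsph[3 * i + 1] = scalar_t(1);
        dsph[3 * i + 2] = scalar_t(1);
    }
}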
33 changes: 21 additions & 12 deletions sphericart/include/cuda_base.hpp
@@ -41,9 +41,6 @@ namespace cuda {
 * @param GRID_DIM_X
 *     The size of the threadblock in the x dimension. Used to parallelize
 *     over the sample dimension
- * @param GRID_DIM_Y
- *     The size of the threadblock in the y dimension. Used only to improve
- *     memory throughput on reads and writes.
 * @param xyz_requires_grad
 *     Boolean representing whether or not the input XYZ requires grad -
 *     required for torch.
@@ -64,14 +61,28 @@ namespace cuda {
 *     Pointer to a cudaStream_t or nullptr. If this is nullptr, the kernel
 *     launch will be performed on the default stream.
 */
-template <typename scalar_t>
-void spherical_harmonics_cuda_base(
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics(const scalar_t *__restrict__ xyz, const int nedges,
+                         const scalar_t *__restrict__ prefactors,
+                         const int nprefactors, const int64_t l_max,
+                         const bool normalize, scalar_t *__restrict__ sph,
+                         void *cuda_stream);
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics_with_gradients(
    const scalar_t *__restrict__ xyz, const int nedges,
    const scalar_t *__restrict__ prefactors, const int nprefactors,
+    const int64_t l_max, const bool normalize, scalar_t *__restrict__ sph,
+    scalar_t *__restrict__ dsph, void *cuda_stream);
+
+template <typename scalar_t, int GRID_DIM_X>
+void spherical_harmonics_with_hessians(
+    const scalar_t *__restrict__ xyz, const int nedges,
+    const scalar_t *__restrict__ prefactors, const int nprefactors,
-    const int64_t l_max, const bool normalize, const int64_t GRID_DIM_X,
-    const int64_t GRID_DIM_Y, const bool gradients, const bool hessian,
-    scalar_t *__restrict__ sph, scalar_t *__restrict__ dsph,
-    scalar_t *__restrict__ ddsph, void *cuda_stream = nullptr);
+    const int64_t l_max, const bool normalize, scalar_t *__restrict__ sph,
+    scalar_t *__restrict__ dsph, scalar_t *__restrict__ ddsph,
+    void *cuda_stream);

template <typename scalar_t>
void spherical_harmonics_backward_cuda_base(
@@ -92,8 +103,6 @@ void spherical_harmonics_backward_cuda_base(
 *     The maximum degree of the spherical harmonics to be calculated.
 * @param GRID_DIM_X
 *     The size of the threadblock in the x dimension.
- * @param GRID_DIM_Y
- *     The size of the threadblock in the y dimension.
 * @param requires_grad
 *     Boolean representing if we need first-order derivatives.
 * @param requires_hessian
@@ -102,7 +111,7 @@ void spherical_harmonics_backward_cuda_base(
 *     the current size of the shared memory allocation.
 */
int adjust_shared_memory(size_t element_size, int64_t l_max, int64_t GRID_DIM_X,
-                         int64_t GRID_DIM_Y, bool requires_grad,
+                         bool requires_grad,
                         bool requires_hessian,
                         int64_t current_shared_mem_alloc);

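With GRID_DIM_X now a template parameter, the block size is fixed at compile time rather than passed as a runtime argument. A hedged host-side sketch of a call to the new launcher (buffer setup follows examples/cuda/example.cu; launch_sph and the 32-thread choice are illustrative, not code from this diff):

#include "cuda_base.hpp"

// Device buffers are assumed to be cudaMalloc'd, with xyz_cuda and
// prefactors_cuda already filled.
void launch_sph(const float *xyz_cuda, int nedges,
                const float *prefactors_cuda, int nprefactors, int64_t l_max,
                float *sph_cuda) {
    constexpr int kGridDimX = 32; // matches CUDA_GRID_DIM_X_ in this diff
    sphericart::cuda::spherical_harmonics<float, kGridDimX>(
        xyz_cuda, nedges, prefactors_cuda, nprefactors, l_max,
        /*normalize=*/true, sph_cuda, /*cuda_stream=*/nullptr);
}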
17 changes: 12 additions & 5 deletions sphericart/include/sphericart_cuda.hpp
@@ -73,9 +73,14 @@ template <typename T> class SphericalHarmonics {
 *     nullptr, the kernel launch will be performed on the default stream.
 */

-    void compute(const T *xyz, size_t nsamples, bool compute_with_gradients,
-                 bool compute_with_hessian, T *sph, T *dsph = nullptr,
-                 T *ddsph = nullptr, void *cuda_stream = nullptr);
+    void compute(const T *xyz, size_t nsamples, T *sph,
+                 void *cuda_stream = nullptr);
+
+    void compute_with_gradients(const T *xyz, size_t nsamples, T *sph, T *dsph,
+                                void *cuda_stream = nullptr);
+
+    void compute_with_hessians(const T *xyz, size_t nsamples, T *sph, T *dsph,
+                               T *ddsph, void *cuda_stream = nullptr);

  private:
    size_t l_max; // maximum l value computed by this class
@@ -84,12 +89,14 @@ template <typename T> class SphericalHarmonics {
    T *prefactors_cpu;  // host prefactors buffer
    T *prefactors_cuda; // storage space for prefactors

-    int64_t CUDA_GRID_DIM_X_ = 8;
-    int64_t CUDA_GRID_DIM_Y_ = 8;
+    int64_t CUDA_GRID_DIM_X_ = 32;

    bool cached_compute_with_gradients = false;
    bool cached_compute_with_hessian = false;
    int64_t _current_shared_mem_allocation = 0;

+    void update_cache_and_smem(bool compute_with_gradients,
+                               bool compute_with_hessian);
};

} // namespace cuda
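The split API makes each capability explicit at the call site instead of passing boolean flags. A minimal host-side sketch of the three entry points (signatures are taken from the header above; demo and the buffer names are illustrative, and the device buffers are assumed to be allocated as in examples/cuda/example.cu):

#include "sphericart_cuda.hpp"

void demo(const double *xyz_cuda, size_t n_samples, size_t l_max,
          double *sph_cuda, double *dsph_cuda, double *ddsph_cuda) {
    sphericart::cuda::SphericalHarmonics<double> calc(l_max);

    // values only
    calc.compute(xyz_cuda, n_samples, sph_cuda);

    // values + first derivatives
    calc.compute_with_gradients(xyz_cuda, n_samples, sph_cuda, dsph_cuda);

    // values + first and second derivatives
    calc.compute_with_hessians(xyz_cuda, n_samples, sph_cuda, dsph_cuda,
                               ddsph_cuda);
}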