Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate GPU data loading #445

Merged
merged 15 commits into from
Feb 6, 2024
65 changes: 38 additions & 27 deletions src/kbmod/search/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,43 @@

namespace search {

extern "C" void device_allocate_psi_phi_array(PsiPhiArray *data) {
if (!data->cpu_array_allocated()) throw std::runtime_error("CPU data is not allocated.");
if (data->gpu_array_allocated()) throw std::runtime_error("GPU data is already allocated.");
extern "C" void device_allocate_psi_phi_arrays(PsiPhiArray *data) {
if (!data->cpu_array_allocated() || !data->cpu_time_array_allocated()) {
throw std::runtime_error("CPU data is not allocated.");
}
if (data->gpu_array_allocated() || data->gpu_time_array_allocated()) {
throw std::runtime_error("GPU data is already allocated.");
}

// Allocate space for the psi/phi data.
void *device_array_ptr;
checkCudaErrors(cudaMalloc((void **)&device_array_ptr, data->get_total_array_size()));
checkCudaErrors(cudaMemcpy(device_array_ptr, data->get_cpu_array_ptr(), data->get_total_array_size(),
cudaMemcpyHostToDevice));
data->set_gpu_array_ptr(device_array_ptr);

// Allocate space for the times data.
float *device_times_ptr;
long unsigned time_bytes = data->get_num_times() * sizeof(float);
checkCudaErrors(cudaMalloc((void **)&device_times_ptr, time_bytes));
checkCudaErrors(
cudaMemcpy(device_times_ptr, data->get_cpu_time_array_ptr(), time_bytes, cudaMemcpyHostToDevice));
data->set_gpu_time_array_ptr(device_times_ptr);
}

extern "C" void device_free_psi_phi_array(PsiPhiArray *data) {
extern "C" void device_free_psi_phi_arrays(PsiPhiArray *data) {
if (data->gpu_array_allocated()) {
checkCudaErrors(cudaFree(data->get_gpu_array_ptr()));
data->set_gpu_array_ptr(nullptr);
}
if (data->gpu_time_array_allocated()) {
checkCudaErrors(cudaFree(data->get_gpu_time_array_ptr()));
data->set_gpu_time_array_ptr(nullptr);
}
}

__host__ __device__ PsiPhi read_encoded_psi_phi(PsiPhiArrayMeta &params, void *psi_phi_vect, int time,
int row, int col) {
int row, int col) {
// Bounds checking.
if ((row < 0) || (col < 0) || (row >= params.height) || (col >= params.width)) {
return {NO_DATA, NO_DATA};
Expand Down Expand Up @@ -124,11 +141,12 @@ extern "C" __device__ __host__ void SigmaGFilteredIndicesCU(float *values, int n

/*
* Evaluate the likelihood score (as computed with from the psi and phi values) for a single
* given candidate trajectory. Modifies the trajectory in place to update the number of
* given candidate trajectory. Modifies the trajectory in place to update the number of
* observations, likelihood, and flux.
*/
extern "C" __device__ __host__ void evaluateTrajectory(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_vect, float *image_times,
SearchParameters params, Trajectory *candidate) {
extern "C" __device__ __host__ void evaluateTrajectory(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_vect,
float *image_times, SearchParameters params,
Trajectory *candidate) {
// Data structures used for filtering. We fill in only what we need.
float psi_array[MAX_NUM_IMAGES];
float phi_array[MAX_NUM_IMAGES];
Expand Down Expand Up @@ -274,12 +292,10 @@ __global__ void searchFilterImages(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_v
}
}

extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, float *image_times, SearchParameters params,
int num_trajectories, Trajectory *trj_to_search, int num_results,
Trajectory *best_results) {
extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, SearchParameters params, int num_trajectories,
Trajectory *trj_to_search, int num_results, Trajectory *best_results) {
// Allocate Device memory
Trajectory *device_tests;
float *device_img_times;
Trajectory *device_search_results;

// Check the hard coded maximum number of images against the num_images.
Expand All @@ -288,31 +304,27 @@ extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, float *image_time
throw std::runtime_error("Number of images exceeds GPU maximum.");
}

// Check that the device psi_phi vector has been allocated.
// Check that the device vectors have already been allocated.
if (psi_phi_array.gpu_array_allocated() == false) {
throw std::runtime_error("PsiPhi data has not been created.");
}
if (psi_phi_array.gpu_time_array_allocated() == false) {
throw std::runtime_error("GPU time data has not been created.");
}

// Copy trajectories to search
if (params.debug) {
printf("Allocating GPU memory for testing grid with %i elements using %lu bytes.\n", num_trajectories,
sizeof(Trajectory) * num_trajectories);
sizeof(Trajectory) * num_trajectories);
}
checkCudaErrors(cudaMalloc((void **)&device_tests, sizeof(Trajectory) * num_trajectories));
checkCudaErrors(cudaMemcpy(device_tests, trj_to_search, sizeof(Trajectory) * num_trajectories,
cudaMemcpyHostToDevice));

// Copy the time vector.
if (params.debug) {
printf("Allocating GPU memory for time data using %lu bytes.\n", sizeof(float) * num_images);
}
checkCudaErrors(cudaMalloc((void **)&device_img_times, sizeof(float) * num_images));
checkCudaErrors(
cudaMemcpy(device_img_times, image_times, sizeof(float) * num_images, cudaMemcpyHostToDevice));

// Allocate space for the results.
if (params.debug) {
printf("Allocating GPU memory for %i results using %lu bytes.\n", num_results, sizeof(Trajectory) * num_results);
printf("Allocating GPU memory for %i results using %lu bytes.\n", num_results,
sizeof(Trajectory) * num_results);
}
checkCudaErrors(cudaMalloc((void **)&device_search_results, sizeof(Trajectory) * num_results));

Expand All @@ -326,17 +338,16 @@ extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, float *image_time

// Launch Search
searchFilterImages<<<blocks, threads>>>(psi_phi_array.get_meta_data(), psi_phi_array.get_gpu_array_ptr(),
device_img_times, params, num_trajectories, device_tests,
device_search_results);
static_cast<float *>(psi_phi_array.get_gpu_time_array_ptr()),
params, num_trajectories, device_tests, device_search_results);
cudaDeviceSynchronize();

// Read back results
checkCudaErrors(cudaMemcpy(best_results, device_search_results, sizeof(Trajectory) * num_results,
cudaMemcpyDeviceToHost));

// Free the on GPU memory.
// Free the on GPU memory for this specific search.
checkCudaErrors(cudaFree(device_search_results));
checkCudaErrors(cudaFree(device_img_times));
checkCudaErrors(cudaFree(device_tests));
}

Expand Down
8 changes: 4 additions & 4 deletions src/kbmod/search/layered_image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ LayeredImage::LayeredImage(const RawImage& sci, const RawImage& var, const RawIm
variance = var;
}

LayeredImage::LayeredImage(unsigned w, unsigned h, float noise_stdev, float pixel_variance,
double time, const PSF& psf)
LayeredImage::LayeredImage(unsigned w, unsigned h, float noise_stdev, float pixel_variance, double time,
const PSF& psf)
: LayeredImage(w, h, noise_stdev, pixel_variance, time, psf, -1) {}

LayeredImage::LayeredImage(unsigned w, unsigned h, float noise_stdev, float pixel_variance,
double time, const PSF& psf, int seed)
LayeredImage::LayeredImage(unsigned w, unsigned h, float noise_stdev, float pixel_variance, double time,
const PSF& psf, int seed)
: psf(psf), width(w), height(h) {
std::random_device r;
std::default_random_engine generator(r());
Expand Down
90 changes: 73 additions & 17 deletions src/kbmod/search/psi_phi_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ namespace search {

// Declaration of CUDA functions that will be linked in.
#ifdef HAVE_CUDA
extern "C" void device_allocate_psi_phi_array(PsiPhiArray* data);
extern "C" void device_allocate_psi_phi_arrays(PsiPhiArray* data);

extern "C" void device_free_psi_phi_array(PsiPhiArray* data);
extern "C" void device_free_psi_phi_arrays(PsiPhiArray* data);
#endif

// -------------------------------------------------------
Expand All @@ -17,27 +17,23 @@ extern "C" void device_free_psi_phi_array(PsiPhiArray* data);

PsiPhiArray::PsiPhiArray() {}

PsiPhiArray::~PsiPhiArray() {
if (cpu_array_ptr != nullptr) {
free(cpu_array_ptr);
}
#ifdef HAVE_CUDA
if (gpu_array_ptr != nullptr) {
device_free_psi_phi_array(this);
}
#endif
}
PsiPhiArray::~PsiPhiArray() { clear(); }

void PsiPhiArray::clear() {
// Free all used memory on CPU and GPU.
if (cpu_array_ptr != nullptr) {
free(cpu_array_ptr);
cpu_array_ptr = nullptr;
}
if (cpu_time_array != nullptr) {
free(cpu_time_array);
cpu_time_array = nullptr;
}
#ifdef HAVE_CUDA
if (gpu_array_ptr != nullptr) {
device_free_psi_phi_array(this);
if ((gpu_array_ptr != nullptr) || (gpu_time_array != nullptr)) {
device_free_psi_phi_arrays(this);
gpu_array_ptr = nullptr;
gpu_time_array = nullptr;
}
#endif

Expand Down Expand Up @@ -140,6 +136,14 @@ PsiPhi PsiPhiArray::read_psi_phi(int time, int row, int col) {
return result;
}

float PsiPhiArray::read_time(int time_index) {
if (cpu_time_array == nullptr) throw std::runtime_error("Read from unallocated times array.");
if ((time_index < 0) || (time_index >= meta_data.num_times)) {
throw std::runtime_error("Out of bounds read for time step.");
}
return cpu_time_array[time_index];
}

// -------------------------------------------
// --- Implementation of utility functions ---
// -------------------------------------------
Expand Down Expand Up @@ -242,7 +246,8 @@ void set_float_cpu_psi_phi_array(PsiPhiArray& data, const std::vector<RawImage>&
}

void fill_psi_phi_array(PsiPhiArray& result_data, int num_bytes, const std::vector<RawImage>& psi_imgs,
const std::vector<RawImage>& phi_imgs, bool debug) {
const std::vector<RawImage>& phi_imgs, const std::vector<float> zeroed_times,
bool debug) {
if (result_data.get_cpu_array_ptr() != nullptr) {
return;
}
Expand All @@ -251,6 +256,8 @@ void fill_psi_phi_array(PsiPhiArray& result_data, int num_bytes, const std::vect
int num_times = psi_imgs.size();
if (num_times <= 0) throw std::runtime_error("Trying to fill PsiPhi from empty vectors.");
if (num_times != phi_imgs.size()) throw std::runtime_error("Size mismatch between psi and phi.");
if (num_times != zeroed_times.size())
throw std::runtime_error("Size mismatch between psi and zeroed times.");

int width = phi_imgs[0].get_width();
int height = phi_imgs[0].get_height();
Expand Down Expand Up @@ -287,19 +294,61 @@ void fill_psi_phi_array(PsiPhiArray& result_data, int num_bytes, const std::vect
set_float_cpu_psi_phi_array(result_data, psi_imgs, phi_imgs, debug);
}

// Copy the time array.
const long unsigned times_bytes = result_data.get_num_times() * sizeof(float);
if (debug) printf("Allocating %lu bytes on the CPU for times.\n", times_bytes);

float* times_array = (float*)malloc(times_bytes);
if (times_array == nullptr) throw std::runtime_error("Unable to allocate space for CPU times.");
for (int i = 0; i < result_data.get_num_times(); ++i) {
times_array[i] = zeroed_times[i];
}
result_data.set_cpu_time_array_ptr(times_array);

#ifdef HAVE_CUDA
// Create a copy of the encoded data in GPU memory.
if (debug) {
printf("Allocating GPU memory for PsiPhi array using %lu bytes.\n",
result_data.get_total_array_size());
printf("Allocating GPU memory for times array using %lu bytes.\n", times_bytes);
}
device_allocate_psi_phi_array(&result_data);

device_allocate_psi_phi_arrays(&result_data);
if (result_data.get_gpu_array_ptr() == nullptr) {
throw std::runtime_error("Unable to allocate GPU PsiPhi array.");
}
if (result_data.get_gpu_time_array_ptr() == nullptr) {
throw std::runtime_error("Unable to allocate GPU time array.");
}
#endif
}

void fill_psi_phi_array_from_image_stack(PsiPhiArray& result_data, ImageStack& stack, int num_bytes,
bool debug) {
// Compute Phi and Psi from convolved images while leaving masked pixels alone
// Reinsert 0s for NO_DATA?
std::vector<RawImage> psi_images;
std::vector<RawImage> phi_images;
const int num_images = stack.img_count();
if (debug) {
unsigned long num_bytes = 2 * stack.get_height() * stack.get_width() * num_images * sizeof(float);
printf("Building %i temporary %i by %i images (psi and phi), requiring %lu bytes", (num_images * 2),
stack.get_width(), stack.get_height(), num_bytes);
}

// Build the psi and phi images first.
for (int i = 0; i < num_images; ++i) {
LayeredImage& img = stack.get_single_image(i);
psi_images.push_back(img.generate_psi_image());
phi_images.push_back(img.generate_phi_image());
}

// Convert these into an array form. Needs the full psi and phi computed first so the
// encoding can compute the bounds of each array.
std::vector<float> zeroed_times = stack.build_zeroed_times();
fill_psi_phi_array(result_data, num_bytes, psi_images, phi_images, zeroed_times, debug);
}

// -------------------------------------------
// --- Python definitions --------------------
// -------------------------------------------
Expand Down Expand Up @@ -340,13 +389,20 @@ static void psi_phi_array_binding(py::module& m) {
pydocs::DOC_PsiPhiArray_get_cpu_array_allocated)
.def_property_readonly("gpu_array_allocated", &ppa::gpu_array_allocated,
pydocs::DOC_PsiPhiArray_get_gpu_array_allocated)
.def_property_readonly("cpu_time_array_allocated", &ppa::cpu_time_array_allocated,
pydocs::DOC_PsiPhiArray_get_cpu_time_array_allocated)
.def_property_readonly("gpu_time_array_allocated", &ppa::gpu_time_array_allocated,
pydocs::DOC_PsiPhiArray_get_gpu_time_array_allocated)
.def("set_meta_data", &ppa::set_meta_data, pydocs::DOC_PsiPhiArray_set_meta_data)
.def("clear", &ppa::clear, pydocs::DOC_PsiPhiArray_clear)
.def("read_psi_phi", &ppa::read_psi_phi, pydocs::DOC_PsiPhiArray_read_psi_phi);
.def("read_psi_phi", &ppa::read_psi_phi, pydocs::DOC_PsiPhiArray_read_psi_phi)
.def("read_time", &ppa::read_time, pydocs::DOC_PsiPhiArray_read_time);
m.def("compute_scale_params_from_image_vect", &search::compute_scale_params_from_image_vect);
m.def("decode_uint_scalar", &search::decode_uint_scalar);
m.def("encode_uint_scalar", &search::encode_uint_scalar);
m.def("fill_psi_phi_array", &search::fill_psi_phi_array, pydocs::DOC_PsiPhiArray_fill_psi_phi_array);
m.def("fill_psi_phi_array_from_image_stack", &search::fill_psi_phi_array_from_image_stack,
pydocs::DOC_PsiPhiArray_fill_psi_phi_array_from_image_stack);
}
#endif

Expand Down
28 changes: 20 additions & 8 deletions src/kbmod/search/psi_phi_array_ds.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
/*
* psi_phi_array_ds.h
*
* The data structure for the interleaved psi/phi array. The the data
* The data structure for the raw data needed for the search algorith,
* including the psi/phi values and the zeroed times. The the data
* structure and core functions are included in the header (and separated out
* from the rest of the utility functions) to allow the CUDA files to import
* only what they need.
*
* The data structure allocates memory on both the CPU and GPU for the
* interleaved psi/phi array and maintains ownership of the pointers
* until clear() is called or the PsiPhiArray's destructor is called. This allows
* the object to be passed repeatedly to the on-device search without reallocating
* and copying the memory on the GPU.
* arrays and maintains ownership of the pointers until clear() is called
* the object's destructor is called. This allows the object to be passed
* repeatedly to the on-device search without reallocating and copying the
* memory on the GPU. All arrays are stored as pointers (instead of vectors)
* for compatibility with CUDA.
*
* Created on: Dec 5, 2023
*/
Expand Down Expand Up @@ -95,9 +97,12 @@ class PsiPhiArray {

inline bool cpu_array_allocated() { return cpu_array_ptr != nullptr; }
inline bool gpu_array_allocated() { return gpu_array_ptr != nullptr; }
inline bool cpu_time_array_allocated() { return cpu_time_array != nullptr; }
inline bool gpu_time_array_allocated() { return gpu_time_array != nullptr; }

// Primary getter function for interaction (read the data).
PsiPhi read_psi_phi(int time, int row, int col);
// Primary getter functions for interaction (read the data).
PsiPhi read_psi_phi(int time_index, int row, int col);
float read_time(int time_index);

// Setters for the utility functions to allocate the data.
void set_meta_data(int new_num_bytes, int new_num_times, int new_height, int new_width);
Expand All @@ -110,12 +115,19 @@ class PsiPhiArray {
inline void set_cpu_array_ptr(void* new_ptr) { cpu_array_ptr = new_ptr; }
inline void set_gpu_array_ptr(void* new_ptr) { gpu_array_ptr = new_ptr; }

inline float* get_cpu_time_array_ptr() { return cpu_time_array; }
inline float* get_gpu_time_array_ptr() { return gpu_time_array; }
inline void set_cpu_time_array_ptr(float* new_ptr) { cpu_time_array = new_ptr; }
inline void set_gpu_time_array_ptr(float* new_ptr) { gpu_time_array = new_ptr; }

private:
PsiPhiArrayMeta meta_data;

// Pointers the array (CPU space).
// Pointers to the arrays
void* cpu_array_ptr = nullptr;
void* gpu_array_ptr = nullptr;
float* cpu_time_array = nullptr;
float* gpu_time_array = nullptr;
};

} /* namespace search */
Expand Down
Loading
Loading