From 16f58e4690edaad978208128472cdd9f78715ade Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Tue, 19 Sep 2023 11:31:27 -0400
Subject: [PATCH] Split up common.h

---
 src/kbmod/image_base/bindings.cpp             |  46 ----
 src/kbmod/image_base/common.h                 |  36 +++
 src/kbmod/image_base/image_kernels.cu         | 213 +----------------
 src/kbmod/search/KBMOSearch.h                 |   1 +
 src/kbmod/search/bindings.cpp                 |  46 ++++
 src/kbmod/search/kernels.cu                   | 217 +++++++++++++++++-
 .../common.h => search/search_structs.h}      |  31 +--
 7 files changed, 303 insertions(+), 287 deletions(-)
 create mode 100644 src/kbmod/image_base/common.h
 rename src/kbmod/{include/common.h => search/search_structs.h} (82%)
diff --git a/src/kbmod/image_base/bindings.cpp b/src/kbmod/image_base/bindings.cpp
index d30c06ef..c0e68723 100644
--- a/src/kbmod/image_base/bindings.cpp
+++ b/src/kbmod/image_base/bindings.cpp
@@ -20,11 +20,6 @@ using std::to_string;
 PYBIND11_MODULE(search, m) {
     m.attr("KB_NO_DATA") = pybind11::float_(NO_DATA);
     m.attr("HAS_GPU") = pybind11::bool_(HAVE_GPU);
-    py::enum_<StampType>(m, "StampType")
-            .value("STAMP_SUM", StampType::STAMP_SUM)
-            .value("STAMP_MEAN", StampType::STAMP_MEAN)
-            .value("STAMP_MEDIAN", StampType::STAMP_MEDIAN)
-            .export_values();
     py::class_<pf>(m, "psf", py::buffer_protocol(), R"pbdoc(
             Point Spread Function.
 
@@ -176,34 +171,6 @@ PYBIND11_MODULE(search, m) {
             .def("get_width", &is::getWidth)
             .def("get_height", &is::getHeight)
             .def("get_npixels", &is::getNPixels);
-    py::class_<trajectory>(m, "trajectory", R"pbdoc(
-            A trajectory structure holding basic information about potential results.
-            )pbdoc")
-            .def(py::init<>())
-            .def_readwrite("x_v", &trajectory::xVel)
-            .def_readwrite("y_v", &trajectory::yVel)
-            .def_readwrite("lh", &trajectory::lh)
-            .def_readwrite("flux", &trajectory::flux)
-            .def_readwrite("x", &trajectory::x)
-            .def_readwrite("y", &trajectory::y)
-            .def_readwrite("obs_count", &trajectory::obsCount)
-            .def("__repr__",
-                 [](const trajectory &t) {
-                     return "lh: " + to_string(t.lh) + " flux: " + to_string(t.flux) +
-                            " x: " + to_string(t.x) + " y: " + to_string(t.y) + " x_v: " + to_string(t.xVel) +
-                            " y_v: " + to_string(t.yVel) + " obs_count: " + to_string(t.obsCount);
-                 })
-            .def(py::pickle(
-                    [](const trajectory &p) {  // __getstate__
-                        return py::make_tuple(p.xVel, p.yVel, p.lh, p.flux, p.x, p.y, p.obsCount);
-                    },
-                    [](py::tuple t) {  // __setstate__
-                        if (t.size() != 7) throw std::runtime_error("Invalid state!");
-                        trajectory trj = {t[0].cast<float>(), t[1].cast<float>(), t[2].cast<float>(),
-                                          t[3].cast<float>(), t[4].cast<short>(), t[5].cast<short>(),
-                                          t[6].cast<short>()};
-                        return trj;
-                    }));
     py::class_<pixelPos>(m, "pixel_pos")
             .def(py::init<>())
             .def_readwrite("x", &pixelPos::x)
@@ -217,17 +184,4 @@ PYBIND11_MODULE(search, m) {
             .def_readwrite("m11", &imageMoments::m11)
             .def_readwrite("m02", &imageMoments::m02)
             .def_readwrite("m20", &imageMoments::m20);
-    py::class_<stampParameters>(m, "stamp_parameters")
-            .def(py::init<>())
-            .def_readwrite("radius", &stampParameters::radius)
-            .def_readwrite("stamp_type", &stampParameters::stamp_type)
-            .def_readwrite("do_filtering", &stampParameters::do_filtering)
-            .def_readwrite("center_thresh", &stampParameters::center_thresh)
-            .def_readwrite("peak_offset_x", &stampParameters::peak_offset_x)
-            .def_readwrite("peak_offset_y", &stampParameters::peak_offset_y)
-            .def_readwrite("m01", &stampParameters::m01_limit)
-            .def_readwrite("m10", &stampParameters::m10_limit)
-            .def_readwrite("m11", &stampParameters::m11_limit)
-            .def_readwrite("m02", &stampParameters::m02_limit)
-            .def_readwrite("m20", &stampParameters::m20_limit);
 }
diff --git a/src/kbmod/image_base/common.h b/src/kbmod/image_base/common.h
new file mode 100644
index 00000000..0d97729f
--- /dev/null
+++ b/src/kbmod/image_base/common.h
@@ -0,0 +1,36 @@
+/*
+ * common.h
+ *
+ *  Created on: Jun 20, 2017
+ *      Author: kbmod-usr
+ */
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+#ifdef HAVE_CUDA
+    constexpr bool HAVE_GPU = true;
+#else
+    constexpr bool HAVE_GPU = false;
+#endif
+
+constexpr float NO_DATA = -9999.0;
+constexpr unsigned int MAX_KERNEL_RADIUS = 15;
+
+// The position (in pixels) on an image.
+struct pixelPos {
+    float x;
+    float y;
+};
+
+// Basic image moments use for analysis.
+struct imageMoments {
+    float m00;
+    float m01;
+    float m10;
+    float m11;
+    float m02;
+    float m20;
+};
+
+#endif /* COMMON_H_ */
diff --git a/src/kbmod/image_base/image_kernels.cu b/src/kbmod/image_base/image_kernels.cu
index 4bac0417..b3845091 100644
--- a/src/kbmod/image_base/image_kernels.cu
+++ b/src/kbmod/image_base/image_kernels.cu
@@ -8,7 +8,7 @@
 #ifndef IMAGE_KERNELS_CU_
 #define IMAGE_KERNELS_CU_
 
-#define MAX_STAMP_IMAGES 200
+constexpr unsigned short CONV_THREAD_DIM = 32;
 
 #include <assert.h>
 #include "common.h"
@@ -16,8 +16,6 @@
 #include <stdio.h>
 #include <float.h>
 
-#include "ImageStack.h"
-
 namespace image_base {
 
 /*
@@ -88,215 +86,6 @@ extern "C" void deviceConvolve(float *sourceImg, float *resultImg, int width, in
     checkCudaErrors(cudaFree(deviceResultImg));
 }
 
-__global__ void device_get_coadd_stamp(int num_images, int width, int height, float *image_vect,
-                                       perImageData image_data, int num_trajectories,
-                                       trajectory *trajectories, stampParameters params, int *use_index_vect,
-                                       float *results) {
-    // Get the trajectory that we are going to be using.
-    const int trj_index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (trj_index < 0 || trj_index >= num_trajectories) return;
-    trajectory trj = trajectories[trj_index];
-
-    // Get the pixel coordinates within the stamp to use.
-    const int stamp_width = 2 * params.radius + 1;
-    const int stamp_x = threadIdx.y;
-    if (stamp_x < 0 || stamp_x >= stamp_width) return;
-
-    const int stamp_y = threadIdx.z;
-    if (stamp_y < 0 || stamp_y >= stamp_width) return;
-
-    // Compute the various offsets for the indices.
-    int use_index_offset = num_images * trj_index;
-    int trj_offset = trj_index * stamp_width * stamp_width;
-    int pixel_index = stamp_width * stamp_y + stamp_x;
-
-    // Allocate space for the coadd information.
-    assert(num_images < MAX_STAMP_IMAGES);
-    float values[MAX_STAMP_IMAGES];
-
-    // Loop over each image and compute the stamp.
-    int num_values = 0;
-    for (int t = 0; t < num_images; ++t) {
-        // Skip entries marked 0 in the use_index_vect.
-        if (use_index_vect != nullptr && use_index_vect[use_index_offset + t] == 0) {
-            continue;
-        }
-
-        // Predict the trajectory's position including the barycentric correction if needed.
-        float cTime = image_data.imageTimes[t];
-        int currentX = int(trj.x + trj.xVel * cTime);
-        int currentY = int(trj.y + trj.yVel * cTime);
-        if (image_data.baryCorrs != nullptr) {
-            baryCorrection bc = image_data.baryCorrs[t];
-            currentX = int(trj.x + trj.xVel * cTime + bc.dx + trj.x * bc.dxdx + trj.y * bc.dxdy);
-            currentY = int(trj.y + trj.yVel * cTime + bc.dy + trj.x * bc.dydx + trj.y * bc.dydy);
-        }
-
-        // Get the stamp and add it to the list of values.
-        int img_x = currentX - params.radius + stamp_x;
-        int img_y = currentY - params.radius + stamp_y;
-        if ((img_x >= 0) && (img_x < width) && (img_y >= 0) && (img_y < height)) {
-            int pixel_index = width * height * t + img_y * width + img_x;
-            if (image_vect[pixel_index] != NO_DATA) {
-                values[num_values] = image_vect[pixel_index];
-                ++num_values;
-            }
-        }
-    }
-
-    // If there are no values, just return 0.
-    if (num_values == 0) {
-        results[trj_offset + pixel_index] = 0.0;
-        return;
-    }
-
-    // Do the actual computation from the values.
-    float result = 0.0;
-    int median_ind = num_values / 2;  // Outside switch to avoid compiler warnings.
-    switch (params.stamp_type) {
-        case STAMP_MEDIAN:
-            // Sort the values in ascending order.
-            for (int j = 0; j < num_values; j++) {
-                for (int k = j + 1; k < num_values; k++) {
-                    if (values[j] > values[k]) {
-                        float tmp = values[j];
-                        values[j] = values[k];
-                        values[k] = tmp;
-                    }
-                }
-            }
-
-            // Take the median value of the pixels with data.
-            if (num_values % 2 == 0) {
-                result = (values[median_ind] + values[median_ind - 1]) / 2.0;
-            } else {
-                result = values[median_ind];
-            }
-            break;
-        case STAMP_SUM:
-            for (int t = 0; t < num_values; ++t) {
-                result += values[t];
-            }
-            break;
-        case STAMP_MEAN:
-            for (int t = 0; t < num_values; ++t) {
-                result += values[t];
-            }
-            result /= float(num_values);
-            break;
-    }
-
-    // Save the result to the correct pixel location.
-    results[trj_offset + pixel_index] = result;
-}
-
-void deviceGetCoadds(ImageStack &stack, perImageData image_data, int num_trajectories,
-                     trajectory *trajectories, stampParameters params,
-                     std::vector<std::vector<bool>> &use_index_vect, float *results) {
-    // Allocate Device memory
-    trajectory *device_trjs;
-    int *device_use_index = nullptr;
-    float *device_times;
-    float *device_img;
-    float *device_res;
-    baryCorrection *deviceBaryCorrs = nullptr;
-
-    // Compute the dimensions for the data.
-    const unsigned int num_images = stack.imgCount();
-    const unsigned int width = stack.getWidth();
-    const unsigned int height = stack.getHeight();
-    const unsigned int num_image_pixels = num_images * width * height;
-    const unsigned int stamp_width = 2 * params.radius + 1;
-    const unsigned int stamp_ppi = (2 * params.radius + 1) * (2 * params.radius + 1);
-    const unsigned int num_stamp_pixels = num_trajectories * stamp_ppi;
-
-    // Allocate and copy the trajectories.
-    checkCudaErrors(cudaMalloc((void **)&device_trjs, sizeof(trajectory) * num_trajectories));
-    checkCudaErrors(cudaMemcpy(device_trjs, trajectories, sizeof(trajectory) * num_trajectories,
-                               cudaMemcpyHostToDevice));
-
-    // Check if we need to create a vector of per-trajectory, per-image use.
-    // Convert the vector of booleans into an integer array so we do a cudaMemcpy.
-    if (use_index_vect.size() == num_trajectories) {
-        checkCudaErrors(cudaMalloc((void **)&device_use_index, sizeof(int) * num_images * num_trajectories));
-
-        int *start_ptr = device_use_index;
-        std::vector<int> int_vect(num_images, 0);
-        for (unsigned i = 0; i < num_trajectories; ++i) {
-            assert(use_index_vect[i].size() == num_images);
-            for (unsigned t = 0; t < num_images; ++t) {
-                int_vect[t] = use_index_vect[i][t] ? 1 : 0;
-            }
-
-            checkCudaErrors(
-                    cudaMemcpy(start_ptr, int_vect.data(), sizeof(int) * num_images, cudaMemcpyHostToDevice));
-            start_ptr += num_images;
-        }
-    }
-
-    // Allocate and copy the times.
-    checkCudaErrors(cudaMalloc((void **)&device_times, sizeof(float) * num_images));
-    checkCudaErrors(cudaMemcpy(device_times, image_data.imageTimes, sizeof(float) * num_images,
-                               cudaMemcpyHostToDevice));
-
-    // Allocate and copy the images.
-    checkCudaErrors(cudaMalloc((void **)&device_img, sizeof(float) * num_image_pixels));
-    float *next_ptr = device_img;
-    for (unsigned t = 0; t < num_images; ++t) {
-        const std::vector<float> &data_ref = stack.getSingleImage(t).getScience().getPixels();
-
-        assert(data_ref.size() == width * height);
-        checkCudaErrors(cudaMemcpy(next_ptr, data_ref.data(), sizeof(float) * width * height,
-                                   cudaMemcpyHostToDevice));
-        next_ptr += width * height;
-    }
-
-    // Allocate space for the results.
-    checkCudaErrors(cudaMalloc((void **)&device_res, sizeof(float) * num_stamp_pixels));
-
-    // Allocate memory for and copy barycentric corrections (if needed).
-    if (image_data.baryCorrs != nullptr) {
-        checkCudaErrors(cudaMalloc((void **)&deviceBaryCorrs, sizeof(baryCorrection) * num_images));
-        checkCudaErrors(cudaMemcpy(deviceBaryCorrs, image_data.baryCorrs, sizeof(baryCorrection) * num_images,
-                                   cudaMemcpyHostToDevice));
-    }
-
-    // Wrap the per-image data into a struct. This struct will be copied by value
-    // during the function call, so we don't need to allocate memory for the
-    // struct itself. We just set the pointers to the on device vectors.
-    perImageData device_image_data;
-    device_image_data.numImages = num_images;
-    device_image_data.imageTimes = device_times;
-    device_image_data.baryCorrs = deviceBaryCorrs;
-    device_image_data.psiParams = nullptr;
-    device_image_data.phiParams = nullptr;
-
-    dim3 blocks(num_trajectories, 1, 1);
-    dim3 threads(1, stamp_width, stamp_width);
-
-    // Create the stamps.
-    device_get_coadd_stamp<<<blocks, threads>>>(num_images, width, height, device_img, device_image_data,
-                                                num_trajectories, device_trjs, params, device_use_index,
-                                                device_res);
-    cudaDeviceSynchronize();
-
-    // Free up the unneeded memory (everything except for the on-device results).
-    if (deviceBaryCorrs != nullptr) checkCudaErrors(cudaFree(deviceBaryCorrs));
-    checkCudaErrors(cudaFree(device_img));
-    if (device_use_index != nullptr) checkCudaErrors(cudaFree(device_use_index));
-    checkCudaErrors(cudaFree(device_times));
-    checkCudaErrors(cudaFree(device_trjs));
-    cudaDeviceSynchronize();
-
-    // Read back results
-    checkCudaErrors(
-            cudaMemcpy(results, device_res, sizeof(float) * num_stamp_pixels, cudaMemcpyDeviceToHost));
-    cudaDeviceSynchronize();
-
-    // Free the rest of the  on GPU memory.
-    checkCudaErrors(cudaFree(device_res));
-}
-
 } /* namespace image_base */
 
 #endif /* IMAGE_KERNELS_CU_ */
diff --git a/src/kbmod/search/KBMOSearch.h b/src/kbmod/search/KBMOSearch.h
index 4e5e0564..ca6a43de 100644
--- a/src/kbmod/search/KBMOSearch.h
+++ b/src/kbmod/search/KBMOSearch.h
@@ -23,6 +23,7 @@
 #include "common.h"
 #include "ImageStack.h"
 #include "PointSpreadFunc.h"
+#include "search_structs.h"
 
 using image_base::ImageStack;
 using image_base::LayeredImage;
diff --git a/src/kbmod/search/bindings.cpp b/src/kbmod/search/bindings.cpp
index a3032936..3abc0a3f 100644
--- a/src/kbmod/search/bindings.cpp
+++ b/src/kbmod/search/bindings.cpp
@@ -15,6 +15,52 @@ using ks = search::KBMOSearch;
 using std::to_string;
 
 PYBIND11_MODULE(search, m) {
+    py::enum_<StampType>(m, "StampType")
+            .value("STAMP_SUM", StampType::STAMP_SUM)
+            .value("STAMP_MEAN", StampType::STAMP_MEAN)
+            .value("STAMP_MEDIAN", StampType::STAMP_MEDIAN)
+            .export_values();
+    py::class_<trajectory>(m, "trajectory", R"pbdoc(
+            A trajectory structure holding basic information about potential results.
+            )pbdoc")
+            .def(py::init<>())
+            .def_readwrite("x_v", &trajectory::xVel)
+            .def_readwrite("y_v", &trajectory::yVel)
+            .def_readwrite("lh", &trajectory::lh)
+            .def_readwrite("flux", &trajectory::flux)
+            .def_readwrite("x", &trajectory::x)
+            .def_readwrite("y", &trajectory::y)
+            .def_readwrite("obs_count", &trajectory::obsCount)
+            .def("__repr__",
+                 [](const trajectory &t) {
+                     return "lh: " + to_string(t.lh) + " flux: " + to_string(t.flux) +
+                            " x: " + to_string(t.x) + " y: " + to_string(t.y) + " x_v: " + to_string(t.xVel) +
+                            " y_v: " + to_string(t.yVel) + " obs_count: " + to_string(t.obsCount);
+                 })
+            .def(py::pickle(
+                    [](const trajectory &p) {  // __getstate__
+                        return py::make_tuple(p.xVel, p.yVel, p.lh, p.flux, p.x, p.y, p.obsCount);
+                    },
+                    [](py::tuple t) {  // __setstate__
+                        if (t.size() != 7) throw std::runtime_error("Invalid state!");
+                        trajectory trj = {t[0].cast<float>(), t[1].cast<float>(), t[2].cast<float>(),
+                                          t[3].cast<float>(), t[4].cast<short>(), t[5].cast<short>(),
+                                          t[6].cast<short>()};
+                        return trj;
+                    }));
+    py::class_<stampParameters>(m, "stamp_parameters")
+            .def(py::init<>())
+            .def_readwrite("radius", &stampParameters::radius)
+            .def_readwrite("stamp_type", &stampParameters::stamp_type)
+            .def_readwrite("do_filtering", &stampParameters::do_filtering)
+            .def_readwrite("center_thresh", &stampParameters::center_thresh)
+            .def_readwrite("peak_offset_x", &stampParameters::peak_offset_x)
+            .def_readwrite("peak_offset_y", &stampParameters::peak_offset_y)
+            .def_readwrite("m01", &stampParameters::m01_limit)
+            .def_readwrite("m10", &stampParameters::m10_limit)
+            .def_readwrite("m11", &stampParameters::m11_limit)
+            .def_readwrite("m02", &stampParameters::m02_limit)
+            .def_readwrite("m20", &stampParameters::m20_limit);
     py::class_<ks>(m, "stack_search")
             .def(py::init<is &>())
             .def("save_psi_phi", &ks::savePsiPhi)
diff --git a/src/kbmod/search/kernels.cu b/src/kbmod/search/kernels.cu
index b304fb39..43b024d9 100644
--- a/src/kbmod/search/kernels.cu
+++ b/src/kbmod/search/kernels.cu
@@ -9,14 +9,20 @@
 #define KERNELS_CU_
 #define GPU_LC_FILTER 1
 #define MAX_NUM_IMAGES 140
+#define MAX_STAMP_IMAGES 200
 
-#include "common.h"
 #include <cmath>
 #include "cuda_errors.h"
 #include <stdexcept>
 #include <stdio.h>
 #include <float.h>
 
+#include "common.h"
+#include "ImageStack.h"
+#include "search_structs.h"
+
+using image_base::ImageStack;
+
 namespace search {
 
 extern "C" __device__ __host__ void sigmaGFilteredIndicesCU(float *values, int num_values, float sGL0,
@@ -409,6 +415,215 @@ extern "C" void deviceSearchFilter(int imageCount, int width, int height, float
     checkCudaErrors(cudaFree(deviceTests));
 }
 
+__global__ void device_get_coadd_stamp(int num_images, int width, int height, float *image_vect,
+                                       perImageData image_data, int num_trajectories,
+                                       trajectory *trajectories, stampParameters params, int *use_index_vect,
+                                       float *results) {
+    // Get the trajectory that we are going to be using.
+    const int trj_index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (trj_index < 0 || trj_index >= num_trajectories) return;
+    trajectory trj = trajectories[trj_index];
+
+    // Get the pixel coordinates within the stamp to use.
+    const int stamp_width = 2 * params.radius + 1;
+    const int stamp_x = threadIdx.y;
+    if (stamp_x < 0 || stamp_x >= stamp_width) return;
+
+    const int stamp_y = threadIdx.z;
+    if (stamp_y < 0 || stamp_y >= stamp_width) return;
+
+    // Compute the various offsets for the indices.
+    int use_index_offset = num_images * trj_index;
+    int trj_offset = trj_index * stamp_width * stamp_width;
+    int pixel_index = stamp_width * stamp_y + stamp_x;
+
+    // Allocate space for the coadd information.
+    assert(num_images < MAX_STAMP_IMAGES);
+    float values[MAX_STAMP_IMAGES];
+
+    // Loop over each image and compute the stamp.
+    int num_values = 0;
+    for (int t = 0; t < num_images; ++t) {
+        // Skip entries marked 0 in the use_index_vect.
+        if (use_index_vect != nullptr && use_index_vect[use_index_offset + t] == 0) {
+            continue;
+        }
+
+        // Predict the trajectory's position including the barycentric correction if needed.
+        float cTime = image_data.imageTimes[t];
+        int currentX = int(trj.x + trj.xVel * cTime);
+        int currentY = int(trj.y + trj.yVel * cTime);
+        if (image_data.baryCorrs != nullptr) {
+            baryCorrection bc = image_data.baryCorrs[t];
+            currentX = int(trj.x + trj.xVel * cTime + bc.dx + trj.x * bc.dxdx + trj.y * bc.dxdy);
+            currentY = int(trj.y + trj.yVel * cTime + bc.dy + trj.x * bc.dydx + trj.y * bc.dydy);
+        }
+
+        // Get the stamp and add it to the list of values.
+        int img_x = currentX - params.radius + stamp_x;
+        int img_y = currentY - params.radius + stamp_y;
+        if ((img_x >= 0) && (img_x < width) && (img_y >= 0) && (img_y < height)) {
+            int pixel_index = width * height * t + img_y * width + img_x;
+            if (image_vect[pixel_index] != NO_DATA) {
+                values[num_values] = image_vect[pixel_index];
+                ++num_values;
+            }
+        }
+    }
+
+    // If there are no values, just return 0.
+    if (num_values == 0) {
+        results[trj_offset + pixel_index] = 0.0;
+        return;
+    }
+
+    // Do the actual computation from the values.
+    float result = 0.0;
+    int median_ind = num_values / 2;  // Outside switch to avoid compiler warnings.
+    switch (params.stamp_type) {
+        case STAMP_MEDIAN:
+            // Sort the values in ascending order.
+            for (int j = 0; j < num_values; j++) {
+                for (int k = j + 1; k < num_values; k++) {
+                    if (values[j] > values[k]) {
+                        float tmp = values[j];
+                        values[j] = values[k];
+                        values[k] = tmp;
+                    }
+                }
+            }
+
+            // Take the median value of the pixels with data.
+            if (num_values % 2 == 0) {
+                result = (values[median_ind] + values[median_ind - 1]) / 2.0;
+            } else {
+                result = values[median_ind];
+            }
+            break;
+        case STAMP_SUM:
+            for (int t = 0; t < num_values; ++t) {
+                result += values[t];
+            }
+            break;
+        case STAMP_MEAN:
+            for (int t = 0; t < num_values; ++t) {
+                result += values[t];
+            }
+            result /= float(num_values);
+            break;
+    }
+
+    // Save the result to the correct pixel location.
+    results[trj_offset + pixel_index] = result;
+}
+
+void deviceGetCoadds(ImageStack &stack, perImageData image_data, int num_trajectories,
+                     trajectory *trajectories, stampParameters params,
+                     std::vector<std::vector<bool>> &use_index_vect, float *results) {
+    // Allocate Device memory
+    trajectory *device_trjs;
+    int *device_use_index = nullptr;
+    float *device_times;
+    float *device_img;
+    float *device_res;
+    baryCorrection *deviceBaryCorrs = nullptr;
+
+    // Compute the dimensions for the data.
+    const unsigned int num_images = stack.imgCount();
+    const unsigned int width = stack.getWidth();
+    const unsigned int height = stack.getHeight();
+    const unsigned int num_image_pixels = num_images * width * height;
+    const unsigned int stamp_width = 2 * params.radius + 1;
+    const unsigned int stamp_ppi = (2 * params.radius + 1) * (2 * params.radius + 1);
+    const unsigned int num_stamp_pixels = num_trajectories * stamp_ppi;
+
+    // Allocate and copy the trajectories.
+    checkCudaErrors(cudaMalloc((void **)&device_trjs, sizeof(trajectory) * num_trajectories));
+    checkCudaErrors(cudaMemcpy(device_trjs, trajectories, sizeof(trajectory) * num_trajectories,
+                               cudaMemcpyHostToDevice));
+
+    // Check if we need to create a vector of per-trajectory, per-image use.
+    // Convert the vector of booleans into an integer array so we do a cudaMemcpy.
+    if (use_index_vect.size() == num_trajectories) {
+        checkCudaErrors(cudaMalloc((void **)&device_use_index, sizeof(int) * num_images * num_trajectories));
+
+        int *start_ptr = device_use_index;
+        std::vector<int> int_vect(num_images, 0);
+        for (unsigned i = 0; i < num_trajectories; ++i) {
+            assert(use_index_vect[i].size() == num_images);
+            for (unsigned t = 0; t < num_images; ++t) {
+                int_vect[t] = use_index_vect[i][t] ? 1 : 0;
+            }
+
+            checkCudaErrors(
+                    cudaMemcpy(start_ptr, int_vect.data(), sizeof(int) * num_images, cudaMemcpyHostToDevice));
+            start_ptr += num_images;
+        }
+    }
+
+    // Allocate and copy the times.
+    checkCudaErrors(cudaMalloc((void **)&device_times, sizeof(float) * num_images));
+    checkCudaErrors(cudaMemcpy(device_times, image_data.imageTimes, sizeof(float) * num_images,
+                               cudaMemcpyHostToDevice));
+
+    // Allocate and copy the images.
+    checkCudaErrors(cudaMalloc((void **)&device_img, sizeof(float) * num_image_pixels));
+    float *next_ptr = device_img;
+    for (unsigned t = 0; t < num_images; ++t) {
+        const std::vector<float> &data_ref = stack.getSingleImage(t).getScience().getPixels();
+
+        assert(data_ref.size() == width * height);
+        checkCudaErrors(cudaMemcpy(next_ptr, data_ref.data(), sizeof(float) * width * height,
+                                   cudaMemcpyHostToDevice));
+        next_ptr += width * height;
+    }
+
+    // Allocate space for the results.
+    checkCudaErrors(cudaMalloc((void **)&device_res, sizeof(float) * num_stamp_pixels));
+
+    // Allocate memory for and copy barycentric corrections (if needed).
+    if (image_data.baryCorrs != nullptr) {
+        checkCudaErrors(cudaMalloc((void **)&deviceBaryCorrs, sizeof(baryCorrection) * num_images));
+        checkCudaErrors(cudaMemcpy(deviceBaryCorrs, image_data.baryCorrs, sizeof(baryCorrection) * num_images,
+                                   cudaMemcpyHostToDevice));
+    }
+
+    // Wrap the per-image data into a struct. This struct will be copied by value
+    // during the function call, so we don't need to allocate memory for the
+    // struct itself. We just set the pointers to the on device vectors.
+    perImageData device_image_data;
+    device_image_data.numImages = num_images;
+    device_image_data.imageTimes = device_times;
+    device_image_data.baryCorrs = deviceBaryCorrs;
+    device_image_data.psiParams = nullptr;
+    device_image_data.phiParams = nullptr;
+
+    dim3 blocks(num_trajectories, 1, 1);
+    dim3 threads(1, stamp_width, stamp_width);
+
+    // Create the stamps.
+    device_get_coadd_stamp<<<blocks, threads>>>(num_images, width, height, device_img, device_image_data,
+                                                num_trajectories, device_trjs, params, device_use_index,
+                                                device_res);
+    cudaDeviceSynchronize();
+
+    // Free up the unneeded memory (everything except for the on-device results).
+    if (deviceBaryCorrs != nullptr) checkCudaErrors(cudaFree(deviceBaryCorrs));
+    checkCudaErrors(cudaFree(device_img));
+    if (device_use_index != nullptr) checkCudaErrors(cudaFree(device_use_index));
+    checkCudaErrors(cudaFree(device_times));
+    checkCudaErrors(cudaFree(device_trjs));
+    cudaDeviceSynchronize();
+
+    // Read back results
+    checkCudaErrors(
+            cudaMemcpy(results, device_res, sizeof(float) * num_stamp_pixels, cudaMemcpyDeviceToHost));
+    cudaDeviceSynchronize();
+
+    // Free the rest of the  on GPU memory.
+    checkCudaErrors(cudaFree(device_res));
+}
+
 } /* namespace search */
 
 #endif /* KERNELS_CU_ */
diff --git a/src/kbmod/include/common.h b/src/kbmod/search/search_structs.h
similarity index 82%
rename from src/kbmod/include/common.h
rename to src/kbmod/search/search_structs.h
index 6e9ece5c..3698a733 100644
--- a/src/kbmod/include/common.h
+++ b/src/kbmod/search/search_structs.h
@@ -5,22 +5,13 @@
  *      Author: kbmod-usr
  */
 
-#ifndef COMMON_H_
-#define COMMON_H_
+#ifndef SEARCH_STRUCTS_
+#define SEARCH_STRUCTS_
 
-#ifdef HAVE_CUDA
-    constexpr bool HAVE_GPU = true;
-#else
-    constexpr bool HAVE_GPU = false;
-#endif
-
-constexpr unsigned int MAX_KERNEL_RADIUS = 15;
 constexpr unsigned short MAX_STAMP_EDGE = 64;
-constexpr unsigned short CONV_THREAD_DIM = 32;
 constexpr unsigned short THREAD_DIM_X = 128;
 constexpr unsigned short THREAD_DIM_Y = 2;
 constexpr unsigned short RESULTS_PER_PIXEL = 8;
-constexpr float NO_DATA = -9999.0;
 
 enum StampType { STAMP_SUM = 0, STAMP_MEAN, STAMP_MEDIAN };
 
@@ -43,12 +34,6 @@ struct trajectory {
     short obsCount;
 };
 
-// The position (in pixels) of a trajectory.
-struct pixelPos {
-    float x;
-    float y;
-};
-
 /*
  * Linear approximation to the barycentric correction needed to transform a
  * pixel in the first image to a pixel in a consequent image. One struct needed
@@ -130,14 +115,4 @@ struct stampParameters {
     float m20_limit;
 };
 
-// Basic image moments use for analysis.
-struct imageMoments {
-    float m00;
-    float m01;
-    float m10;
-    float m11;
-    float m02;
-    float m20;
-};
-
-#endif /* COMMON_H_ */
+#endif /* SEARCH_STRUCTS_ */