Break out core evaluation into its own function #446

Merged
1 commit merged on Feb 2, 2024
src/kbmod/search/kernels.cu: 243 changes (80 additions, 163 deletions)
@@ -41,7 +41,7 @@ extern "C" void device_free_psi_phi_array(PsiPhiArray *data) {
}
}

__forceinline__ __device__ PsiPhi read_encoded_psi_phi(PsiPhiArrayMeta &params, void *psi_phi_vect, int time,
__host__ __device__ PsiPhi read_encoded_psi_phi(PsiPhiArrayMeta &params, void *psi_phi_vect, int time,
int row, int col) {
// Bounds checking.
if ((row < 0) || (col < 0) || (row >= params.height) || (col >= params.width)) {
@@ -122,6 +122,81 @@ extern "C" __device__ __host__ void SigmaGFilteredIndicesCU(float *values, int n
*max_keep_idx = end - 1;
}

/*
* Evaluate the likelihood score (as computed from the psi and phi values) for a single
* given candidate trajectory. Modifies the trajectory in place to update the number of
* observations, likelihood, and flux.
*/
extern "C" __device__ __host__ void evaluateTrajectory(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_vect, float *image_times,
SearchParameters params, Trajectory *candidate) {
// Data structures used for filtering. We fill in only what we need.
float psi_array[MAX_NUM_IMAGES];
float phi_array[MAX_NUM_IMAGES];
float psi_sum = 0.0;
float phi_sum = 0.0;

// Reset the statistics for the candidate.
candidate->obs_count = 0;
candidate->lh = -1.0;
candidate->flux = -1.0;

// Loop over each image and sample the appropriate pixel
int num_seen = 0;
for (int i = 0; i < psi_phi_meta.num_times; ++i) {
// Predict the trajectory's position.
float curr_time = image_times[i];
int current_x = candidate->x + int(candidate->vx * curr_time + 0.5);
int current_y = candidate->y + int(candidate->vy * curr_time + 0.5);

// Get the Psi and Phi pixel values.
PsiPhi pixel_vals = read_encoded_psi_phi(psi_phi_meta, psi_phi_vect, i, current_y, current_x);
if (pixel_vals.psi != NO_DATA && pixel_vals.phi != NO_DATA) {
psi_sum += pixel_vals.psi;
phi_sum += pixel_vals.phi;
psi_array[num_seen] = pixel_vals.psi;
phi_array[num_seen] = pixel_vals.phi;
num_seen += 1;
}
}
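// Record how many valid (unmasked) observations the trajectory hit and compute the
// unfiltered statistics: lh = sum(psi) / sqrt(sum(phi)) and flux = sum(psi) / sum(phi).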
candidate->obs_count = num_seen;
candidate->lh = psi_sum / sqrt(phi_sum);
candidate->flux = psi_sum / phi_sum;

// If we do not have enough observations or a good enough LH score,
// do not bother with any of the following steps.
if ((candidate->obs_count < params.min_observations) ||
(params.do_sigmag_filter && candidate->lh < params.min_lh))
return;

// If we are doing on-GPU filtering, run the sigma_g filter and recompute the likelihoods.
if (params.do_sigmag_filter) {
// Fill in a likelihood and index array for sorting.
float lc_array[MAX_NUM_IMAGES];
int idx_array[MAX_NUM_IMAGES];
for (int i = 0; i < num_seen; ++i) {
lc_array[i] = (phi_array[i] != 0) ? (psi_array[i] / phi_array[i]) : 0;
idx_array[i] = i;
}

int min_keep_idx = 0;
int max_keep_idx = num_seen - 1;
SigmaGFilteredIndicesCU(lc_array, num_seen, params.sgl_L, params.sgl_H, params.sigmag_coeff, 2.0,
idx_array, &min_keep_idx, &max_keep_idx);
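// After filtering, idx_array[min_keep_idx .. max_keep_idx] indexes the observations
// that survive the sigma-g clipping of the per-observation likelihood values.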

// Compute the likelihood and flux of the track based on the filtered
// observations (ones in [min_keep_idx, max_keep_idx]).
float new_psi_sum = 0.0;
float new_phi_sum = 0.0;
for (int i = min_keep_idx; i <= max_keep_idx; i++) {
int idx = idx_array[i];
new_psi_sum += psi_array[idx];
new_phi_sum += phi_array[idx];
}
candidate->lh = new_psi_sum / sqrt(new_phi_sum);
candidate->flux = new_psi_sum / new_phi_sum;
}
}
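
Because evaluateTrajectory is declared both __device__ and __host__, the same scoring
code can in principle be called from host code as well as from the searchFilterImages
kernel below. A minimal host-side sketch, assuming the psi/phi data and image times are
also available in host memory; psi_phi_cpu_ptr and the example field values are
illustrative assumptions, not part of this change:

    // Sketch only: psi_phi_cpu_ptr stands in for a CPU-resident psi/phi buffer
    // laid out the way read_encoded_psi_phi expects.
    Trajectory candidate;
    candidate.x = 120;     // starting column (pixels)
    candidate.y = 85;      // starting row (pixels)
    candidate.vx = 10.0f;  // x velocity (pixels per unit of image_times)
    candidate.vy = -2.5f;  // y velocity (pixels per unit of image_times)
    evaluateTrajectory(psi_phi_array.get_meta_data(), psi_phi_cpu_ptr, image_times,
                       params, &candidate);
    // candidate.obs_count, candidate.lh, and candidate.flux now hold the
    // (optionally sigma-g filtered) statistics for this single candidate.

Factoring the per-trajectory work out of the kernel keeps searchFilterImages focused on
iterating over candidates and maintaining the sorted result list.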

/*
* Searches through images (represented as a flat array of floats) looking for most likely
* trajectories in the given list. Outputs a results image of best trajectories. Returns a
@@ -148,12 +223,6 @@ __global__ void searchFilterImages(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_v
const int x = x_i + params.x_start_min;
const int y = y_i + params.y_start_min;

// Data structures used for filtering.
float lc_array[MAX_NUM_IMAGES];
float psi_array[MAX_NUM_IMAGES];
float phi_array[MAX_NUM_IMAGES];
int idx_array[MAX_NUM_IMAGES];

// Create an initial set of best results with likelihood -1.0.
// We also set (x, y) because they are used in the later python
// functions.
Expand All @@ -174,67 +243,15 @@ __global__ void searchFilterImages(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_v
curr_trj.vy = trajectories[t].vy;
curr_trj.obs_count = 0;

float psi_sum = 0.0;
float phi_sum = 0.0;

// Loop over each image and sample the appropriate pixel
for (int i = 0; i < psi_phi_meta.num_times; ++i) {
lc_array[i] = 0;
psi_array[i] = 0;
phi_array[i] = 0;
idx_array[i] = i;
}

// Loop over each image and sample the appropriate pixel
int num_seen = 0;
for (int i = 0; i < psi_phi_meta.num_times; ++i) {
// Predict the trajectory's position.
float curr_time = image_times[i];
int current_x = x + int(curr_trj.vx * curr_time + 0.5);
int current_y = y + int(curr_trj.vy * curr_time + 0.5);

// Get the Psi and Phi pixel values.
PsiPhi pixel_vals = read_encoded_psi_phi(psi_phi_meta, psi_phi_vect, i, current_y, current_x);
if (pixel_vals.psi != NO_DATA && pixel_vals.phi != NO_DATA) {
curr_trj.obs_count++;
psi_sum += pixel_vals.psi;
phi_sum += pixel_vals.phi;
psi_array[num_seen] = pixel_vals.psi;
phi_array[num_seen] = pixel_vals.phi;
if (pixel_vals.phi != 0.0) lc_array[num_seen] = pixel_vals.psi / pixel_vals.phi;
num_seen += 1;
}
}
curr_trj.lh = psi_sum / sqrt(phi_sum);
curr_trj.flux = psi_sum / phi_sum;
// Evaluate the trajectory.
evaluateTrajectory(psi_phi_meta, psi_phi_vect, image_times, params, &curr_trj);

// If we do not have enough observations or a good enough LH score,
// do not bother with any of the following steps.
// do not bother inserting it into the sorted list of results.
if ((curr_trj.obs_count < params.min_observations) ||
(params.do_sigmag_filter && curr_trj.lh < params.min_lh))
continue;

// If we are doing on GPU filtering, run the sigma_g filter
// and recompute the likelihoods.
if (params.do_sigmag_filter) {
int min_keep_idx = 0;
int max_keep_idx = num_seen - 1;
SigmaGFilteredIndicesCU(lc_array, num_seen, params.sgl_L, params.sgl_H, params.sigmag_coeff, 2.0,
idx_array, &min_keep_idx, &max_keep_idx);

// Compute the likelihood and flux of the track based on the filtered
// observations (ones in [min_keep_idx, max_keep_idx]).
float new_psi_sum = 0.0;
float new_phi_sum = 0.0;
for (int i = min_keep_idx; i <= max_keep_idx; i++) {
int idx = idx_array[i];
new_psi_sum += psi_array[idx];
new_phi_sum += phi_array[idx];
}
curr_trj.lh = new_psi_sum / sqrt(new_phi_sum);
curr_trj.flux = new_psi_sum / new_phi_sum;
}

// Insert the new trajectory into the sorted list of results.
// Only sort the values with valid likelihoods.
Trajectory temp;
@@ -311,6 +328,7 @@ extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, float *image_time
searchFilterImages<<<blocks, threads>>>(psi_phi_array.get_meta_data(), psi_phi_array.get_gpu_array_ptr(),
device_img_times, params, num_trajectories, device_tests,
device_search_results);
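// Block until the search kernel has finished writing device_search_results
// before copying the results back to the host.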
cudaDeviceSynchronize();

// Read back results
checkCudaErrors(cudaMemcpy(best_results, device_search_results, sizeof(Trajectory) * num_results,
@@ -502,107 +520,6 @@ void deviceGetCoadds(const unsigned int num_images, const unsigned int width, co
checkCudaErrors(cudaFree(device_res));
}

/*
void deviceGetCoadds(ImageStack &stack, PerImageData image_data, int num_trajectories,
Trajectory *trajectories, StampParameters params,
std::vector<std::vector<bool>> &use_index_vect, float *results) {
// Allocate Device memory
Trajectory *device_trjs;
int *device_use_index = nullptr;
float *device_times;
float *device_img;
float *device_res;

// Compute the dimensions for the data.
const unsigned int num_images = stack.img_count();
const unsigned int width = stack.get_width();
const unsigned int height = stack.get_height();
const unsigned int num_image_pixels = num_images * width * height;
const unsigned int stamp_width = 2 * params.radius + 1;
const unsigned int stamp_ppi = (2 * params.radius + 1) * (2 * params.radius + 1);
const unsigned int num_stamp_pixels = num_trajectories * stamp_ppi;

// Allocate and copy the trajectories.
checkCudaErrors(cudaMalloc((void **)&device_trjs, sizeof(Trajectory) * num_trajectories));
checkCudaErrors(cudaMemcpy(device_trjs, trajectories, sizeof(Trajectory) * num_trajectories,
cudaMemcpyHostToDevice));

// Check if we need to create a vector of per-trajectory, per-image use.
// Convert the vector of booleans into an integer array so we do a cudaMemcpy.
if (use_index_vect.size() == num_trajectories) {
checkCudaErrors(cudaMalloc((void **)&device_use_index, sizeof(int) * num_images * num_trajectories));

int *start_ptr = device_use_index;
std::vector<int> int_vect(num_images, 0);
for (unsigned i = 0; i < num_trajectories; ++i) {
assert(use_index_vect[i].size() == num_images);
for (unsigned t = 0; t < num_images; ++t) {
int_vect[t] = use_index_vect[i][t] ? 1 : 0;
}

checkCudaErrors(
cudaMemcpy(start_ptr, int_vect.data(), sizeof(int) * num_images, cudaMemcpyHostToDevice));
start_ptr += num_images;
}
}

// Allocate and copy the times.
checkCudaErrors(cudaMalloc((void **)&device_times, sizeof(float) * num_images));
checkCudaErrors(cudaMemcpy(device_times, image_data.image_times, sizeof(float) * num_images,
cudaMemcpyHostToDevice));

// Allocate and copy the images.
checkCudaErrors(cudaMalloc((void **)&device_img, sizeof(float) * num_image_pixels));
float *next_ptr = device_img;
for (unsigned t = 0; t < num_images; ++t) {
// Used to be a vector of floats, now is an eigen::vector of floats or something
// but that's ok because all we use it for is the .data() -> float*
// I think this can also just directly go to .data because of RowMajor layout
auto& data_ref = stack.get_single_image(t).get_science().get_image();

assert(data_ref.size() == width * height);
checkCudaErrors(cudaMemcpy(next_ptr, data_ref.data(), sizeof(float) * width * height,
cudaMemcpyHostToDevice));
next_ptr += width * height;
}

// Allocate space for the results.
checkCudaErrors(cudaMalloc((void **)&device_res, sizeof(float) * num_stamp_pixels));

// Wrap the per-image data into a struct. This struct will be copied by value
// during the function call, so we don't need to allocate memory for the
// struct itself. We just set the pointers to the on device vectors.
PerImageData device_image_data;
device_image_data.num_images = num_images;
device_image_data.image_times = device_times;
device_image_data.psi_params = nullptr;
device_image_data.phi_params = nullptr;

dim3 blocks(num_trajectories, 1, 1);
dim3 threads(1, stamp_width, stamp_width);

// Create the stamps.
deviceGetCoaddStamp<<<blocks, threads>>>(num_images, width, height, device_img, device_image_data,
num_trajectories, device_trjs, params, device_use_index,
device_res);
cudaDeviceSynchronize();

// Free up the unneeded memory (everything except for the on-device results).
checkCudaErrors(cudaFree(device_img));
if (device_use_index != nullptr) checkCudaErrors(cudaFree(device_use_index));
checkCudaErrors(cudaFree(device_times));
checkCudaErrors(cudaFree(device_trjs));
cudaDeviceSynchronize();

// Read back results
checkCudaErrors(
cudaMemcpy(results, device_res, sizeof(float) * num_stamp_pixels, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

// Free the rest of the on GPU memory.
checkCudaErrors(cudaFree(device_res));
}
*/
} /* namespace search */

#endif /* KERNELS_CU_ */