diff --git a/src/kbmod/search/kernels.cu b/src/kbmod/search/kernels.cu index 84b6cf24..4a798155 100644 --- a/src/kbmod/search/kernels.cu +++ b/src/kbmod/search/kernels.cu @@ -41,7 +41,7 @@ extern "C" void device_free_psi_phi_array(PsiPhiArray *data) { } } -__forceinline__ __device__ PsiPhi read_encoded_psi_phi(PsiPhiArrayMeta ¶ms, void *psi_phi_vect, int time, +__host__ __device__ PsiPhi read_encoded_psi_phi(PsiPhiArrayMeta ¶ms, void *psi_phi_vect, int time, int row, int col) { // Bounds checking. if ((row < 0) || (col < 0) || (row >= params.height) || (col >= params.width)) { @@ -122,6 +122,81 @@ extern "C" __device__ __host__ void SigmaGFilteredIndicesCU(float *values, int n *max_keep_idx = end - 1; } +/* + * Evaluate the likelihood score (as computed with from the psi and phi values) for a single + * given candidate trajectory. Modifies the trajectory in place to update the number of + * observations, likelihood, and flux. + */ +extern "C" __device__ __host__ void evaluateTrajectory(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_vect, float *image_times, + SearchParameters params, Trajectory *candidate) { + // Data structures used for filtering. We fill in only what we need. + float psi_array[MAX_NUM_IMAGES]; + float phi_array[MAX_NUM_IMAGES]; + float psi_sum = 0.0; + float phi_sum = 0.0; + + // Reset the statistics for the candidate. + candidate->obs_count = 0; + candidate->lh = -1.0; + candidate->flux = -1.0; + + // Loop over each image and sample the appropriate pixel + int num_seen = 0; + for (int i = 0; i < psi_phi_meta.num_times; ++i) { + // Predict the trajectory's position. + float curr_time = image_times[i]; + int current_x = candidate->x + int(candidate->vx * curr_time + 0.5); + int current_y = candidate->y + int(candidate->vy * curr_time + 0.5); + + // Get the Psi and Phi pixel values. + PsiPhi pixel_vals = read_encoded_psi_phi(psi_phi_meta, psi_phi_vect, i, current_y, current_x); + if (pixel_vals.psi != NO_DATA && pixel_vals.phi != NO_DATA) { + psi_sum += pixel_vals.psi; + phi_sum += pixel_vals.phi; + psi_array[num_seen] = pixel_vals.psi; + phi_array[num_seen] = pixel_vals.phi; + num_seen += 1; + } + } + candidate->obs_count = num_seen; + candidate->lh = psi_sum / sqrt(phi_sum); + candidate->flux = psi_sum / phi_sum; + + // If we do not have enough observations or a good enough LH score, + // do not bother with any of the following steps. + if ((candidate->obs_count < params.min_observations) || + (params.do_sigmag_filter && candidate->lh < params.min_lh)) + return; + + // If we are doing on GPU filtering, run the sigma_g filter and recompute the likelihoods. + if (params.do_sigmag_filter) { + // Fill in a likelihood and index array for sorting. + float lc_array[MAX_NUM_IMAGES]; + int idx_array[MAX_NUM_IMAGES]; + for (int i = 0; i < num_seen; ++i) { + lc_array[i] = (phi_array[i] != 0) ? (psi_array[i] / phi_array[i]) : 0; + idx_array[i] = i; + } + + int min_keep_idx = 0; + int max_keep_idx = num_seen - 1; + SigmaGFilteredIndicesCU(lc_array, num_seen, params.sgl_L, params.sgl_H, params.sigmag_coeff, 2.0, + idx_array, &min_keep_idx, &max_keep_idx); + + // Compute the likelihood and flux of the track based on the filtered + // observations (ones in [min_keep_idx, max_keep_idx]). + float new_psi_sum = 0.0; + float new_phi_sum = 0.0; + for (int i = min_keep_idx; i <= max_keep_idx; i++) { + int idx = idx_array[i]; + new_psi_sum += psi_array[idx]; + new_phi_sum += phi_array[idx]; + } + candidate->lh = new_psi_sum / sqrt(new_phi_sum); + candidate->flux = new_psi_sum / new_phi_sum; + } +} + /* * Searches through images (represented as a flat array of floats) looking for most likely * trajectories in the given list. Outputs a results image of best trajectories. Returns a @@ -148,12 +223,6 @@ __global__ void searchFilterImages(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_v const int x = x_i + params.x_start_min; const int y = y_i + params.y_start_min; - // Data structures used for filtering. - float lc_array[MAX_NUM_IMAGES]; - float psi_array[MAX_NUM_IMAGES]; - float phi_array[MAX_NUM_IMAGES]; - int idx_array[MAX_NUM_IMAGES]; - // Create an initial set of best results with likelihood -1.0. // We also set (x, y) because they are used in the later python // functions. @@ -174,67 +243,15 @@ __global__ void searchFilterImages(PsiPhiArrayMeta psi_phi_meta, void *psi_phi_v curr_trj.vy = trajectories[t].vy; curr_trj.obs_count = 0; - float psi_sum = 0.0; - float phi_sum = 0.0; - - // Loop over each image and sample the appropriate pixel - for (int i = 0; i < psi_phi_meta.num_times; ++i) { - lc_array[i] = 0; - psi_array[i] = 0; - phi_array[i] = 0; - idx_array[i] = i; - } - - // Loop over each image and sample the appropriate pixel - int num_seen = 0; - for (int i = 0; i < psi_phi_meta.num_times; ++i) { - // Predict the trajectory's position. - float curr_time = image_times[i]; - int current_x = x + int(curr_trj.vx * curr_time + 0.5); - int current_y = y + int(curr_trj.vy * curr_time + 0.5); - - // Get the Psi and Phi pixel values. - PsiPhi pixel_vals = read_encoded_psi_phi(psi_phi_meta, psi_phi_vect, i, current_y, current_x); - if (pixel_vals.psi != NO_DATA && pixel_vals.phi != NO_DATA) { - curr_trj.obs_count++; - psi_sum += pixel_vals.psi; - phi_sum += pixel_vals.phi; - psi_array[num_seen] = pixel_vals.psi; - phi_array[num_seen] = pixel_vals.phi; - if (pixel_vals.phi != 0.0) lc_array[num_seen] = pixel_vals.psi / pixel_vals.phi; - num_seen += 1; - } - } - curr_trj.lh = psi_sum / sqrt(phi_sum); - curr_trj.flux = psi_sum / phi_sum; + // Evaluate the trajectory. + evaluateTrajectory(psi_phi_meta, psi_phi_vect, image_times, params, &curr_trj); // If we do not have enough observations or a good enough LH score, - // do not bother with any of the following steps. + // do not bother inserting it into the sorted list of results. if ((curr_trj.obs_count < params.min_observations) || (params.do_sigmag_filter && curr_trj.lh < params.min_lh)) continue; - // If we are doing on GPU filtering, run the sigma_g filter - // and recompute the likelihoods. - if (params.do_sigmag_filter) { - int min_keep_idx = 0; - int max_keep_idx = num_seen - 1; - SigmaGFilteredIndicesCU(lc_array, num_seen, params.sgl_L, params.sgl_H, params.sigmag_coeff, 2.0, - idx_array, &min_keep_idx, &max_keep_idx); - - // Compute the likelihood and flux of the track based on the filtered - // observations (ones in [min_keep_idx, max_keep_idx]). - float new_psi_sum = 0.0; - float new_phi_sum = 0.0; - for (int i = min_keep_idx; i <= max_keep_idx; i++) { - int idx = idx_array[i]; - new_psi_sum += psi_array[idx]; - new_phi_sum += phi_array[idx]; - } - curr_trj.lh = new_psi_sum / sqrt(new_phi_sum); - curr_trj.flux = new_psi_sum / new_phi_sum; - } - // Insert the new trajectory into the sorted list of results. // Only sort the values with valid likelihoods. Trajectory temp; @@ -311,6 +328,7 @@ extern "C" void deviceSearchFilter(PsiPhiArray &psi_phi_array, float *image_time searchFilterImages<<>>(psi_phi_array.get_meta_data(), psi_phi_array.get_gpu_array_ptr(), device_img_times, params, num_trajectories, device_tests, device_search_results); + cudaDeviceSynchronize(); // Read back results checkCudaErrors(cudaMemcpy(best_results, device_search_results, sizeof(Trajectory) * num_results, @@ -502,107 +520,6 @@ void deviceGetCoadds(const unsigned int num_images, const unsigned int width, co checkCudaErrors(cudaFree(device_res)); } -/* - void deviceGetCoadds(ImageStack &stack, PerImageData image_data, int num_trajectories, - Trajectory *trajectories, StampParameters params, - std::vector> &use_index_vect, float *results) { - // Allocate Device memory - Trajectory *device_trjs; - int *device_use_index = nullptr; - float *device_times; - float *device_img; - float *device_res; - - // Compute the dimensions for the data. - const unsigned int num_images = stack.img_count(); - const unsigned int width = stack.get_width(); - const unsigned int height = stack.get_height(); - const unsigned int num_image_pixels = num_images * width * height; - const unsigned int stamp_width = 2 * params.radius + 1; - const unsigned int stamp_ppi = (2 * params.radius + 1) * (2 * params.radius + 1); - const unsigned int num_stamp_pixels = num_trajectories * stamp_ppi; - - // Allocate and copy the trajectories. - checkCudaErrors(cudaMalloc((void **)&device_trjs, sizeof(Trajectory) * num_trajectories)); - checkCudaErrors(cudaMemcpy(device_trjs, trajectories, sizeof(Trajectory) * num_trajectories, - cudaMemcpyHostToDevice)); - - // Check if we need to create a vector of per-trajectory, per-image use. - // Convert the vector of booleans into an integer array so we do a cudaMemcpy. - if (use_index_vect.size() == num_trajectories) { - checkCudaErrors(cudaMalloc((void **)&device_use_index, sizeof(int) * num_images * num_trajectories)); - - int *start_ptr = device_use_index; - std::vector int_vect(num_images, 0); - for (unsigned i = 0; i < num_trajectories; ++i) { - assert(use_index_vect[i].size() == num_images); - for (unsigned t = 0; t < num_images; ++t) { - int_vect[t] = use_index_vect[i][t] ? 1 : 0; - } - - checkCudaErrors( - cudaMemcpy(start_ptr, int_vect.data(), sizeof(int) * num_images, cudaMemcpyHostToDevice)); - start_ptr += num_images; - } - } - - // Allocate and copy the times. - checkCudaErrors(cudaMalloc((void **)&device_times, sizeof(float) * num_images)); - checkCudaErrors(cudaMemcpy(device_times, image_data.image_times, sizeof(float) * num_images, - cudaMemcpyHostToDevice)); - - // Allocate and copy the images. - checkCudaErrors(cudaMalloc((void **)&device_img, sizeof(float) * num_image_pixels)); - float *next_ptr = device_img; - for (unsigned t = 0; t < num_images; ++t) { - // Used to be a vector of floats, now is an eigen::vector of floats or something - // but that's ok because all we use it for is the .data() -> float* - // I think this can also just directly go to .data because of RowMajor layout - auto& data_ref = stack.get_single_image(t).get_science().get_image(); - - assert(data_ref.size() == width * height); - checkCudaErrors(cudaMemcpy(next_ptr, data_ref.data(), sizeof(float) * width * height, - cudaMemcpyHostToDevice)); - next_ptr += width * height; - } - - // Allocate space for the results. - checkCudaErrors(cudaMalloc((void **)&device_res, sizeof(float) * num_stamp_pixels)); - - // Wrap the per-image data into a struct. This struct will be copied by value - // during the function call, so we don't need to allocate memory for the - // struct itself. We just set the pointers to the on device vectors. - PerImageData device_image_data; - device_image_data.num_images = num_images; - device_image_data.image_times = device_times; - device_image_data.psi_params = nullptr; - device_image_data.phi_params = nullptr; - - dim3 blocks(num_trajectories, 1, 1); - dim3 threads(1, stamp_width, stamp_width); - - // Create the stamps. - deviceGetCoaddStamp<<>>(num_images, width, height, device_img, device_image_data, - num_trajectories, device_trjs, params, device_use_index, - device_res); - cudaDeviceSynchronize(); - - // Free up the unneeded memory (everything except for the on-device results). - checkCudaErrors(cudaFree(device_img)); - if (device_use_index != nullptr) checkCudaErrors(cudaFree(device_use_index)); - checkCudaErrors(cudaFree(device_times)); - checkCudaErrors(cudaFree(device_trjs)); - cudaDeviceSynchronize(); - - // Read back results - checkCudaErrors( - cudaMemcpy(results, device_res, sizeof(float) * num_stamp_pixels, cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - - // Free the rest of the on GPU memory. - checkCudaErrors(cudaFree(device_res)); - } -*/ } /* namespace search */ #endif /* KERNELS_CU_ */