diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp index 9e7e9d5cc..27214204a 100644 --- a/src/grid/grid3D.cpp +++ b/src/grid/grid3D.cpp @@ -8,8 +8,9 @@ #endif #include "../global/global.h" #include "../grid/grid3D.h" -#include "../grid/grid_enum.h" // provides grid_enum -#include "../hydro/hydro_cuda.h" // provides Calc_dt_GPU +#include "../grid/grid_enum.h" // provides grid_enum +#include "../hydro/average_cells.h" // provides Average_Slow_Cells and SlowCellConditionChecker +#include "../hydro/hydro_cuda.h" // provides Calc_dt_GPU #include "../integrators/VL_1D_cuda.h" #include "../integrators/VL_2D_cuda.h" #include "../integrators/VL_3D_cuda.h" @@ -17,6 +18,7 @@ #include "../integrators/simple_2D_cuda.h" #include "../integrators/simple_3D_cuda.h" #include "../io/io.h" +#include "../utils/DeviceVector.h" #include "../utils/error_handling.h" #ifdef MPI_CHOLLA #include @@ -152,7 +154,9 @@ void Grid3D::Initialize(struct Parameters *P) #ifdef AVERAGE_SLOW_CELLS H.min_dt_slow = 1e-100; // Initialize the minumum dt to a tiny number -#endif // AVERAGE_SLOW_CELLS +#else + H.min_dt_slow = -1.0; +#endif // AVERAGE_SLOW_CELLS #ifndef MPI_CHOLLA @@ -422,6 +426,9 @@ void Grid3D::Execute_Hydro_Integrator(void) Timer.Hydro_Integrator.Start(); #endif // CPU_TIME + // this buffer holds 1 element that is initialized to 0 + cuda_utilities::DeviceVector error_code_buffer(1, true); + // Run the hydro integrator on the grid if (H.nx > 1 && H.ny == 1 && H.nz == 1) // 1D { @@ -446,18 +453,24 @@ void Grid3D::Execute_Hydro_Integrator(void) #ifdef VL VL_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy, H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor, - C.Grav_potential); + C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz), + error_code_buffer.data()); #endif // VL #ifdef SIMPLE Simple_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy, H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor, - C.Grav_potential); + C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz), + error_code_buffer.data()); #endif // SIMPLE } else { chprintf("Error: Grid dimensions nx: %d ny: %d nz: %d not supported.\n", H.nx, H.ny, H.nz); chexit(-1); } + if (error_code_buffer[0] != 0) { + CHOLLA_ERROR("An error occurred during the hydro calculation."); + } + #ifdef CPU_TIME Timer.Hydro_Integrator.End(true); #endif // CPU_TIME @@ -578,8 +591,9 @@ Real Grid3D::Update_Hydro_Grid() ny_off = ny_local_start; nz_off = nz_local_start; #endif - Average_Slow_Cells(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dx, H.dy, H.dz, gama, max_dti_slow, H.xbound, - H.ybound, H.zbound, nx_off, ny_off, nz_off); + Average_Slow_Cells(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, gama, + SlowCellConditionChecker(max_dti_slow, H.dx, H.dy, H.dz), H.xbound, H.ybound, H.zbound, nx_off, + ny_off, nz_off); #endif // AVERAGE_SLOW_CELLS // ==Calculate the next time step using Calc_dt_GPU from hydro/hydro_cuda.h== diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h index cb4c0dbbb..1b417afa8 100644 --- a/src/grid/grid3D.h +++ b/src/grid/grid3D.h @@ -210,9 +210,10 @@ struct Header { * \brief Length of the current timestep */ Real dt; -#ifdef AVERAGE_SLOW_CELLS + /*! \brief Cells that introduce timesteps shorter than will be averaged with + * neighboring cells. Should be a negative value when the + * AVERAGE_SLOW_CELLS macro isn't defined. */ Real min_dt_slow; -#endif /*! \var t_wall * \brief Wall time */ diff --git a/src/hydro/average_cells.h b/src/hydro/average_cells.h new file mode 100644 index 000000000..9f49f95f6 --- /dev/null +++ b/src/hydro/average_cells.h @@ -0,0 +1,56 @@ +/*! \file average_cells.h + * \brief Definitions of functions and classes that implement logic related to averaging cells with + * neighbors. */ + +#ifndef AVERAGE_CELLS_H +#define AVERAGE_CELLS_H + +#include + +#include "../global/global.h" + +/*! \brief Object that checks whether a given cell meets the conditions for slow-cell averaging. + * The main motivation for creating this class is reducing ifdef statements (and allow to modify the + * actual slow-cell-condition. */ +struct SlowCellConditionChecker { +// omit data-members if they aren't used for anything +#ifdef AVERAGE_SLOW_CELLS + Real max_dti_slow, dx, dy, dz; +#endif + + /*! \brief Construct a new object. */ + __host__ __device__ SlowCellConditionChecker(Real max_dti_slow, Real dx, Real dy, Real dz) +#ifdef AVERAGE_SLOW_CELLS + : max_dti_slow{max_dti_slow}, dx{dx}, dy{dy}, dz{dz} +#endif + { + } + + /*! \brief Returns whether the cell meets the condition for being considered a slow cell that must + * be averaged. */ + template + __device__ bool is_slow(Real total_energy, Real density, Real density_inv, Real velocity_x, Real velocity_y, + Real velocity_z, Real gamma) const + { + return this->max_dti_if_slow(total_energy, density, density_inv, velocity_x, velocity_y, velocity_z, gamma) > 0.0; + } + + /*! \brief Returns the max inverse timestep of the specified cell, if it meets the criteria for being + * a slow cell. If it doesn't, return a negative value instead. + */ + __device__ Real max_dti_if_slow(Real total_energy, Real density, Real density_inv, Real velocity_x, Real velocity_y, + Real velocity_z, Real gamma) const; +}; + +#ifdef AVERAGE_SLOW_CELLS + +void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real gamma, + SlowCellConditionChecker slow_check, Real xbound, Real ybound, Real zbound, int nx_offset, + int ny_offset, int nz_offset); + +__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, + Real gamma, SlowCellConditionChecker slow_check, Real xbound, Real ybound, + Real zbound, int nx_offset, int ny_offset, int nz_offset); +#endif + +#endif /* AVERAGE_CELLS_H */ \ No newline at end of file diff --git a/src/hydro/hydro_cuda.cu b/src/hydro/hydro_cuda.cu index 1b223708b..a15173608 100644 --- a/src/hydro/hydro_cuda.cu +++ b/src/hydro/hydro_cuda.cu @@ -10,6 +10,7 @@ #include "../global/global.h" #include "../global/global_cuda.h" #include "../gravity/static_grav.h" +#include "../hydro/average_cells.h" #include "../hydro/hydro_cuda.h" #include "../utils/DeviceVector.h" #include "../utils/cuda_utilities.h" @@ -377,14 +378,50 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R 0.5 * dt * (gx * (d * vx + d_n * vx_n) + gy * (d * vy + d_n * vy_n) + gz * (d * vz + d_n * vz_n)); #endif // GRAVITY + } +} + +/*! Returns whether a cell has crashed + * + * \note + * It probably won't come up, but it's unclear why we don't consider a density of 0 or + * energy of 0 to be crashed... (I'm just keeping the logic consistent with what it used to be) + */ +__device__ bool Cell_Is_Crashed(Real density, Real Etot_density) +{ + return (density < 0.0) || (density != density) || (Etot_density < 0.0) || (Etot_density != Etot_density); +} +__global__ void PostUpdate_Conserved_Correct_Crashed_3D(Real *dev_conserved, int nx, int ny, int nz, int x_off, + int y_off, int z_off, int n_ghost, Real gamma, int n_fields, + SlowCellConditionChecker slow_check, int *any_error) +{ + int n_cells = nx * ny * nz; + + // get a global thread ID + int id = threadIdx.x + blockIdx.x * blockDim.x; + int zid = id / (nx * ny); + int yid = (id - zid * nx * ny) / nx; + int xid = id - zid * nx * ny - yid * nx; + + if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 && + zid < nz - n_ghost) { + // threads corresponding to real cells do the calculation + + // this logic get's skipped if we apply both a density floor and temperature floor + // (we apply the logic if we only use one kind of floor or we use neither kind) #if !(defined(DENSITY_FLOOR) && defined(TEMPERATURE_FLOOR)) - if (dev_conserved[id] < 0.0 || dev_conserved[id] != dev_conserved[id] || dev_conserved[4 * n_cells + id] < 0.0 || - dev_conserved[4 * n_cells + id] != dev_conserved[4 * n_cells + id]) { - printf("%3d %3d %3d Thread crashed in final update. %e %e %e %e %e\n", xid + x_off, yid + y_off, zid + z_off, - dev_conserved[id], dtodx * (dev_F_x[imo] - dev_F_x[id]), dtody * (dev_F_y[jmo] - dev_F_y[id]), - dtodz * (dev_F_z[kmo] - dev_F_z[id]), dev_conserved[4 * n_cells + id]); - Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved); + if (Cell_Is_Crashed(dev_conserved[grid_enum::density * n_cells + id], + dev_conserved[grid_enum::Energy * n_cells + id])) { + printf("%3d %3d %3d Thread crashed in final update. %e - - - %e\n", xid + x_off, yid + y_off, zid + z_off, + dev_conserved[grid_enum::density * n_cells + id], dev_conserved[grid_enum::Energy * n_cells + id]); + bool success = Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved, + n_ghost, slow_check); + if (!success) { + printf("%3d %3d %3d there was an issue with averaging the neighboring cells\n", xid + x_off, yid + y_off, + zid + z_off); + *any_error = 1; + } } #endif // DENSITY_FLOOR /* @@ -400,7 +437,6 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R */ } } - __device__ __host__ Real hydroInverseCrossingTime(Real const &E, Real const &d, Real const &d_inv, Real const &vx, Real const &vy, Real const &vz, Real const &dx, Real const &dy, Real const &dz, Real const &gamma) @@ -667,10 +703,23 @@ void Temperature_Ceiling(Real *dev_conserved, int nx, int ny, int nz, int n_ghos } } +__device__ Real SlowCellConditionChecker::max_dti_if_slow(Real total_energy, Real density, Real density_inv, + Real velocity_x, Real velocity_y, Real velocity_z, + Real gamma) const +{ +#ifndef AVERAGE_SLOW_CELLS + return -1.0; +#else + Real max_dti = hydroInverseCrossingTime(total_energy, density, density_inv, velocity_x, velocity_y, velocity_z, + this->dx, this->dy, this->dz, gamma); + return (max_dti > this->max_dti_slow) ? max_dti : -1.0; +#endif +} + #ifdef AVERAGE_SLOW_CELLS -void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy, - Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, Real zbound, int nx_offset, +void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real gamma, + SlowCellConditionChecker slow_check, Real xbound, Real ybound, Real zbound, int nx_offset, int ny_offset, int nz_offset) { // set values for GPU kernels @@ -683,12 +732,12 @@ void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost if (nx > 1 && ny > 1 && nz > 1) { // 3D hipLaunchKernelGGL(Average_Slow_Cells_3D, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, - dx, dy, dz, gamma, max_dti_slow, xbound, ybound, zbound, nx_offset, ny_offset, nz_offset); + gamma, slow_check, xbound, ybound, zbound, nx_offset, ny_offset, nz_offset); } } -__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, - Real dy, Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, +__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, + Real gamma, SlowCellConditionChecker slow_check, Real xbound, Real ybound, Real zbound, int nx_offset, int ny_offset, int nz_offset) { int id, xid, yid, zid, n_cells; @@ -711,25 +760,26 @@ __global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int n vz = dev_conserved[3 * n_cells + id] * d_inv; E = dev_conserved[4 * n_cells + id]; - // Compute the maximum inverse crossing time in the cell - max_dti = hydroInverseCrossingTime(E, d, d_inv, vx, vy, vz, dx, dy, dz, gamma); + // retrieve the max inverse crossing time in the cell if the cell meets the threshold for being a slow-cell. + // (if the cell doesn't meet the threshold, a negative value is returned instead) + max_dti = slow_check.max_dti_if_slow(E, d, d_inv, vx, vy, vz, gamma); - if (max_dti > max_dti_slow) { + if (max_dti > 0) { speed = sqrt(vx * vx + vy * vy + vz * vz); temp = (gamma - 1) * (E - 0.5 * (speed * speed) * d) * ENERGY_UNIT / (d * DENSITY_UNIT / 0.6 / MP) / KB; P = (E - 0.5 * d * (vx * vx + vy * vy + vz * vz)) * (gamma - 1.0); cs = sqrt(d_inv * gamma * P) * VELOCITY_UNIT * 1e-5; - Real x = xbound + (nx_offset + xid - n_ghost + 0.5) * dx; - Real y = ybound + (ny_offset + yid - n_ghost + 0.5) * dy; - Real z = zbound + (nz_offset + zid - n_ghost + 0.5) * dz; + Real x = xbound + (nx_offset + xid - n_ghost + 0.5) * slow_check.dx; + Real y = ybound + (ny_offset + yid - n_ghost + 0.5) * slow_check.dy; + Real z = zbound + (nz_offset + zid - n_ghost + 0.5) * slow_check.dz; // Average this cell kernel_printf( " Average Slow Cell [ %.5e %.5e %.5e ] -> dt_cell=%f dt_min=%f, n=%.3e, " "T=%.3e, v=%.3e (%.3e, %.3e, %.3e), cs=%.3e\n", - x, y, z, 1. / max_dti, 1. / max_dti_slow, dev_conserved[id] * DENSITY_UNIT / 0.6 / MP, temp, + x, y, z, 1. / max_dti, 1. / slow_check.max_dti_slow, dev_conserved[id] * DENSITY_UNIT / 0.6 / MP, temp, speed * VELOCITY_UNIT * 1e-5, vx * VELOCITY_UNIT * 1e-5, vy * VELOCITY_UNIT * 1e-5, vz * VELOCITY_UNIT * 1e-5, cs); - Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved); + Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved, n_ghost, slow_check); } } } @@ -908,6 +958,20 @@ __global__ void Partial_Update_Advected_Internal_Energy_3D(Real *dev_conserved, } } +/*! The folliowing function is used to retrieve the total energy density if the cell isn't crashed. + * If the cell is returned, 0 is returned + * + * \note + * This is useful for implementing `Select_Internal_Energy_ND` + */ +__device__ Real E_If_Not_Crashed(Real *dev_conserved, int n_cells, int spatial_idx) +{ + Real d = dev_conserved[grid_enum::density * n_cells + spatial_idx]; + Real E = dev_conserved[grid_enum::Energy * n_cells + spatial_idx]; + // NOTE: don't try to get clever here (we explicitly want to return 0 in cases where E is NaN) + return Cell_Is_Crashed(d, E) ? 0 : E; +} + __global__ void Select_Internal_Energy_1D(Real *dev_conserved, int nx, int n_ghost, int n_fields) { int id, xid, n_cells; @@ -937,14 +1001,25 @@ __global__ void Select_Internal_Energy_1D(Real *dev_conserved, int nx, int n_gho U_advected = dev_conserved[(n_fields - 1) * n_cells + id]; U_total = E - 0.5 * d * (vx * vx + vy * vy + vz * vz); - // find the max nearby total energy - Emax = fmax(dev_conserved[4 * n_cells + imo], E); - Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]); + // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D + // simulation always uses floors for this purpose) + if (Cell_Is_Crashed(d, E)) return; + + // find the max nearby total energy (from the local cell and any uncrashed neighbors) + // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok + // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0, + // instead of the neighboring cell's E if the neighbor crashed + Emax = E; + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo)); - // We only use the "advected" internal energy if both: + // Ordinarily, we only use the "advected" internal energy if both: // - the thermal energy divided by total energy is a small fraction (smaller than eta_1) // - AND we aren't masking shock heating (details controlled by Emax & eta_2) - if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) { + // We ALSO explicitly use the "advected" internal energy if the total energy is positive but doesn't + // exceed kinetic energy (i.e. U_total <= 0). + bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2); + if (prefer_U_total and (U_total > 0)) { U = U_total; } else { U = U_advected; @@ -997,16 +1072,27 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i U_advected = dev_conserved[(n_fields - 1) * n_cells + id]; U_total = E - 0.5 * d * (vx * vx + vy * vy + vz * vz); - // find the max nearby total energy - Emax = fmax(dev_conserved[4 * n_cells + imo], E); - Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + jmo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + jpo]); - - // We only use the "advected" internal energy if both: + // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D + // simulation always uses floors for this purpose) + if (Cell_Is_Crashed(d, E)) return; + + // find the max nearby total energy (from the local cell and any uncrashed neighbors) + // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok + // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0, + // instead of the neighboring cell's E if the neighbor crashed + Emax = E; + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jmo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jpo)); + + // We only use the "advected" internal energy if the following 3 conditions are satisfied: // - the thermal energy divided by total energy is a small fraction (smaller than eta_1) // - AND we aren't masking shock heating (details controlled by Emax & eta_2) - if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) { + // We ALSO explicitly use the "advected" internal energy if the total energy is positive but doesn't + // exceed kinetic energy (i.e. U_total <= 0). + bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2); + if (prefer_U_total and (U_total > 0)) { U = U_total; } else { U = U_advected; @@ -1024,6 +1110,15 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields) { + // The scenario where E > 0 and E doesn't exceed the kinetic energy (i.e. U_total <= 0), + // comes up with some frequency when we include particle feedback. + // -> in that scenario, we explicitly use the value held by the U_advected field + // -> the separate `Sync_Energies_3D` kernel, we then override the E field with KE + U_advected + // -> one might argue we should actually override the E field with KE + U_advected before this kernel + // (we leave that for future consideration) + // -> regardless, it is **REALLY IMPORTANT** that the E field is **NOT** modified in this kernel + // (modifying it will produce race conditions!!!) + int id, xid, yid, zid, n_cells; Real d, d_inv, vx, vy, vz, E, U_total, U_advected, U, Emax; int imo, ipo, jmo, jpo, kmo, kpo; @@ -1064,18 +1159,30 @@ __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, i U_advected = dev_conserved[(n_fields - 1) * n_cells + id]; U_total = E - 0.5 * d * (vx * vx + vy * vy + vz * vz); - // find the max nearby total energy - Emax = fmax(dev_conserved[4 * n_cells + imo], E); - Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + jmo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + jpo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + kmo]); - Emax = fmax(Emax, dev_conserved[4 * n_cells + kpo]); - - // We only use the "advected" internal energy if both: + // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D + // simulation always uses floors for this purpose) + if (Cell_Is_Crashed(d, E)) return; + + // find the max nearby total energy (from the local cell and any uncrashed neighbors) + // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok + // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0, + // instead of the neighboring cell's E if the neighbor crashed + Emax = E; + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jmo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jpo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, kmo)); + Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, kpo)); + + // Ordinarily, we only use the "advected" internal energy if both: // - the thermal energy divided by total energy is a small fraction (smaller than eta_1) // - AND we aren't masking shock heating (details controlled by Emax & eta_2) - if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) { + // We also explicitly use the "advected" internal energy if the total energy is positive but doesn't + // exceed kinetic energy (i.e. U_total <= 0). This scenario comes up in simulations with particle-based + // feedback. + bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2); + if (prefer_U_total and (U_total > 0)) { U = U_total; } else { U = U_advected; @@ -1194,7 +1301,7 @@ __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, in Real U_floor) { int id, xid, yid, zid, n_cells; - Real d, d_inv, vx, vy, vz, E, Ekin, U; + Real d, d_inv, vx, vy, vz, E, Ekin, Udens; n_cells = nx * ny * nz; // get a global thread ID @@ -1214,17 +1321,28 @@ __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, in E = dev_conserved[4 * n_cells + id]; Ekin = 0.5 * d * (vx * vx + vy * vy + vz * vz); - U = (E - Ekin) / d; - if (U < U_floor) { + int num_applications = 0; + + Udens = (E - Ekin); + if (Udens / d < U_floor) { + num_applications++; dev_conserved[4 * n_cells + id] = Ekin + d * U_floor; } #ifdef DE - U = dev_conserved[(n_fields - 1) * n_cells + id] / d; - if (U < U_floor) { + Udens = dev_conserved[(n_fields - 1) * n_cells + id]; + if (Udens / d < U_floor) { + num_applications++; dev_conserved[(n_fields - 1) * n_cells + id] = d * U_floor; } +#else + Udens = -123456789; // set to a dumb-looking number so that it's clear that it's not real when printing it #endif + + if (num_applications > 0) { + printf("T_Floor %3d %3d %3d d: %e spec_Ekin:%e spec_eint_floor: %e BC: E_dens:%e GasEnergy_dens:%e\n", xid, yid, + zid, d, Ekin, U_floor, E, Udens); + } } } @@ -1252,22 +1370,37 @@ __device__ Real Average_Cell_Single_Field(int field_indx, int i, int j, int k, i return v_avrg; } -__device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields, - Real gamma, Real *conserved) +__device__ bool Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields, + Real gamma, Real *conserved, int stale_depth, + SlowCellConditionChecker slow_check) { int id = i + (j)*nx + (k)*nx * ny; - Real d, mx, my, mz, E, P; - d = conserved[grid_enum::density * ncells + id]; - mx = conserved[grid_enum::momentum_x * ncells + id]; - my = conserved[grid_enum::momentum_y * ncells + id]; - mz = conserved[grid_enum::momentum_z * ncells + id]; - E = conserved[grid_enum::Energy * ncells + id]; - P = (E - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0); + // print out the the values stored in the cell before the correction + // -> we put this logic (and variable declarations in its own scope) to force us to redeclare variables later + // (this lets the compiler help us avoid silly errors where we forget to load a variable) + { + Real d, mx, my, mz, E, P, Udens; + d = conserved[grid_enum::density * ncells + id]; + mx = conserved[grid_enum::momentum_x * ncells + id]; + my = conserved[grid_enum::momentum_y * ncells + id]; + mz = conserved[grid_enum::momentum_z * ncells + id]; + E = conserved[grid_enum::Energy * ncells + id]; + P = (E - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0); - printf("%3d %3d %3d BC: d: %e E:%e P:%e vx:%e vy:%e vz:%e\n", i, j, k, d, E, P, mx / d, my / d, mz / d); +#ifdef DE + Udens = conserved[grid_enum::GasEnergy * ncells + id]; + Real kinetic_energy = hydro_utilities::Calc_Kinetic_Energy_From_Momentum(d, mx, my, mz); + P = hydro_utilities::Get_Pressure_From_DE(E, E - kinetic_energy, Udens, gamma); +#else + Udens = -123456789; // set to a dumb-looking number so that it's clear that it's not real when printing it + P = hydro_utilities::Calc_Pressure_Conserved(E, d, mx, my, mz, gamma); +#endif + printf("%3d %3d %3d BC: d: %e E:%e P:%e vx:%e vy:%e vz:%e Uadv:%e\n", i, j, k, d, E, P, mx / d, my / d, + mz / d, Udens); + } - int idn; + // initialize the variables that we use to accumulate information about neighboring values int N = 0; Real d_av, vx_av, vy_av, vz_av, P_av; d_av = vx_av = vy_av = vz_av = P_av = 0.0; @@ -1281,18 +1414,36 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int for (int kk = k - 1; kk <= k + 1; kk++) { for (int jj = j - 1; jj <= j + 1; jj++) { for (int ii = i - 1; ii <= i + 1; ii++) { - idn = ii + jj * nx + kk * nx * ny; - d = conserved[grid_enum::density * ncells + idn]; - mx = conserved[grid_enum::momentum_x * ncells + idn]; - my = conserved[grid_enum::momentum_y * ncells + idn]; - mz = conserved[grid_enum::momentum_z * ncells + idn]; - P = (conserved[grid_enum::Energy * ncells + idn] - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0); + if (ii <= stale_depth - 1 || ii >= nx - stale_depth || jj <= stale_depth - 1 || jj >= ny - stale_depth || + kk <= stale_depth - 1 || kk >= nz - stale_depth) { + continue; + } + + int idn = ii + jj * nx + kk * nx * ny; + Real d = conserved[grid_enum::density * ncells + idn]; + Real mx = conserved[grid_enum::momentum_x * ncells + idn]; + Real my = conserved[grid_enum::momentum_y * ncells + idn]; + Real mz = conserved[grid_enum::momentum_z * ncells + idn]; + Real E = conserved[grid_enum::Energy * ncells + idn]; + + // this function CAN use the "advected internal energy" field to compute pressure when the dual energy + // formalism. This is because this function has an explicit pre-condition that this function can only + // be applied when the "advected internal energy" field and the "total energy" fields are properly reconciled +#ifdef DE + Real P = hydro_utilities::Get_Pressure_From_DE( + E, E - hydro_utilities::Calc_Kinetic_Energy_From_Momentum(d, mx, my, mz), + conserved[grid_enum::GasEnergy * ncells + id], gamma); +#else + Real P = hydro_utilities::Calc_Pressure_Conserved(E, d, mx, my, mz, gamma); +#endif + #ifdef SCALAR for (int n = 0; n < NSCALARS; n++) { // NOLINT scalar[n] = conserved[grid_enum::scalar * ncells + idn]; } #endif - if (d > 0.0 && P > 0.0) { + Real d_inv = 1.0 / d; + if (d > 0.0 && P > 0.0 && not slow_check.is_slow(E, d, d_inv, mx * d_inv, my * d_inv, mz * d_inv, gamma)) { d_av += d; vx_av += mx; vy_av += my; @@ -1304,11 +1455,21 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int } #endif N++; + } else if ((ii != i) || (jj != j) || (kk != k)) { +#ifdef DE + Real Udens = conserved[grid_enum::GasEnergy * ncells + idn]; +#else + Real Udens = + -123456789; // set to a dumb-looking number so that it's clear that it's not real when printing it +#endif + printf("%3d %3d %3d skipped-neighbor: d: %e E:%e P:%e vx:%e vy:%e vz:%e Uadv:%e\n", ii, jj, kk, d, E, P, + mx / d, my / d, mz / d, Udens); } } } } + // update the accumulator variables so that they now hold averages P_av = P_av / N; vx_av = vx_av / d_av; vy_av = vy_av / d_av; @@ -1318,17 +1479,18 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int scalar_av[n] = scalar_av[n] / d_av; } #endif - d_av = d_av / N; + d_av = d_av / N; + Real Udens_av = P_av / (gamma - 1.0); + Real E_av = Udens_av + 0.5 * d_av * math_utils::SquareMagnitude(vx_av, vy_av, vz_av); // replace cell values with new averaged values conserved[id + ncells * grid_enum::density] = d_av; conserved[id + ncells * grid_enum::momentum_x] = d_av * vx_av; conserved[id + ncells * grid_enum::momentum_y] = d_av * vy_av; conserved[id + ncells * grid_enum::momentum_z] = d_av * vz_av; - conserved[id + ncells * grid_enum::Energy] = - P_av / (gamma - 1.0) + 0.5 * d_av * (vx_av * vx_av + vy_av * vy_av + vz_av * vz_av); + conserved[id + ncells * grid_enum::Energy] = E_av; #ifdef DE - conserved[id + ncells * grid_enum::GasEnergy] = P_av / (gamma - 1.0); + conserved[id + ncells * grid_enum::GasEnergy] = Udens_av; #endif #ifdef SCALAR for (int n = 0; n < NSCALARS; n++) { // NOLINT @@ -1336,11 +1498,12 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int } #endif - d = d_av; - E = P_av / (gamma - 1.0) + 0.5 * d_av * (vx_av * vx_av + vy_av * vy_av + vz_av * vz_av); - P = P_av; + // print out the values now that they have been replaced + printf("%3d %3d %3d FC: d: %e E:%e P:%e vx:%e vy:%e vz:%e Udens:%e\n", i, j, k, d_av, E_av, P_av, vx_av, vy_av, + vz_av, Udens_av); - printf("%3d %3d %3d FC: d: %e E:%e P:%e vx:%e vy:%e vz:%e\n", i, j, k, d, E, P, vx_av, vy_av, vz_av); + // success requires that at least 2 cells contributed to the average + return N > 1; } void Apply_Scalar_Floor(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int field_num, Real scalar_floor) diff --git a/src/hydro/hydro_cuda.h b/src/hydro/hydro_cuda.h index 61b1073de..f8c48d95b 100644 --- a/src/hydro/hydro_cuda.h +++ b/src/hydro/hydro_cuda.h @@ -5,6 +5,7 @@ #define HYDRO_CUDA_H #include "../global/global.h" +#include "../hydro/average_cells.h" #include "../utils/mhd_utilities.h" __global__ void Update_Conserved_Variables_1D(Real *dev_conserved, Real *dev_F, int n_cells, int x_off, int n_ghost, @@ -21,6 +22,10 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R Real gamma, int n_fields, int custom_grav, Real density_floor, Real *dev_potential); +__global__ void PostUpdate_Conserved_Correct_Crashed_3D(Real *dev_conserved, int nx, int ny, int nz, int x_off, + int y_off, int z_off, int n_ghost, Real gamma, int n_fields, + SlowCellConditionChecker slow_check, int *any_error); + /*! * \brief Determine the maximum inverse crossing time in a specific cell * @@ -80,17 +85,6 @@ void Temperature_Ceiling(Real *dev_conserved, int nx, int ny, int nz, int n_ghos Real T_ceiling); #endif // TEMPERATURE CEILING -#ifdef AVERAGE_SLOW_CELLS - -void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy, - Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, Real zbound, int nx_offset, - int ny_offset, int nz_offset); - -__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, - Real dy, Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, - Real zbound, int nx_offset, int ny_offset, int nz_offset); -#endif - void Apply_Temperature_Floor(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real U_floor); __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, @@ -119,8 +113,49 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields); -__device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields, - Real gamma, Real *conserved); +/*! \brief Overwrites the values in the specified cell with the average of all the values from the (up to) 26 + * neighboring cells. + * + * Care is taken when applying this logic to a cell near the edge of a block (where the entire simulation domain + * is decomposed into 1 or more blocks). + * * Recall that the entire reason we have ghost zones is that the stencil for computing flux-divergence can't + * be applied uniformly to all cells -- the cells in the ghost zone can't be properly updated with the rest + * the local block when applying the flux-divergence. We might refer to these cells that aren't properly + * updated as being "stale". We refer to the width of the outer ring of stale values as the ``stale-depth`` + * * For concreteness, consider a pure hydro/mhd simulation using the VL integrator: + * - Right after refreshing the ghost-zones, the stale_depth is 0 + * - After the partial time-step, the stale_depth is 1. + * - After the full timestep, the stale depth depends on the choice of reconstruction. (e.g. it is 2 for + * for nearest neighbor and 3 for plmp). + * - The ghost-depth should always be equal to the max stale-depth at the end of a simulation cycle (if + * ghost-depth is bigger, extra work is done. If it's smaller, then your simulation is wrong) + * * To respect the simulations boundaries, values in "stale" cells are excluded from the averages. If + * stale-depth is 0, then values from beyond the edge of the simulation are excluded from averages + * + * \pre When using the dual-energy formalism, this function should **ONLY** be called in contexts where all + * "non-crashed" cells are known to have *reconciled* "advected internal energy" and "total energy" field values. + * This is important for averaging. + * - In practice, this limitation is mostly meaningful inside of a hydro solver + * - Outside of a hydro-solver, you can definitely come up with pathological cases where applying source terms, + * but you would need to go out of your way to do something to violate this precondition + * + * \returns A value of ``true`` indicates that the operation succeeded. A value of ``false`` indicates a failure. + * To succeed, this function requires that there are at least 2 neighboring cells with valid values that can be + * used to compute the average. + * + * \note + * From a perfectionist's perspective, one could argue that we really should increment the stale-depth whenever + * we call this function (in other words, we should have an extra layer of ghost zones for each time we call + * this function). + * * rationale: if we don't, then the the number of neighbors considered results of the simulation can vary + * based on how close a cell is to a block-edge (the number of cells varies from 7 to 26). + * * more pragmatically: this probably doesn't matter a whole lot given that this piece of machinery is a + * band-aid solution to begin with. + * * Aside: a similar argument could be made for the energy-synchronization step of the dual-energy formalism. + */ +__device__ bool Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields, + Real gamma, Real *conserved, int stale_depth, + SlowCellConditionChecker slow_check); __device__ Real Average_Cell_Single_Field(int field_indx, int i, int j, int k, int nx, int ny, int nz, int ncells, Real *conserved); diff --git a/src/integrators/VL_3D_cuda.cu b/src/integrators/VL_3D_cuda.cu index 0f9ccc013..074f6cdff 100644 --- a/src/integrators/VL_3D_cuda.cu +++ b/src/integrators/VL_3D_cuda.cu @@ -35,7 +35,8 @@ __global__ void Update_Conserved_Variables_3D_half(Real *dev_conserved, Real *de void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off, int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, - Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential) + Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential, + const SlowCellConditionChecker &slow_check, int *any_error) { // Here, *dev_conserved contains the entire // set of conserved variables on the grid @@ -392,6 +393,19 @@ void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int GPU_Error_Check(); #endif // DE + // Step 7: Address any crashed threads + // -> it's REALLY important that we do this **AFTER** the dual energy formalism update + // -> in more detail, galaxy simulations with particle-feedback, seem to commonly produce scenarios where a cell + // crashes and all of its neighbors have positive total energies that are smaller than the respective kinetic + // energies (this problem might be resolved if we use larger deposition stencils) + // -> by positioning this correction after the dual energy formalism, we can correct for this issue. + cuda_utilities::AutomaticLaunchParams static const post_update_correction_launch_params( + PostUpdate_Conserved_Correct_Crashed_3D, n_cells); + hipLaunchKernelGGL(PostUpdate_Conserved_Correct_Crashed_3D, post_update_correction_launch_params.get_numBlocks(), + post_update_correction_launch_params.get_threadsPerBlock(), 0, 0, dev_conserved, nx, ny, nz, x_off, + y_off, z_off, n_ghost, gama, n_fields, slow_check, any_error); + GPU_Error_Check(); + return; } diff --git a/src/integrators/VL_3D_cuda.h b/src/integrators/VL_3D_cuda.h index 4b80a4604..b7af66efb 100644 --- a/src/integrators/VL_3D_cuda.h +++ b/src/integrators/VL_3D_cuda.h @@ -5,10 +5,12 @@ #define VL_3D_CUDA_H #include "../global/global.h" +#include "../hydro/average_cells.h" void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off, int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, - Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential); + Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential, + const SlowCellConditionChecker &slow_check, int *any_error); void Free_Memory_VL_3D(); diff --git a/src/integrators/simple_3D_cuda.cu b/src/integrators/simple_3D_cuda.cu index 2e834e370..3e38e1dbf 100644 --- a/src/integrators/simple_3D_cuda.cu +++ b/src/integrators/simple_3D_cuda.cu @@ -24,7 +24,8 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off, int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, - Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential) + Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential, + const SlowCellConditionChecker &slow_check, int *any_error) { // Here, *dev_conserved contains the entire // set of conserved variables on the grid @@ -154,6 +155,9 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, zbound, dt, gama, n_fields, custom_grav, density_floor, dev_grav_potential); GPU_Error_Check(); + // BE AWARE: any "crashed cells" aren't handled yet! We explicitly defer handling of these cases until after + // we handle the Dual Energy Formalism + #ifdef DE hipLaunchKernelGGL(Select_Internal_Energy_3D, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields); @@ -161,6 +165,14 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, GPU_Error_Check(); #endif + // Step 4: Address any crashed threads + cuda_utilities::AutomaticLaunchParams static const post_update_correction_launch_params( + PostUpdate_Conserved_Correct_Crashed_3D, n_cells); + hipLaunchKernelGGL(PostUpdate_Conserved_Correct_Crashed_3D, post_update_correction_launch_params.get_numBlocks(), + post_update_correction_launch_params.get_threadsPerBlock(), 0, 0, dev_conserved, nx, ny, nz, x_off, + y_off, z_off, n_ghost, gama, n_fields, slow_check, any_error); + GPU_Error_Check(); + return; } diff --git a/src/integrators/simple_3D_cuda.h b/src/integrators/simple_3D_cuda.h index 847b93c61..6f262a1d7 100644 --- a/src/integrators/simple_3D_cuda.h +++ b/src/integrators/simple_3D_cuda.h @@ -9,7 +9,8 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off, int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, - Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential); + Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential, + const SlowCellConditionChecker &slow_check, int *any_error); void Free_Memory_Simple_3D(); diff --git a/src/utils/gpu.hpp b/src/utils/gpu.hpp index b4eac6607..196db01e4 100644 --- a/src/utils/gpu.hpp +++ b/src/utils/gpu.hpp @@ -134,20 +134,18 @@ static constexpr int maxWarpsPerBlock = 1024 / WARPSIZE; #define GPU_MAX_THREADS 256 /*! - * \brief Check for CUDA/HIP error codes. Can be called wrapping a GPU function that returns a value or with no - * arguments and it will get the latest error code. + * \brief formats and reports CUDA/HIP errors. This a helper function invoked by GPU_Error_Check * * \param[in] code The code to check. Defaults to the last error code - * \param[in] abort Whether or not to abort if an error is encountered. Defaults to True + * \param[in] abort Whether or not to abort if an error is encountered. * \param[in] location The location of the call. This should be left as the default value. */ -inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort = true, - std::experimental::source_location location = std::experimental::source_location::current()) +inline void Format_GPU_Error_Check_Err_(cudaError_t code, bool abort, std::experimental::source_location location) { -#ifndef DISABLE_GPU_ERROR_CHECKING - code = cudaDeviceSynchronize(); + // this function should ALWAYS be compiled, (regardless of whether DISABLE_GPU_ERROR_CHECKING is + // defined or not). We will instead conditionally compile calls to this of this function based on + // whether DISABLE_GPU_ERROR_CHECKING is defined. - // Check the code if (code != cudaSuccess) { std::cout << "GPU_Error_Check: Failed at " << "Line: " << location.line() << ", File: " << location.file_name() @@ -156,6 +154,50 @@ inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort chexit(code); } } +} + +/*! + * \brief Check for CUDA/HIP error codes. Can be called wrapping a GPU function that returns a value or with no + * arguments and it will get the latest error code. + * + * This function will always check any cuda-exit code explicitly passed into it. If no code is passed, the + * function assumes that it's being called immediately after a kernel was launched and performs a cheap + * query to make sure that there aren't currently any errors from the kernel launch (this will pick up on any + * errors if you haven't recently performed an error check) + * + * Unless the DISABLE_GPU_ERROR_CHECKING macro is defined, this function will then explicitly synchronize the + * device with the host in order to check if there were any errors during the kernel's evaluation. + * + * \param[in] code The code to check. Defaults to the last error code + * \param[in] abort Whether or not to abort if an error is encountered. Defaults to True + * \param[in] location The location of the call. This should be left as the default value. + * + * @note + * The fact that we include function calls as default arguments may appear irregular. If the caller doesn't override + * the default values, then the functions are guaranteed to be freshly called whenever we invoke ``GPU_Error_Check`` + * (details are provided on https://en.cppreference.com/w/cpp/language/default_arguments) + * + * @note + * As explained at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#error-checking + * if we want a call to this function right after a kernel launch to be able to detect errors that prevented the + * kernel launch it is VERY important that the function explicitly checks an error code retrieved with either + * ``cudaPeekAtLastError()`` or ``cudaGetLastError()``. If we don't do this, the function won't automatically + * detect launch errors. + */ +inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort = true, + std::experimental::source_location location = std::experimental::source_location::current()) +{ + // first, we check the argument passed into this function + // - earlier versions of this function did not perform this check. + // - as noted up in the docstring, it's extremely important to perform this check in addition to the next + // check because the next check won't identify cases where the kernel fails to launch + // - there's basically no reason to disable this check (it is extremely cheap) + if (code != cudaSuccess) Format_GPU_Error_Check_Err_(code, abort, location); + +#ifndef DISABLE_GPU_ERROR_CHECKING + // wait for any kernels being evaluated to finish and check if any errors arose in the process + code = cudaDeviceSynchronize(); + if (code != cudaSuccess) Format_GPU_Error_Check_Err_(code, abort, location); #endif // DISABLE_GPU_ERROR_CHECKING }