diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp
index 9e7e9d5cc..27214204a 100644
--- a/src/grid/grid3D.cpp
+++ b/src/grid/grid3D.cpp
@@ -8,8 +8,9 @@
 #endif
 #include "../global/global.h"
 #include "../grid/grid3D.h"
-#include "../grid/grid_enum.h"    // provides grid_enum
-#include "../hydro/hydro_cuda.h"  // provides Calc_dt_GPU
+#include "../grid/grid_enum.h"       // provides grid_enum
+#include "../hydro/average_cells.h"  // provides Average_Slow_Cells and SlowCellConditionChecker
+#include "../hydro/hydro_cuda.h"     // provides Calc_dt_GPU
 #include "../integrators/VL_1D_cuda.h"
 #include "../integrators/VL_2D_cuda.h"
 #include "../integrators/VL_3D_cuda.h"
@@ -17,6 +18,7 @@
 #include "../integrators/simple_2D_cuda.h"
 #include "../integrators/simple_3D_cuda.h"
 #include "../io/io.h"
+#include "../utils/DeviceVector.h"
 #include "../utils/error_handling.h"
 #ifdef MPI_CHOLLA
   #include <mpi.h>
@@ -152,7 +154,9 @@ void Grid3D::Initialize(struct Parameters *P)
 
 #ifdef AVERAGE_SLOW_CELLS
   H.min_dt_slow = 1e-100;  // Initialize the minumum dt to a tiny number
-#endif                     // AVERAGE_SLOW_CELLS
+#else
+  H.min_dt_slow = -1.0;
+#endif  // AVERAGE_SLOW_CELLS
 
 #ifndef MPI_CHOLLA
 
@@ -422,6 +426,9 @@ void Grid3D::Execute_Hydro_Integrator(void)
   Timer.Hydro_Integrator.Start();
 #endif  // CPU_TIME
 
+  // this buffer holds 1 element that is initialized to 0
+  cuda_utilities::DeviceVector<int> error_code_buffer(1, true);
+
   // Run the hydro integrator on the grid
   if (H.nx > 1 && H.ny == 1 && H.nz == 1)  // 1D
   {
@@ -446,18 +453,24 @@ void Grid3D::Execute_Hydro_Integrator(void)
 #ifdef VL
     VL_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy,
                          H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor,
-                         C.Grav_potential);
+                         C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz),
+                         error_code_buffer.data());
 #endif  // VL
 #ifdef SIMPLE
     Simple_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy,
                              H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor,
-                             C.Grav_potential);
+                             C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz),
+                             error_code_buffer.data());
 #endif  // SIMPLE
   } else {
     chprintf("Error: Grid dimensions nx: %d  ny: %d  nz: %d  not supported.\n", H.nx, H.ny, H.nz);
     chexit(-1);
   }
 
+  if (error_code_buffer[0] != 0) {
+    CHOLLA_ERROR("An error occurred during the hydro calculation.");
+  }
+
 #ifdef CPU_TIME
   Timer.Hydro_Integrator.End(true);
 #endif  // CPU_TIME
@@ -578,8 +591,9 @@ Real Grid3D::Update_Hydro_Grid()
   ny_off = ny_local_start;
   nz_off = nz_local_start;
   #endif
-  Average_Slow_Cells(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dx, H.dy, H.dz, gama, max_dti_slow, H.xbound,
-                     H.ybound, H.zbound, nx_off, ny_off, nz_off);
+  Average_Slow_Cells(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, gama,
+                     SlowCellConditionChecker(max_dti_slow, H.dx, H.dy, H.dz), H.xbound, H.ybound, H.zbound, nx_off,
+                     ny_off, nz_off);
 #endif  // AVERAGE_SLOW_CELLS
 
   // ==Calculate the next time step using Calc_dt_GPU from hydro/hydro_cuda.h==
diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h
index cb4c0dbbb..1b417afa8 100644
--- a/src/grid/grid3D.h
+++ b/src/grid/grid3D.h
@@ -210,9 +210,10 @@ struct Header {
    *  \brief Length of the current timestep */
   Real dt;
 
-#ifdef AVERAGE_SLOW_CELLS
+  /*! \brief Cells that introduce timesteps shorter than will be averaged with
+   *         neighboring cells. Should be a negative value when the
+   *         AVERAGE_SLOW_CELLS macro isn't defined. */
   Real min_dt_slow;
-#endif
 
   /*! \var t_wall
    *  \brief Wall time */
diff --git a/src/hydro/average_cells.h b/src/hydro/average_cells.h
new file mode 100644
index 000000000..9f49f95f6
--- /dev/null
+++ b/src/hydro/average_cells.h
@@ -0,0 +1,56 @@
+/*! \file average_cells.h
+ *  \brief Definitions of functions and classes that implement logic related to averaging cells with
+ *         neighbors. */
+
+#ifndef AVERAGE_CELLS_H
+#define AVERAGE_CELLS_H
+
+#include <math.h>
+
+#include "../global/global.h"
+
+/*! \brief Object that checks whether a given cell meets the conditions for slow-cell averaging.
+ *          The main motivation for creating this class is reducing ifdef statements (and allow to modify the
+ *          actual slow-cell-condition. */
+struct SlowCellConditionChecker {
+// omit data-members if they aren't used for anything
+#ifdef AVERAGE_SLOW_CELLS
+  Real max_dti_slow, dx, dy, dz;
+#endif
+
+  /*! \brief Construct a new object. */
+  __host__ __device__ SlowCellConditionChecker(Real max_dti_slow, Real dx, Real dy, Real dz)
+#ifdef AVERAGE_SLOW_CELLS
+      : max_dti_slow{max_dti_slow}, dx{dx}, dy{dy}, dz{dz}
+#endif
+  {
+  }
+
+  /*! \brief Returns whether the cell meets the condition for being considered a slow cell that must
+   *  be averaged. */
+  template <bool verbose = false>
+  __device__ bool is_slow(Real total_energy, Real density, Real density_inv, Real velocity_x, Real velocity_y,
+                          Real velocity_z, Real gamma) const
+  {
+    return this->max_dti_if_slow(total_energy, density, density_inv, velocity_x, velocity_y, velocity_z, gamma) > 0.0;
+  }
+
+  /*! \brief Returns the max inverse timestep of the specified cell, if it meets the criteria for being
+   *  a slow cell. If it doesn't, return a negative value instead.
+   */
+  __device__ Real max_dti_if_slow(Real total_energy, Real density, Real density_inv, Real velocity_x, Real velocity_y,
+                                  Real velocity_z, Real gamma) const;
+};
+
+#ifdef AVERAGE_SLOW_CELLS
+
+void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real gamma,
+                        SlowCellConditionChecker slow_check, Real xbound, Real ybound, Real zbound, int nx_offset,
+                        int ny_offset, int nz_offset);
+
+__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                                      Real gamma, SlowCellConditionChecker slow_check, Real xbound, Real ybound,
+                                      Real zbound, int nx_offset, int ny_offset, int nz_offset);
+#endif
+
+#endif /* AVERAGE_CELLS_H */
\ No newline at end of file
diff --git a/src/hydro/hydro_cuda.cu b/src/hydro/hydro_cuda.cu
index 1b223708b..a15173608 100644
--- a/src/hydro/hydro_cuda.cu
+++ b/src/hydro/hydro_cuda.cu
@@ -10,6 +10,7 @@
 #include "../global/global.h"
 #include "../global/global_cuda.h"
 #include "../gravity/static_grav.h"
+#include "../hydro/average_cells.h"
 #include "../hydro/hydro_cuda.h"
 #include "../utils/DeviceVector.h"
 #include "../utils/cuda_utilities.h"
@@ -377,14 +378,50 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R
         0.5 * dt * (gx * (d * vx + d_n * vx_n) + gy * (d * vy + d_n * vy_n) + gz * (d * vz + d_n * vz_n));
 
 #endif  // GRAVITY
+  }
+}
+
+/*! Returns whether a cell has crashed
+ *
+ *  \note
+ *  It probably won't come up, but it's unclear why we don't consider a density of 0 or
+ *  energy of 0 to be crashed... (I'm just keeping the logic consistent with what it used to be)
+ */
+__device__ bool Cell_Is_Crashed(Real density, Real Etot_density)
+{
+  return (density < 0.0) || (density != density) || (Etot_density < 0.0) || (Etot_density != Etot_density);
+}
 
+__global__ void PostUpdate_Conserved_Correct_Crashed_3D(Real *dev_conserved, int nx, int ny, int nz, int x_off,
+                                                        int y_off, int z_off, int n_ghost, Real gamma, int n_fields,
+                                                        SlowCellConditionChecker slow_check, int *any_error)
+{
+  int n_cells = nx * ny * nz;
+
+  // get a global thread ID
+  int id  = threadIdx.x + blockIdx.x * blockDim.x;
+  int zid = id / (nx * ny);
+  int yid = (id - zid * nx * ny) / nx;
+  int xid = id - zid * nx * ny - yid * nx;
+
+  if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+      zid < nz - n_ghost) {
+    // threads corresponding to real cells do the calculation
+
+    // this logic get's skipped if we apply both a density floor and temperature floor
+    // (we apply the logic if we only use one kind of floor or we use neither kind)
 #if !(defined(DENSITY_FLOOR) && defined(TEMPERATURE_FLOOR))
-    if (dev_conserved[id] < 0.0 || dev_conserved[id] != dev_conserved[id] || dev_conserved[4 * n_cells + id] < 0.0 ||
-        dev_conserved[4 * n_cells + id] != dev_conserved[4 * n_cells + id]) {
-      printf("%3d %3d %3d Thread crashed in final update. %e %e %e %e %e\n", xid + x_off, yid + y_off, zid + z_off,
-             dev_conserved[id], dtodx * (dev_F_x[imo] - dev_F_x[id]), dtody * (dev_F_y[jmo] - dev_F_y[id]),
-             dtodz * (dev_F_z[kmo] - dev_F_z[id]), dev_conserved[4 * n_cells + id]);
-      Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved);
+    if (Cell_Is_Crashed(dev_conserved[grid_enum::density * n_cells + id],
+                        dev_conserved[grid_enum::Energy * n_cells + id])) {
+      printf("%3d %3d %3d Thread crashed in final update. %e - - - %e\n", xid + x_off, yid + y_off, zid + z_off,
+             dev_conserved[grid_enum::density * n_cells + id], dev_conserved[grid_enum::Energy * n_cells + id]);
+      bool success = Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved,
+                                             n_ghost, slow_check);
+      if (!success) {
+        printf("%3d %3d %3d there was an issue with averaging the neighboring cells\n", xid + x_off, yid + y_off,
+               zid + z_off);
+        *any_error = 1;
+      }
     }
 #endif  // DENSITY_FLOOR
     /*
@@ -400,7 +437,6 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R
     */
   }
 }
-
 __device__ __host__ Real hydroInverseCrossingTime(Real const &E, Real const &d, Real const &d_inv, Real const &vx,
                                                   Real const &vy, Real const &vz, Real const &dx, Real const &dy,
                                                   Real const &dz, Real const &gamma)
@@ -667,10 +703,23 @@ void Temperature_Ceiling(Real *dev_conserved, int nx, int ny, int nz, int n_ghos
   }
 }
 
+__device__ Real SlowCellConditionChecker::max_dti_if_slow(Real total_energy, Real density, Real density_inv,
+                                                          Real velocity_x, Real velocity_y, Real velocity_z,
+                                                          Real gamma) const
+{
+#ifndef AVERAGE_SLOW_CELLS
+  return -1.0;
+#else
+  Real max_dti = hydroInverseCrossingTime(total_energy, density, density_inv, velocity_x, velocity_y, velocity_z,
+                                          this->dx, this->dy, this->dz, gamma);
+  return (max_dti > this->max_dti_slow) ? max_dti : -1.0;
+#endif
+}
+
 #ifdef AVERAGE_SLOW_CELLS
 
-void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy,
-                        Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, Real zbound, int nx_offset,
+void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real gamma,
+                        SlowCellConditionChecker slow_check, Real xbound, Real ybound, Real zbound, int nx_offset,
                         int ny_offset, int nz_offset)
 {
   // set values for GPU kernels
@@ -683,12 +732,12 @@ void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost
 
   if (nx > 1 && ny > 1 && nz > 1) {  // 3D
     hipLaunchKernelGGL(Average_Slow_Cells_3D, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields,
-                       dx, dy, dz, gamma, max_dti_slow, xbound, ybound, zbound, nx_offset, ny_offset, nz_offset);
+                       gamma, slow_check, xbound, ybound, zbound, nx_offset, ny_offset, nz_offset);
   }
 }
 
-__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx,
-                                      Real dy, Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound,
+__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                                      Real gamma, SlowCellConditionChecker slow_check, Real xbound, Real ybound,
                                       Real zbound, int nx_offset, int ny_offset, int nz_offset)
 {
   int id, xid, yid, zid, n_cells;
@@ -711,25 +760,26 @@ __global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int n
     vz    = dev_conserved[3 * n_cells + id] * d_inv;
     E     = dev_conserved[4 * n_cells + id];
 
-    // Compute the maximum inverse crossing time in the cell
-    max_dti = hydroInverseCrossingTime(E, d, d_inv, vx, vy, vz, dx, dy, dz, gamma);
+    // retrieve the max inverse crossing time in the cell if the cell meets the threshold for being a slow-cell.
+    // (if the cell doesn't meet the threshold, a negative value is returned instead)
+    max_dti = slow_check.max_dti_if_slow(E, d, d_inv, vx, vy, vz, gamma);
 
-    if (max_dti > max_dti_slow) {
+    if (max_dti > 0) {
       speed  = sqrt(vx * vx + vy * vy + vz * vz);
       temp   = (gamma - 1) * (E - 0.5 * (speed * speed) * d) * ENERGY_UNIT / (d * DENSITY_UNIT / 0.6 / MP) / KB;
       P      = (E - 0.5 * d * (vx * vx + vy * vy + vz * vz)) * (gamma - 1.0);
       cs     = sqrt(d_inv * gamma * P) * VELOCITY_UNIT * 1e-5;
-      Real x = xbound + (nx_offset + xid - n_ghost + 0.5) * dx;
-      Real y = ybound + (ny_offset + yid - n_ghost + 0.5) * dy;
-      Real z = zbound + (nz_offset + zid - n_ghost + 0.5) * dz;
+      Real x = xbound + (nx_offset + xid - n_ghost + 0.5) * slow_check.dx;
+      Real y = ybound + (ny_offset + yid - n_ghost + 0.5) * slow_check.dy;
+      Real z = zbound + (nz_offset + zid - n_ghost + 0.5) * slow_check.dz;
       // Average this cell
       kernel_printf(
           " Average Slow Cell [ %.5e %.5e %.5e ] -> dt_cell=%f    dt_min=%f, n=%.3e, "
           "T=%.3e, v=%.3e (%.3e, %.3e, %.3e), cs=%.3e\n",
-          x, y, z, 1. / max_dti, 1. / max_dti_slow, dev_conserved[id] * DENSITY_UNIT / 0.6 / MP, temp,
+          x, y, z, 1. / max_dti, 1. / slow_check.max_dti_slow, dev_conserved[id] * DENSITY_UNIT / 0.6 / MP, temp,
           speed * VELOCITY_UNIT * 1e-5, vx * VELOCITY_UNIT * 1e-5, vy * VELOCITY_UNIT * 1e-5, vz * VELOCITY_UNIT * 1e-5,
           cs);
-      Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved);
+      Average_Cell_All_Fields(xid, yid, zid, nx, ny, nz, n_cells, n_fields, gamma, dev_conserved, n_ghost, slow_check);
     }
   }
 }
@@ -908,6 +958,20 @@ __global__ void Partial_Update_Advected_Internal_Energy_3D(Real *dev_conserved,
   }
 }
 
+/*! The folliowing function is used to retrieve the total energy density if the cell isn't crashed.
+ *  If the cell is returned, 0 is returned
+ *
+ *  \note
+ *  This is useful for implementing `Select_Internal_Energy_ND`
+ */
+__device__ Real E_If_Not_Crashed(Real *dev_conserved, int n_cells, int spatial_idx)
+{
+  Real d = dev_conserved[grid_enum::density * n_cells + spatial_idx];
+  Real E = dev_conserved[grid_enum::Energy * n_cells + spatial_idx];
+  // NOTE: don't try to get clever here (we explicitly want to return 0 in cases where E is NaN)
+  return Cell_Is_Crashed(d, E) ? 0 : E;
+}
+
 __global__ void Select_Internal_Energy_1D(Real *dev_conserved, int nx, int n_ghost, int n_fields)
 {
   int id, xid, n_cells;
@@ -937,14 +1001,25 @@ __global__ void Select_Internal_Energy_1D(Real *dev_conserved, int nx, int n_gho
     U_advected = dev_conserved[(n_fields - 1) * n_cells + id];
     U_total    = E - 0.5 * d * (vx * vx + vy * vy + vz * vz);
 
-    // find the max nearby total energy
-    Emax = fmax(dev_conserved[4 * n_cells + imo], E);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]);
+    // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D
+    // simulation always uses floors for this purpose)
+    if (Cell_Is_Crashed(d, E)) return;
+
+    // find the max nearby total energy (from the local cell and any uncrashed neighbors)
+    // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok
+    // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0,
+    //    instead of the neighboring cell's E if the neighbor crashed
+    Emax = E;
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo));
 
-    // We only use the "advected" internal energy if both:
+    // Ordinarily, we only use the "advected" internal energy if both:
     // - the thermal energy divided by total energy is a small fraction (smaller than eta_1)
     // - AND we aren't masking shock heating (details controlled by Emax & eta_2)
-    if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) {
+    // We ALSO explicitly use the "advected" internal energy if the total energy is positive but doesn't
+    // exceed kinetic energy (i.e. U_total <= 0).
+    bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2);
+    if (prefer_U_total and (U_total > 0)) {
       U = U_total;
     } else {
       U = U_advected;
@@ -997,16 +1072,27 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i
     U_advected = dev_conserved[(n_fields - 1) * n_cells + id];
     U_total    = E - 0.5 * d * (vx * vx + vy * vy + vz * vz);
 
-    // find the max nearby total energy
-    Emax = fmax(dev_conserved[4 * n_cells + imo], E);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + jmo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + jpo]);
-
-    // We only use the "advected" internal energy if both:
+    // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D
+    // simulation always uses floors for this purpose)
+    if (Cell_Is_Crashed(d, E)) return;
+
+    // find the max nearby total energy (from the local cell and any uncrashed neighbors)
+    // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok
+    // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0,
+    //    instead of the neighboring cell's E if the neighbor crashed
+    Emax = E;
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jmo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jpo));
+
+    // We only use the "advected" internal energy if the following 3 conditions are satisfied:
     // - the thermal energy divided by total energy is a small fraction (smaller than eta_1)
     // - AND we aren't masking shock heating (details controlled by Emax & eta_2)
-    if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) {
+    // We ALSO explicitly use the "advected" internal energy if the total energy is positive but doesn't
+    // exceed kinetic energy (i.e. U_total <= 0).
+    bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2);
+    if (prefer_U_total and (U_total > 0)) {
       U = U_total;
     } else {
       U = U_advected;
@@ -1024,6 +1110,15 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i
 
 __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields)
 {
+  // The scenario where E > 0 and E doesn't exceed the kinetic energy (i.e. U_total <= 0),
+  // comes up with some frequency when we include particle feedback.
+  // -> in that scenario, we explicitly use the value held by the U_advected field
+  // -> the separate `Sync_Energies_3D` kernel, we then override the E field with KE + U_advected
+  // -> one might argue we should actually override the E field with KE + U_advected before this kernel
+  //    (we leave that for future consideration)
+  // -> regardless, it is **REALLY IMPORTANT** that the E field is **NOT** modified in this kernel
+  //    (modifying it will produce race conditions!!!)
+
   int id, xid, yid, zid, n_cells;
   Real d, d_inv, vx, vy, vz, E, U_total, U_advected, U, Emax;
   int imo, ipo, jmo, jpo, kmo, kpo;
@@ -1064,18 +1159,30 @@ __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, i
     U_advected = dev_conserved[(n_fields - 1) * n_cells + id];
     U_total    = E - 0.5 * d * (vx * vx + vy * vy + vz * vz);
 
-    // find the max nearby total energy
-    Emax = fmax(dev_conserved[4 * n_cells + imo], E);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + ipo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + jmo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + jpo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + kmo]);
-    Emax = fmax(Emax, dev_conserved[4 * n_cells + kpo]);
-
-    // We only use the "advected" internal energy if both:
+    // We will deal with this crashed cell later in a different kernel... (at the time of writing, a 1D
+    // simulation always uses floors for this purpose)
+    if (Cell_Is_Crashed(d, E)) return;
+
+    // find the max nearby total energy (from the local cell and any uncrashed neighbors)
+    // -> we take the stance that "crashed" neighbors are unreliable, even if total energy looks ok
+    // -> to effectively ignore a crashed neighbor, the `E_If_Not_Crashed` function provides a value of 0,
+    //    instead of the neighboring cell's E if the neighbor crashed
+    Emax = E;
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, imo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, ipo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jmo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, jpo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, kmo));
+    Emax = fmax(Emax, E_If_Not_Crashed(dev_conserved, n_cells, kpo));
+
+    // Ordinarily, we only use the "advected" internal energy if both:
     // - the thermal energy divided by total energy is a small fraction (smaller than eta_1)
     // - AND we aren't masking shock heating (details controlled by Emax & eta_2)
-    if ((U_total / E > eta_1) or (U_total / Emax > eta_2)) {
+    // We also explicitly use the "advected" internal energy if the total energy is positive but doesn't
+    // exceed kinetic energy (i.e. U_total <= 0). This scenario comes up in simulations with particle-based
+    // feedback.
+    bool prefer_U_total = (U_total > E * eta_1) or (U_total > Emax * eta_2);
+    if (prefer_U_total and (U_total > 0)) {
       U = U_total;
     } else {
       U = U_advected;
@@ -1194,7 +1301,7 @@ __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, in
                                          Real U_floor)
 {
   int id, xid, yid, zid, n_cells;
-  Real d, d_inv, vx, vy, vz, E, Ekin, U;
+  Real d, d_inv, vx, vy, vz, E, Ekin, Udens;
   n_cells = nx * ny * nz;
 
   // get a global thread ID
@@ -1214,17 +1321,28 @@ __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, in
     E     = dev_conserved[4 * n_cells + id];
     Ekin  = 0.5 * d * (vx * vx + vy * vy + vz * vz);
 
-    U = (E - Ekin) / d;
-    if (U < U_floor) {
+    int num_applications = 0;
+
+    Udens = (E - Ekin);
+    if (Udens / d < U_floor) {
+      num_applications++;
       dev_conserved[4 * n_cells + id] = Ekin + d * U_floor;
     }
 
 #ifdef DE
-    U = dev_conserved[(n_fields - 1) * n_cells + id] / d;
-    if (U < U_floor) {
+    Udens = dev_conserved[(n_fields - 1) * n_cells + id];
+    if (Udens / d < U_floor) {
+      num_applications++;
       dev_conserved[(n_fields - 1) * n_cells + id] = d * U_floor;
     }
+#else
+    Udens = -123456789;  // set to a dumb-looking number so that it's clear that it's not real when printing it
 #endif
+
+    if (num_applications > 0) {
+      printf("T_Floor %3d %3d %3d d: %e spec_Ekin:%e spec_eint_floor: %e BC: E_dens:%e GasEnergy_dens:%e\n", xid, yid,
+             zid, d, Ekin, U_floor, E, Udens);
+    }
   }
 }
 
@@ -1252,22 +1370,37 @@ __device__ Real Average_Cell_Single_Field(int field_indx, int i, int j, int k, i
   return v_avrg;
 }
 
-__device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields,
-                                        Real gamma, Real *conserved)
+__device__ bool Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields,
+                                        Real gamma, Real *conserved, int stale_depth,
+                                        SlowCellConditionChecker slow_check)
 {
   int id = i + (j)*nx + (k)*nx * ny;
 
-  Real d, mx, my, mz, E, P;
-  d  = conserved[grid_enum::density * ncells + id];
-  mx = conserved[grid_enum::momentum_x * ncells + id];
-  my = conserved[grid_enum::momentum_y * ncells + id];
-  mz = conserved[grid_enum::momentum_z * ncells + id];
-  E  = conserved[grid_enum::Energy * ncells + id];
-  P  = (E - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0);
+  // print out the the values stored in the cell before the correction
+  // -> we put this logic (and variable declarations in its own scope) to force us to redeclare variables later
+  //    (this lets the compiler help us avoid silly errors where we forget to load a variable)
+  {
+    Real d, mx, my, mz, E, P, Udens;
+    d  = conserved[grid_enum::density * ncells + id];
+    mx = conserved[grid_enum::momentum_x * ncells + id];
+    my = conserved[grid_enum::momentum_y * ncells + id];
+    mz = conserved[grid_enum::momentum_z * ncells + id];
+    E  = conserved[grid_enum::Energy * ncells + id];
+    P  = (E - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0);
 
-  printf("%3d %3d %3d BC: d: %e  E:%e  P:%e  vx:%e  vy:%e  vz:%e\n", i, j, k, d, E, P, mx / d, my / d, mz / d);
+#ifdef DE
+    Udens               = conserved[grid_enum::GasEnergy * ncells + id];
+    Real kinetic_energy = hydro_utilities::Calc_Kinetic_Energy_From_Momentum(d, mx, my, mz);
+    P                   = hydro_utilities::Get_Pressure_From_DE(E, E - kinetic_energy, Udens, gamma);
+#else
+    Udens = -123456789;  // set to a dumb-looking number so that it's clear that it's not real when printing it
+    P     = hydro_utilities::Calc_Pressure_Conserved(E, d, mx, my, mz, gamma);
+#endif
+    printf("%3d %3d %3d BC: d: %e  E:%e  P:%e  vx:%e  vy:%e  vz:%e  Uadv:%e\n", i, j, k, d, E, P, mx / d, my / d,
+           mz / d, Udens);
+  }
 
-  int idn;
+  // initialize the variables that we use to accumulate information about neighboring values
   int N = 0;
   Real d_av, vx_av, vy_av, vz_av, P_av;
   d_av = vx_av = vy_av = vz_av = P_av = 0.0;
@@ -1281,18 +1414,36 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int
   for (int kk = k - 1; kk <= k + 1; kk++) {
     for (int jj = j - 1; jj <= j + 1; jj++) {
       for (int ii = i - 1; ii <= i + 1; ii++) {
-        idn = ii + jj * nx + kk * nx * ny;
-        d   = conserved[grid_enum::density * ncells + idn];
-        mx  = conserved[grid_enum::momentum_x * ncells + idn];
-        my  = conserved[grid_enum::momentum_y * ncells + idn];
-        mz  = conserved[grid_enum::momentum_z * ncells + idn];
-        P   = (conserved[grid_enum::Energy * ncells + idn] - (0.5 / d) * (mx * mx + my * my + mz * mz)) * (gamma - 1.0);
+        if (ii <= stale_depth - 1 || ii >= nx - stale_depth || jj <= stale_depth - 1 || jj >= ny - stale_depth ||
+            kk <= stale_depth - 1 || kk >= nz - stale_depth) {
+          continue;
+        }
+
+        int idn = ii + jj * nx + kk * nx * ny;
+        Real d  = conserved[grid_enum::density * ncells + idn];
+        Real mx = conserved[grid_enum::momentum_x * ncells + idn];
+        Real my = conserved[grid_enum::momentum_y * ncells + idn];
+        Real mz = conserved[grid_enum::momentum_z * ncells + idn];
+        Real E  = conserved[grid_enum::Energy * ncells + idn];
+
+        // this function CAN use the "advected internal energy" field to compute pressure when the dual energy
+        // formalism. This is because this function has an explicit pre-condition that this function can only
+        // be applied when the "advected internal energy" field and the "total energy" fields are properly reconciled
+#ifdef DE
+        Real P = hydro_utilities::Get_Pressure_From_DE(
+            E, E - hydro_utilities::Calc_Kinetic_Energy_From_Momentum(d, mx, my, mz),
+            conserved[grid_enum::GasEnergy * ncells + id], gamma);
+#else
+        Real P = hydro_utilities::Calc_Pressure_Conserved(E, d, mx, my, mz, gamma);
+#endif
+
 #ifdef SCALAR
         for (int n = 0; n < NSCALARS; n++) {  // NOLINT
           scalar[n] = conserved[grid_enum::scalar * ncells + idn];
         }
 #endif
-        if (d > 0.0 && P > 0.0) {
+        Real d_inv = 1.0 / d;
+        if (d > 0.0 && P > 0.0 && not slow_check.is_slow(E, d, d_inv, mx * d_inv, my * d_inv, mz * d_inv, gamma)) {
           d_av += d;
           vx_av += mx;
           vy_av += my;
@@ -1304,11 +1455,21 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int
           }
 #endif
           N++;
+        } else if ((ii != i) || (jj != j) || (kk != k)) {
+#ifdef DE
+          Real Udens = conserved[grid_enum::GasEnergy * ncells + idn];
+#else
+          Real Udens =
+              -123456789;  // set to a dumb-looking number so that it's clear that it's not real when printing it
+#endif
+          printf("%3d %3d %3d skipped-neighbor: d: %e  E:%e  P:%e  vx:%e  vy:%e  vz:%e  Uadv:%e\n", ii, jj, kk, d, E, P,
+                 mx / d, my / d, mz / d, Udens);
         }
       }
     }
   }
 
+  // update the accumulator variables so that they now hold averages
   P_av  = P_av / N;
   vx_av = vx_av / d_av;
   vy_av = vy_av / d_av;
@@ -1318,17 +1479,18 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int
     scalar_av[n] = scalar_av[n] / d_av;
   }
 #endif
-  d_av = d_av / N;
+  d_av          = d_av / N;
+  Real Udens_av = P_av / (gamma - 1.0);
+  Real E_av     = Udens_av + 0.5 * d_av * math_utils::SquareMagnitude(vx_av, vy_av, vz_av);
 
   // replace cell values with new averaged values
   conserved[id + ncells * grid_enum::density]    = d_av;
   conserved[id + ncells * grid_enum::momentum_x] = d_av * vx_av;
   conserved[id + ncells * grid_enum::momentum_y] = d_av * vy_av;
   conserved[id + ncells * grid_enum::momentum_z] = d_av * vz_av;
-  conserved[id + ncells * grid_enum::Energy] =
-      P_av / (gamma - 1.0) + 0.5 * d_av * (vx_av * vx_av + vy_av * vy_av + vz_av * vz_av);
+  conserved[id + ncells * grid_enum::Energy]     = E_av;
 #ifdef DE
-  conserved[id + ncells * grid_enum::GasEnergy] = P_av / (gamma - 1.0);
+  conserved[id + ncells * grid_enum::GasEnergy] = Udens_av;
 #endif
 #ifdef SCALAR
   for (int n = 0; n < NSCALARS; n++) {  // NOLINT
@@ -1336,11 +1498,12 @@ __device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int
   }
 #endif
 
-  d = d_av;
-  E = P_av / (gamma - 1.0) + 0.5 * d_av * (vx_av * vx_av + vy_av * vy_av + vz_av * vz_av);
-  P = P_av;
+  // print out the values now that they have been replaced
+  printf("%3d %3d %3d FC: d: %e  E:%e  P:%e  vx:%e  vy:%e  vz:%e  Udens:%e\n", i, j, k, d_av, E_av, P_av, vx_av, vy_av,
+         vz_av, Udens_av);
 
-  printf("%3d %3d %3d FC: d: %e  E:%e  P:%e  vx:%e  vy:%e  vz:%e\n", i, j, k, d, E, P, vx_av, vy_av, vz_av);
+  // success requires that at least 2 cells contributed to the average
+  return N > 1;
 }
 
 void Apply_Scalar_Floor(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int field_num, Real scalar_floor)
diff --git a/src/hydro/hydro_cuda.h b/src/hydro/hydro_cuda.h
index 61b1073de..f8c48d95b 100644
--- a/src/hydro/hydro_cuda.h
+++ b/src/hydro/hydro_cuda.h
@@ -5,6 +5,7 @@
 #define HYDRO_CUDA_H
 
 #include "../global/global.h"
+#include "../hydro/average_cells.h"
 #include "../utils/mhd_utilities.h"
 
 __global__ void Update_Conserved_Variables_1D(Real *dev_conserved, Real *dev_F, int n_cells, int x_off, int n_ghost,
@@ -21,6 +22,10 @@ __global__ void Update_Conserved_Variables_3D(Real *dev_conserved, Real *Q_Lx, R
                                               Real gamma, int n_fields, int custom_grav, Real density_floor,
                                               Real *dev_potential);
 
+__global__ void PostUpdate_Conserved_Correct_Crashed_3D(Real *dev_conserved, int nx, int ny, int nz, int x_off,
+                                                        int y_off, int z_off, int n_ghost, Real gamma, int n_fields,
+                                                        SlowCellConditionChecker slow_check, int *any_error);
+
 /*!
  * \brief Determine the maximum inverse crossing time in a specific cell
  *
@@ -80,17 +85,6 @@ void Temperature_Ceiling(Real *dev_conserved, int nx, int ny, int nz, int n_ghos
                          Real T_ceiling);
 #endif  // TEMPERATURE CEILING
 
-#ifdef AVERAGE_SLOW_CELLS
-
-void Average_Slow_Cells(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy,
-                        Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound, Real zbound, int nx_offset,
-                        int ny_offset, int nz_offset);
-
-__global__ void Average_Slow_Cells_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx,
-                                      Real dy, Real dz, Real gamma, Real max_dti_slow, Real xbound, Real ybound,
-                                      Real zbound, int nx_offset, int ny_offset, int nz_offset);
-#endif
-
 void Apply_Temperature_Floor(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real U_floor);
 
 __global__ void Temperature_Floor_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
@@ -119,8 +113,49 @@ __global__ void Select_Internal_Energy_2D(Real *dev_conserved, int nx, int ny, i
 
 __global__ void Select_Internal_Energy_3D(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields);
 
-__device__ void Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields,
-                                        Real gamma, Real *conserved);
+/*! \brief Overwrites the values in the specified cell with the average of all the values from the (up to) 26
+ *  neighboring cells.
+ *
+ *  Care is taken when applying this logic to a cell near the edge of a block (where the entire simulation domain
+ *  is decomposed into 1 or more blocks).
+ *    * Recall that the entire reason we have ghost zones is that the stencil for computing flux-divergence can't
+ *      be applied uniformly to all cells -- the cells in the ghost zone can't be properly updated with the rest
+ *      the local block when applying the flux-divergence. We might refer to these cells that aren't properly
+ *      updated as being "stale". We refer to the width of the outer ring of stale values as the ``stale-depth``
+ *    * For concreteness, consider a pure hydro/mhd simulation using the VL integrator:
+ *       - Right after refreshing the ghost-zones, the stale_depth is 0
+ *       - After the partial time-step, the stale_depth is 1.
+ *       - After the full timestep, the stale depth depends on the choice of reconstruction. (e.g. it is 2 for
+ *         for nearest neighbor and 3 for plmp).
+ *       - The ghost-depth should always be equal to the max stale-depth at the end of a simulation cycle (if
+ *         ghost-depth is bigger, extra work is done. If it's smaller, then your simulation is wrong)
+ *    * To respect the simulations boundaries, values in "stale" cells are excluded from the averages. If
+ *      stale-depth is 0, then values from beyond the edge of the simulation are excluded from averages
+ *
+ *  \pre When using the dual-energy formalism, this function should **ONLY** be called in contexts where all
+ *    "non-crashed" cells are known to have *reconciled* "advected internal energy" and "total energy" field values.
+ *    This is important for averaging.
+ *        - In practice, this limitation is mostly meaningful inside of a hydro solver
+ *        - Outside of a hydro-solver, you can definitely come up with pathological cases where applying source terms,
+ *          but you would need to go out of your way to do something to violate this precondition
+ *
+ *  \returns A value of ``true`` indicates that the operation succeeded. A value of ``false`` indicates a failure.
+ *    To succeed, this function requires that there are at least 2 neighboring cells with valid values that can be
+ *    used to compute the average.
+ *
+ *  \note
+ *  From a perfectionist's perspective, one could argue that we really should increment the stale-depth whenever
+ *  we call this function (in other words, we should have an extra layer of ghost zones for each time we call
+ *  this function).
+ *    * rationale: if we don't, then the the number of neighbors considered results of the simulation can vary
+ *      based on how close a cell is to a block-edge (the number of cells varies from 7 to 26).
+ *    * more pragmatically: this probably doesn't matter a whole lot given that this piece of machinery is a
+ *      band-aid solution to begin with.
+ *    * Aside: a similar argument could be made for the energy-synchronization step of the dual-energy formalism.
+ */
+__device__ bool Average_Cell_All_Fields(int i, int j, int k, int nx, int ny, int nz, int ncells, int n_fields,
+                                        Real gamma, Real *conserved, int stale_depth,
+                                        SlowCellConditionChecker slow_check);
 
 __device__ Real Average_Cell_Single_Field(int field_indx, int i, int j, int k, int nx, int ny, int nz, int ncells,
                                           Real *conserved);
diff --git a/src/integrators/VL_3D_cuda.cu b/src/integrators/VL_3D_cuda.cu
index 0f9ccc013..074f6cdff 100644
--- a/src/integrators/VL_3D_cuda.cu
+++ b/src/integrators/VL_3D_cuda.cu
@@ -35,7 +35,8 @@ __global__ void Update_Conserved_Variables_3D_half(Real *dev_conserved, Real *de
 
 void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off,
                           int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound,
-                          Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential)
+                          Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential,
+                          const SlowCellConditionChecker &slow_check, int *any_error)
 {
   // Here, *dev_conserved contains the entire
   // set of conserved variables on the grid
@@ -392,6 +393,19 @@ void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int
   GPU_Error_Check();
   #endif  // DE
 
+  // Step 7: Address any crashed threads
+  // -> it's REALLY important that we do this **AFTER** the dual energy formalism update
+  // -> in more detail, galaxy simulations with particle-feedback, seem to commonly produce scenarios where a cell
+  //    crashes and all of its neighbors have positive total energies that are smaller than the respective kinetic
+  //    energies (this problem might be resolved if we use larger deposition stencils)
+  // -> by positioning this correction after the dual energy formalism, we can correct for this issue.
+  cuda_utilities::AutomaticLaunchParams static const post_update_correction_launch_params(
+      PostUpdate_Conserved_Correct_Crashed_3D, n_cells);
+  hipLaunchKernelGGL(PostUpdate_Conserved_Correct_Crashed_3D, post_update_correction_launch_params.get_numBlocks(),
+                     post_update_correction_launch_params.get_threadsPerBlock(), 0, 0, dev_conserved, nx, ny, nz, x_off,
+                     y_off, z_off, n_ghost, gama, n_fields, slow_check, any_error);
+  GPU_Error_Check();
+
   return;
 }
 
diff --git a/src/integrators/VL_3D_cuda.h b/src/integrators/VL_3D_cuda.h
index 4b80a4604..b7af66efb 100644
--- a/src/integrators/VL_3D_cuda.h
+++ b/src/integrators/VL_3D_cuda.h
@@ -5,10 +5,12 @@
 #define VL_3D_CUDA_H
 
 #include "../global/global.h"
+#include "../hydro/average_cells.h"
 
 void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off,
                           int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound,
-                          Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential);
+                          Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential,
+                          const SlowCellConditionChecker &slow_check, int *any_error);
 
 void Free_Memory_VL_3D();
 
diff --git a/src/integrators/simple_3D_cuda.cu b/src/integrators/simple_3D_cuda.cu
index 2e834e370..3e38e1dbf 100644
--- a/src/integrators/simple_3D_cuda.cu
+++ b/src/integrators/simple_3D_cuda.cu
@@ -24,7 +24,8 @@
 
 void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off,
                               int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound,
-                              Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential)
+                              Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential,
+                              const SlowCellConditionChecker &slow_check, int *any_error)
 {
   // Here, *dev_conserved contains the entire
   // set of conserved variables on the grid
@@ -154,6 +155,9 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx,
                      zbound, dt, gama, n_fields, custom_grav, density_floor, dev_grav_potential);
   GPU_Error_Check();
 
+  // BE AWARE: any "crashed cells" aren't handled yet! We explicitly defer handling of these cases until after
+  // we handle the Dual Energy Formalism
+
   #ifdef DE
   hipLaunchKernelGGL(Select_Internal_Energy_3D, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost,
                      n_fields);
@@ -161,6 +165,14 @@ void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx,
   GPU_Error_Check();
   #endif
 
+  // Step 4: Address any crashed threads
+  cuda_utilities::AutomaticLaunchParams static const post_update_correction_launch_params(
+      PostUpdate_Conserved_Correct_Crashed_3D, n_cells);
+  hipLaunchKernelGGL(PostUpdate_Conserved_Correct_Crashed_3D, post_update_correction_launch_params.get_numBlocks(),
+                     post_update_correction_launch_params.get_threadsPerBlock(), 0, 0, dev_conserved, nx, ny, nz, x_off,
+                     y_off, z_off, n_ghost, gama, n_fields, slow_check, any_error);
+  GPU_Error_Check();
+
   return;
 }
 
diff --git a/src/integrators/simple_3D_cuda.h b/src/integrators/simple_3D_cuda.h
index 847b93c61..6f262a1d7 100644
--- a/src/integrators/simple_3D_cuda.h
+++ b/src/integrators/simple_3D_cuda.h
@@ -9,7 +9,8 @@
 
 void Simple_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int ny, int nz, int x_off, int y_off,
                               int z_off, int n_ghost, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound,
-                              Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential);
+                              Real dt, int n_fields, int custom_grav, Real density_floor, Real *host_grav_potential,
+                              const SlowCellConditionChecker &slow_check, int *any_error);
 
 void Free_Memory_Simple_3D();
 
diff --git a/src/utils/gpu.hpp b/src/utils/gpu.hpp
index b4eac6607..196db01e4 100644
--- a/src/utils/gpu.hpp
+++ b/src/utils/gpu.hpp
@@ -134,20 +134,18 @@ static constexpr int maxWarpsPerBlock = 1024 / WARPSIZE;
 #define GPU_MAX_THREADS 256
 
 /*!
- * \brief Check for CUDA/HIP error codes. Can be called wrapping a GPU function that returns a value or with no
- * arguments and it will get the latest error code.
+ * \brief formats and reports CUDA/HIP errors. This a helper function invoked by GPU_Error_Check
  *
  * \param[in] code The code to check. Defaults to the last error code
- * \param[in] abort Whether or not to abort if an error is encountered. Defaults to True
+ * \param[in] abort Whether or not to abort if an error is encountered.
  * \param[in] location The location of the call. This should be left as the default value.
  */
-inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort = true,
-                            std::experimental::source_location location = std::experimental::source_location::current())
+inline void Format_GPU_Error_Check_Err_(cudaError_t code, bool abort, std::experimental::source_location location)
 {
-#ifndef DISABLE_GPU_ERROR_CHECKING
-  code = cudaDeviceSynchronize();
+  // this function should ALWAYS be compiled, (regardless of whether DISABLE_GPU_ERROR_CHECKING is
+  // defined or not). We will instead conditionally compile calls to this of this function based on
+  // whether DISABLE_GPU_ERROR_CHECKING is defined.
 
-  // Check the code
   if (code != cudaSuccess) {
     std::cout << "GPU_Error_Check: Failed at "
               << "Line: " << location.line() << ", File: " << location.file_name()
@@ -156,6 +154,50 @@ inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort
       chexit(code);
     }
   }
+}
+
+/*!
+ * \brief Check for CUDA/HIP error codes. Can be called wrapping a GPU function that returns a value or with no
+ * arguments and it will get the latest error code.
+ *
+ * This function will always check any cuda-exit code explicitly passed into it. If no code is passed, the
+ * function assumes that it's being called immediately after a kernel was launched and performs a cheap
+ * query to make sure that there aren't currently any errors from the kernel launch (this will pick up on any
+ * errors if you haven't recently performed an error check)
+ *
+ * Unless the DISABLE_GPU_ERROR_CHECKING macro is defined, this function will then explicitly synchronize the
+ * device with the host in order to check if there were any errors during the kernel's evaluation.
+ *
+ * \param[in] code The code to check. Defaults to the last error code
+ * \param[in] abort Whether or not to abort if an error is encountered. Defaults to True
+ * \param[in] location The location of the call. This should be left as the default value.
+ *
+ * @note
+ * The fact that we include function calls as default arguments may appear irregular. If the caller doesn't override
+ * the default values, then the functions are guaranteed to be freshly called whenever we invoke ``GPU_Error_Check``
+ * (details are provided on https://en.cppreference.com/w/cpp/language/default_arguments)
+ *
+ * @note
+ * As explained at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#error-checking
+ * if we want a call to this function right after a kernel launch to be able to detect errors that prevented the
+ * kernel launch it is VERY important that the function explicitly checks an error code retrieved with either
+ * ``cudaPeekAtLastError()`` or ``cudaGetLastError()``. If we don't do this, the function won't automatically
+ * detect launch errors.
+ */
+inline void GPU_Error_Check(cudaError_t code = cudaPeekAtLastError(), bool abort = true,
+                            std::experimental::source_location location = std::experimental::source_location::current())
+{
+  // first, we check the argument passed into this function
+  // - earlier versions of this function did not perform this check.
+  // - as noted up in the docstring, it's extremely important to perform this check in addition to the next
+  //   check because the next check won't identify cases where the kernel fails to launch
+  // - there's basically no reason to disable this check (it is extremely cheap)
+  if (code != cudaSuccess) Format_GPU_Error_Check_Err_(code, abort, location);
+
+#ifndef DISABLE_GPU_ERROR_CHECKING
+  // wait for any kernels being evaluated to finish and check if any errors arose in the process
+  code = cudaDeviceSynchronize();
+  if (code != cudaSuccess) Format_GPU_Error_Check_Err_(code, abort, location);
 #endif  // DISABLE_GPU_ERROR_CHECKING
 }