diff --git a/.azure-pipelines/xfel/download-source.yml b/.azure-pipelines/xfel/download-source.yml index b59af39774..7c2b32d412 100644 --- a/.azure-pipelines/xfel/download-source.yml +++ b/.azure-pipelines/xfel/download-source.yml @@ -61,7 +61,9 @@ jobs: # preserve permissions and delete extra files - script: | cd $(Pipeline.Workspace) + mv modules/kokkos/.git modules/kokkos/.git.tmp rm -fr modules/*/.git/* + mv modules/kokkos/.git.tmp modules/kokkos/.git rm -fr modules/*/.svn/* rm -fr modules/*.tar tar -cf modules.tar modules diff --git a/.azure-pipelines/xfel/unix-conda-build.yml b/.azure-pipelines/xfel/unix-conda-build.yml index 3766d91335..bece4d695b 100644 --- a/.azure-pipelines/xfel/unix-conda-build.yml +++ b/.azure-pipelines/xfel/unix-conda-build.yml @@ -95,6 +95,7 @@ steps: libtbx.configure lunus make fi + rm -fr modules/kokkos/.git/* displayName: Configure and Build # test diff --git a/kokkostbx/Legacy/kokkos_matrix.h b/kokkostbx/Legacy/kokkos_matrix.h index 140d69e01d..4a7f07077a 100644 --- a/kokkostbx/Legacy/kokkos_matrix.h +++ b/kokkostbx/Legacy/kokkos_matrix.h @@ -156,7 +156,7 @@ namespace kokkostbx { } KOKKOS_INLINE_FUNCTION NumType length() const { - return ::Kokkos::Experimental::sqrt(length_sqr()); + return ::Kokkos::sqrt(length_sqr()); } KOKKOS_INLINE_FUNCTION NumType dot(const matrix& v) const { diff --git a/kokkostbx/Legacy/kokkos_matrix3.h b/kokkostbx/Legacy/kokkos_matrix3.h index 96105beb3a..561f6b36b8 100644 --- a/kokkostbx/Legacy/kokkos_matrix3.h +++ b/kokkostbx/Legacy/kokkos_matrix3.h @@ -147,7 +147,7 @@ namespace kokkostbx { } KOKKOS_INLINE_FUNCTION NumType length() const { - return ::Kokkos::Experimental::sqrt(length_sqr()); + return ::Kokkos::sqrt(length_sqr()); } KOKKOS_INLINE_FUNCTION NumType dot(const matrix3& v) const { diff --git a/kokkostbx/Legacy/kokkos_vector.h b/kokkostbx/Legacy/kokkos_vector.h index 76beaaa245..f34de4586e 100644 --- a/kokkostbx/Legacy/kokkos_vector.h +++ b/kokkostbx/Legacy/kokkos_vector.h @@ -215,7 +215,7 @@ namespace kokkostbx { } KOKKOS_INLINE_FUNCTION NumType length() const { - return ::Kokkos::Experimental::sqrt(length_sqr()); + return ::Kokkos::sqrt(length_sqr()); } KOKKOS_INLINE_FUNCTION NumType dot(const vector3& v) const { @@ -248,8 +248,8 @@ namespace kokkostbx { // rotate a point about a unit vector3 axis KOKKOS_INLINE_FUNCTION vector3 rotate_around_axis(const vector3& axis, NumType angle) const { - NumType sinphi = ::Kokkos::Experimental::sin(angle); - NumType cosphi = ::Kokkos::Experimental::cos(angle); + NumType sinphi = ::Kokkos::sin(angle); + NumType cosphi = ::Kokkos::cos(angle); NumType dot_factor = axis.dot(*this) * (1.0-cosphi); vector3 vector_rot = axis.cross(*this) * sinphi; diff --git a/kokkostbx/Legacy/kokkos_vector3.h b/kokkostbx/Legacy/kokkos_vector3.h index 802a3c8e26..39a316a9dc 100644 --- a/kokkostbx/Legacy/kokkos_vector3.h +++ b/kokkostbx/Legacy/kokkos_vector3.h @@ -144,7 +144,7 @@ namespace kokkostbx { } KOKKOS_INLINE_FUNCTION NumType length() const { - return ::Kokkos::Experimental::sqrt(length_sqr()); + return ::Kokkos::sqrt(length_sqr()); } KOKKOS_INLINE_FUNCTION NumType dot(const vector3& v) const { @@ -177,8 +177,8 @@ namespace kokkostbx { // rotate a point about a unit vector3 axis KOKKOS_INLINE_FUNCTION vector3 rotate_around_axis(const vector3& axis, NumType angle) const { - NumType sinphi = ::Kokkos::Experimental::sin(angle); - NumType cosphi = ::Kokkos::Experimental::cos(angle); + NumType sinphi = ::Kokkos::sin(angle); + NumType cosphi = ::Kokkos::cos(angle); NumType dot_factor = axis.dot(*this) * 
(1.0-cosphi); vector3 vector_rot = axis.cross(*this) * sinphi; diff --git a/kokkostbx/SConscript b/kokkostbx/SConscript index 61212f0261..19e522c6f8 100644 --- a/kokkostbx/SConscript +++ b/kokkostbx/SConscript @@ -22,7 +22,7 @@ if env_etc.enable_kokkos: if os.getenv('KOKKOS_ARCH') is None: os.environ['KOKKOS_ARCH'] = "HSW" if use_cuda and os.getenv('KOKKOS_CUDA_OPTIONS') is None: - os.environ['KOKKOS_CUDA_OPTIONS'] = "enable_lambda,force_uvm" + os.environ['KOKKOS_CUDA_OPTIONS'] = "enable_lambda" os.environ['CXXFLAGS'] = '-O3 -fPIC -DCUDAREAL=double' library_flags = "-Llib" @@ -37,10 +37,6 @@ if env_etc.enable_kokkos: linked_libraries += " -lcudart -lcuda" os.environ['LDLIBS'] = linked_libraries - cxx_standard = '14' - if use_sycl: - cxx_standard = '17' - original_cxx = None kokkos_lib = 'libkokkos.a' kokkos_cxxflags = None @@ -119,7 +115,7 @@ if env_etc.enable_kokkos: returncode = subprocess.call([ 'cmake', os.environ['KOKKOS_PATH'], - '-DCMAKE_CXX_STANDARD={}'.format(cxx_standard), + '-DCMAKE_CXX_STANDARD={}'.format('17'), '-DCMAKE_INSTALL_PREFIX={}'.format(libtbx.env.under_build('.')), '-DCMAKE_INSTALL_LIBDIR=lib', '-DBUILD_SHARED_LIBS={}'.format(OnOff[True]), @@ -138,9 +134,9 @@ if env_etc.enable_kokkos: '-DKokkos_ENABLE_SERIAL=ON', '-DKokkos_ENABLE_OPENMP={}'.format(OnOff[use_openmp]), '-DKokkos_ENABLE_CUDA={}'.format(OnOff[use_cuda]), - '-DKokkos_ENABLE_CUDA_UVM={}'.format(OnOff[use_cuda]), '-DKokkos_ENABLE_HIP={}'.format(OnOff[use_hip]), - '-DKokkos_ENABLE_SYCL={}'.format(OnOff[use_sycl]) + '-DKokkos_ENABLE_SYCL={}'.format(OnOff[use_sycl]), + '-DKokkos_ENABLE_IMPL_MDSPAN=ON' ], cwd=kokkos_build_dir) diff --git a/kokkostbx/kokkos_types.h b/kokkostbx/kokkos_types.h index 619fa20568..3ea314437c 100644 --- a/kokkostbx/kokkos_types.h +++ b/kokkostbx/kokkos_types.h @@ -3,10 +3,10 @@ #include #ifdef KOKKOS_ENABLE_CUDA - #define MemSpace Kokkos::CudaUVMSpace + #define MemSpace Kokkos::CudaSpace #endif #ifdef KOKKOS_ENABLE_HIP - #define MemSpace Kokkos::Experimental::HIPSpace + #define MemSpace Kokkos::HIPSpace #endif #ifdef KOKKOS_ENABLE_OPENMPTARGET #define MemSpace Kokkos::OpenMPTargetSpace @@ -19,8 +19,11 @@ using ExecSpace = MemSpace::execution_space; using range_policy = Kokkos::RangePolicy; -template -using view_1d_t = Kokkos::View; +template using view_1d_t = Kokkos::View; +template using view_4d_t = Kokkos::View; +template using view_5d_t = Kokkos::View; +template using view_6d_t = Kokkos::View; +template using view_6d6_t = Kokkos::View; using vector_bool_t = view_1d_t; using vector_double_t = view_1d_t; diff --git a/kokkostbx/kokkos_utils.cpp b/kokkostbx/kokkos_utils.cpp index ee46cc35d5..02bb35994b 100644 --- a/kokkostbx/kokkos_utils.cpp +++ b/kokkostbx/kokkos_utils.cpp @@ -18,4 +18,20 @@ void transfer_double2kokkos(vector_cudareal_t& dst, const double* src, const siz } } +void transfer_vector2kokkos(view_1d_t& dst, const std::vector& src) { + if (true) { + // printf("== Transfer %s from %p\n", dst.label().c_str(), (void*) dst.data()); + // printf(" - size src|dst: %d|%d\n", src.size(), dst.span() ); + } + if (dst.span() < src.size()) { + resize(dst, src.size()); + // printf(" - size changed, new size: %d\n", dst.span() ); + } + auto host_view = Kokkos::create_mirror_view(dst); + for (int i = 0; i < src.size(); ++i) { + host_view(i) = src[i]; + } + Kokkos::deep_copy(dst, host_view); +} + } // namespace kokkostbx diff --git a/kokkostbx/kokkos_utils.h b/kokkostbx/kokkos_utils.h index e82a776ebc..4b0105ab0b 100644 --- a/kokkostbx/kokkos_utils.h +++ b/kokkostbx/kokkos_utils.h @@ -78,6 
+78,8 @@ void transfer_kokkos2shared(af::shared& dst, const view_1d_t& src) { transfer_kokkos2X(dst, src); } +void transfer_vector2kokkos(view_1d_t& dst, const std::vector& src); + template void transfer_vector2kokkos(view_1d_t& dst, const std::vector& src) { if (true) { @@ -88,11 +90,9 @@ void transfer_vector2kokkos(view_1d_t& dst, const std::vector& src) { resize(dst, src.size()); // printf(" - size changed, new size: %d\n", dst.span() ); } - auto host_view = Kokkos::create_mirror_view(dst); - for (int i = 0; i < src.size(); ++i) { - host_view(i) = src[i]; - } - Kokkos::deep_copy(dst, host_view); + auto host_view = Kokkos::View(src.data(), src.size()); + auto dst_subview = Kokkos::subview(dst, std::pair(0, src.size())); + Kokkos::deep_copy(dst_subview, host_view); } template diff --git a/kokkostbx/kokkos_vector.h b/kokkostbx/kokkos_vector.h index e31d905bb6..5bda8e86d7 100644 --- a/kokkostbx/kokkos_vector.h +++ b/kokkostbx/kokkos_vector.h @@ -207,8 +207,9 @@ struct vector_base { } KOKKOS_FUNCTION void operator/=(const NumType& v) { + const NumType v_r = 1 / v; for (size_t i = 0; i < size; ++i) { - data[i] /= v; + data[i] *= v_r; } } @@ -261,14 +262,15 @@ struct vector_base { KOKKOS_FUNCTION NumType length() const { // return sqrt_func(length_sqr()); - return ::Kokkos::Experimental::sqrt(length_sqr()); + return ::Kokkos::sqrt(length_sqr()); } KOKKOS_FUNCTION void normalize() { NumType l = length(); if (l > 0) { + NumType l_r = 1 / l; for (size_t i = 0; i < size; ++i) { - data[i] /= l; + data[i] *= l_r; } } } @@ -277,8 +279,9 @@ struct vector_base { NumType l = length(); Derived unit_vector{}; if (l > 0) { + NumType l_r = 1 / l; for (size_t i = 0; i < size; ++i) { - unit_vector[i] = data[i] / l; + unit_vector[i] = data[i] * l_r; } } return unit_vector; diff --git a/kokkostbx/kokkos_vector3.h b/kokkostbx/kokkos_vector3.h index 98d46ee7dc..058540eb07 100644 --- a/kokkostbx/kokkos_vector3.h +++ b/kokkostbx/kokkos_vector3.h @@ -10,8 +10,8 @@ // #ifdef KOKKOS_CORE_HPP // template KOKKOS_FUNCTION T sin_func(T x) { return -// ::Kokkos::Experimental::sin(x); } template KOKKOS_FUNCTION T cos_func(T x) { -// return ::Kokkos::Experimental::cos(x); } +// ::Kokkos::sin(x); } template KOKKOS_FUNCTION T cos_func(T x) { +// return ::Kokkos::cos(x); } // #else // #include // template KOKKOS_FUNCTION T sin_func(T x) { return sin(x); } @@ -27,26 +27,26 @@ struct vector3 : public vector { using vector_base = kokkostbx::vector; vector3() = default; - KOKKOS_FUNCTION vector3(NumType val) : vector_base(val){}; - KOKKOS_FUNCTION vector3(NumType arr[]) : vector_base(arr){}; - KOKKOS_FUNCTION vector3(const vector_base& vec) : vector_base(vec){}; + KOKKOS_INLINE_FUNCTION vector3(NumType val) : vector_base(val){}; + KOKKOS_INLINE_FUNCTION vector3(NumType arr[]) : vector_base(arr){}; + KOKKOS_INLINE_FUNCTION vector3(const vector_base& vec) : vector_base(vec){}; - KOKKOS_FUNCTION vector3(NumType x, NumType y, NumType z) : vector_base() { + KOKKOS_INLINE_FUNCTION vector3(NumType x, NumType y, NumType z) : vector_base() { vector_base::data[0] = x; vector_base::data[1] = y; vector_base::data[2] = z; } // decided against using properties, as this would increase the size of the class - KOKKOS_FUNCTION NumType& x_val() { return vector_base::data[0]; } - KOKKOS_FUNCTION NumType& y_val() { return vector_base::data[1]; } - KOKKOS_FUNCTION NumType& z_val() { return vector_base::data[2]; } + KOKKOS_INLINE_FUNCTION NumType& x_val() { return vector_base::data[0]; } + KOKKOS_INLINE_FUNCTION NumType& y_val() { return 
vector_base::data[1]; } + KOKKOS_INLINE_FUNCTION NumType& z_val() { return vector_base::data[2]; } - KOKKOS_FUNCTION NumType x_val() const { return vector_base::data[0]; } - KOKKOS_FUNCTION NumType y_val() const { return vector_base::data[1]; } - KOKKOS_FUNCTION NumType z_val() const { return vector_base::data[2]; } + KOKKOS_INLINE_FUNCTION NumType x_val() const { return vector_base::data[0]; } + KOKKOS_INLINE_FUNCTION NumType y_val() const { return vector_base::data[1]; } + KOKKOS_INLINE_FUNCTION NumType z_val() const { return vector_base::data[2]; } - KOKKOS_FUNCTION vector3 cross(const vector3& v) const { + KOKKOS_INLINE_FUNCTION vector3 cross(const vector3& v) const { vector3 cross_vector{}; cross_vector.x_val() = y_val() * v.z_val() - z_val() * v.y_val(); cross_vector.y_val() = z_val() * v.x_val() - x_val() * v.z_val(); @@ -56,14 +56,14 @@ struct vector3 : public vector { } // rotate a point around a unit vector3 axis - KOKKOS_FUNCTION vector3 rotate_around_axis(const vector3& axis, NumType angle) + KOKKOS_INLINE_FUNCTION vector3 rotate_around_axis(const vector3& axis, NumType angle) const { // NumType sinphi = sin_func(angle); // NumType cosphi = cos_func(angle); - NumType sinphi = ::Kokkos::Experimental::sin(angle); - NumType cosphi = ::Kokkos::Experimental::cos(angle); + const NumType sinphi = ::Kokkos::sin(angle); + const NumType cosphi = ::Kokkos::cos(angle); - NumType dot_factor = axis.dot(*this) * (1.0 - cosphi); + const NumType dot_factor = axis.dot(*this) * (1.0 - cosphi); vector3 vector_rot = axis.cross(*this) * sinphi; vector_rot += axis * dot_factor; diff --git a/libtbx/auto_build/bootstrap.py b/libtbx/auto_build/bootstrap.py index 5fbfca691d..fc9ce3f3cc 100644 --- a/libtbx/auto_build/bootstrap.py +++ b/libtbx/auto_build/bootstrap.py @@ -1006,17 +1006,17 @@ class xia2_module(SourceModule): class kokkos_module(SourceModule): module = 'kokkos' - anonymous = ['git', '-b 3.7.01', + anonymous = ['git', '-b 4.2.00', 'git@github.com:kokkos/kokkos.git', 'https://github.com/kokkos/kokkos.git', - 'https://github.com/kokkos/kokkos/archive/refs/tags/3.7.01.zip'] + 'https://github.com/kokkos/kokkos/archive/refs/tags/4.2.00.zip'] class kokkos_kernels_module(SourceModule): module = 'kokkos-kernels' - anonymous = ['git', '-b 3.7.01', + anonymous = ['git', '-b 4.2.00', 'git@github.com:kokkos/kokkos-kernels.git', 'https://github.com/kokkos/kokkos-kernels.git', - 'https://github.com/kokkos/kokkos-kernels/archive/refs/tags/3.7.01.zip'] + 'https://github.com/kokkos/kokkos-kernels/archive/refs/tags/4.2.00.zip'] # Duke repositories class probe_module(SourceModule): diff --git a/simtbx/command_line/complete_an_F.py b/simtbx/command_line/complete_an_F.py new file mode 100644 index 0000000000..1bf34dad3e --- /dev/null +++ b/simtbx/command_line/complete_an_F.py @@ -0,0 +1,48 @@ +from __future__ import division, print_function +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument("mtzin", help="input mtz file", type=str) +parser.add_argument("mtzout", help="output mtz file", type=str) +args = parser.parse_args() + +# LIBTBX_SET_DISPATCHER_NAME diffBragg.completeF + +import numpy as np +from iotbx.reflection_file_reader import any_reflection_file +from dials.array_family import flex +from scipy.interpolate import interp1d +from cctbx import miller + +F = any_reflection_file(args.mtzin).as_miller_arrays()[0] +F = F.as_amplitude_array() +if not F.is_xray_amplitude_array(): + F = F.set_observation_type_xray_amplitude() + +print("Bin-ID Res-range Completeness #ASU-indices") 
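+# Show per-resolution-bin completeness of the input amplitudes, then build the
+# complete Miller set out to d_min and fill any missing indices by interpolating
+# |F| against d-spacing (clamped to the end-point amplitudes outside the observed range).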
+F.show_completeness() +d_max,d_min = F.resolution_range() +print("d_min, d_max (Angstrom): ", d_min, d_max) +mset_full = F.build_miller_set(False, d_min=d_min) +mset_full_d = {h: d for h,d in zip(mset_full.d_spacings().indices(), mset_full.d_spacings().data())} +Fmap = {h:val for h,val in zip(F.indices(), F.data())} +xvals = np.array(F.d_spacings().data()) +yvals = np.array(F.data()) +fill_vals = yvals[np.argmin(xvals)], yvals[np.argmax(xvals)] +I = interp1d(xvals, yvals, fill_value=fill_vals, bounds_error=False) +data = [] +for h in mset_full.indices(): + if h not in Fmap: + d_h = mset_full_d[h] + amp = I(d_h) + else: + amp = Fmap[h] + data.append(amp) + +complete_amps = flex.double(data) +complete_inds = mset_full.indices() +ma = miller.array(mset_full, complete_amps) +if not ma.is_xray_amplitude_array(): + ma = ma.set_observation_type_xray_amplitude() +ma = ma.as_anomalous_array() +assert ma.anomalous_flag() +ma.as_mtz_dataset(column_root_label="F").mtz_object().write(args.mtzout) diff --git a/simtbx/command_line/estimate_Ncells_Eta.py b/simtbx/command_line/estimate_Ncells_Eta.py new file mode 100644 index 0000000000..d6ba9476a4 --- /dev/null +++ b/simtbx/command_line/estimate_Ncells_Eta.py @@ -0,0 +1,111 @@ +from __future__ import division +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument("dirname", help="still process output folder", type=str) +parser.add_argument("--updatePhil", default=None, help="name of an exisiting stage 1 phil file to update (just the init.Ncells portion)", type=str) +parser.add_argument("--expSuffix", help="extension of refined experiments", type=str, default="_refined.expt") +parser.add_argument("--thresh", type=float, default=7, help="MAD score for outliers (default=7 standard deviation above the median)") +parser.add_argument("--useMean", action="store_true", help="set Eta and Nabc using the mean (default is median)") +parser.add_argument("--NabcMax", type=float, default=70, help="If estaimated Nabc is above this value, it will set to this value") +parser.add_argument("--NabcMin", type=float, default=5, help="If estaimated Nabc is BELOW this value, it will be set to this value") +parser.add_argument("--EtaMax", type=float, default=0.5, help="If estimated Eta is above this range, it will be set to this value") +parser.add_argument("--EtaMin", type=float, default=1e-3, help="If estimated Eta is BELOW this range, it will be set to this value") + +#parser.add_argument("--njobs", type=int, default=5, help="number of jobs (only runs on single node, no MPI)") +parser.add_argument("--plot", action="store_true", help="show a histogram at the end") +args = parser.parse_args() +# LIBTBX_SET_DISPATCHER_NAME diffBragg.estimate_Ncells_Eta +from mpi4py import MPI +COMM = MPI.COMM_WORLD +#from joblib import Parallel, delayed +import json +import numpy as np +from cctbx import uctbx +from scitbx.matrix import sqr +import os +import glob +from dxtbx.model import ExperimentList + +glob_s = os.path.join(args.dirname, "*%s" % args.expSuffix) +fnames = glob.glob(glob_s) + +#def main(jid): +all_Ns = [] +all_mos_spreads = [] +for i, f in enumerate(fnames): + if i % COMM.size != COMM.rank: + continue + print(f) + #Cs = ExperimentList.from_file(f, False).crystals() + Cs = json.load(open(f, 'r'))['crystal'] + dom_sizes = np.array([C['ML_domain_size_ang'] for C in Cs]) + mos_spreads = [2*C['ML_half_mosaicity_deg'] for C in Cs] + uc_vols = [] + for C in Cs: + a = C['real_space_a'] + b = C['real_space_b'] + c = C['real_space_c'] + uc = 
uctbx.unit_cell(orthogonalization_matrix=sqr(a + b + c).transpose()) + uc_vols.append(uc.volume()) + + Ns = dom_sizes / np.power(uc_vols, 1 / 3.) + all_Ns += list(Ns) + all_mos_spreads += list(mos_spreads) +# return all_Ns, all_mos_spreads + +all_Ns = COMM.reduce(all_Ns) +all_mos_spreads = COMM.reduce(all_mos_spreads) +#results = Parallel(n_jobs=args.njobs)(delayed(main)(j) for j in range(args.njobs)) +#all_Ns = [] +#all_mos_spreads = [] +#for N,mos in results: +# all_Ns += N +# all_mos_spreads += mos + +if COMM.rank==0: + import pandas + import pylab as plt + from simtbx.diffBragg import utils + all_Ns = np.array(all_Ns) + all_mos_spreads = np.array(all_mos_spreads) + bad_Ns = utils.is_outlier(all_Ns, args.thresh) + bad_mos_spreads = utils.is_outlier(all_mos_spreads, args.thresh) + is_bad = np.logical_or(bad_Ns, bad_mos_spreads) + print("Removing %d outlier estiamtes" % is_bad.sum()) + all_Ns = all_Ns[~is_bad] + all_mos_spreads = all_mos_spreads[~is_bad] + + df = pandas.DataFrame({"Ncells": all_Ns, "mos_spread_deg": all_mos_spreads}) + print(df.Ncells.describe()) + print(df.mos_spread_deg.describe()) + if args.useMean: + mean_N = df.Ncells.mean() + mean_mos = df.mos_spread_deg.mean() + else: + mean_N = df.Ncells.median() + mean_mos = df.mos_spread_deg.median() + print("mean Ncells=%f" % mean_N) + print("mean mos_spread=%f (deg.)" % mean_mos) + + if mean_mos > args.EtaMax or mean_mos < args.EtaMin: + temp = mean_mos + mean_mos = args.EtaMax if mean_mos > args.EtaMax else args.EtaMin + print("Estimated Eta=%f, setting it to %f" % (temp, mean_mos)) + if mean_N > args.NabcMax or mean_N < args.NabcMin: + temp = mean_N + mean_N = args.NabcMax if mean_N > args.NabcMax else args.NabcMin + print("Estimated N=%f, setting it to %f" %(temp, mean_N)) + + phil = """\ninit {{ + Nabc = [{n},{n},{n}] + eta_abc = [{m},{m},{m}] + }}\n""".format(n=round(mean_N,4), m=mean_mos) + + if args.updatePhil is not None: + with open(args.updatePhil, "r+") as o: + s = o.read() + s += phil + o.write(s) + if args.plot: + df.hist(bins=100, log=True) + plt.show() diff --git a/simtbx/command_line/hopper.py b/simtbx/command_line/hopper.py index cfd0e8ac75..32f8216d90 100644 --- a/simtbx/command_line/hopper.py +++ b/simtbx/command_line/hopper.py @@ -96,6 +96,7 @@ def run(self): assert os.path.exists(self.params.exp_ref_spec_file) input_lines = None best_models = None + pd_dir = os.path.join(self.params.outdir, "pandas") if COMM.rank == 0: input_lines = open(self.params.exp_ref_spec_file, "r").readlines() if self.params.skip is not None: @@ -113,6 +114,10 @@ def run(self): if self.params.gathers_dir is None: raise ValueError("Need to provide a file dir path in order to dump_gathers") utils.safe_makedirs(self.params.gathers_dir) + + utils.safe_makedirs(pd_dir) + + COMM.barrier() input_lines = COMM.bcast(input_lines) best_models = COMM.bcast(best_models) @@ -127,53 +132,63 @@ def run(self): exp_gatheredRef_spec = [] # optional list of expt, refls, spectra trefs = [] - for i_exp, line in enumerate(input_lines): - if i_exp == self.params.max_process: + this_rank_dfs = [] # dataframes storing the modeling results for each shot + for i_shot, line in enumerate(input_lines): + if i_shot == self.params.max_process: break - if i_exp % COMM.size != COMM.rank: + if i_shot % COMM.size != COMM.rank: continue - logging.info("COMM.rank %d on shot %d / %d" % (COMM.rank, i_exp + 1, len(input_lines))) + logging.info("COMM.rank %d on shot %d / %d" % (COMM.rank, i_shot + 1, len(input_lines))) line_fields = line.strip().split() - assert 
len(line_fields) in [2, 3] - if len(line_fields) == 2: - exp, ref = line_fields - spec = None - else: - exp, ref, spec = line_fields + num_fields = len(line_fields) + assert num_fields in [2, 3, 4] + exp, ref = line_fields[:2] + spec = None + exp_idx = 0 + if num_fields==3: + try: + exp_idx = int(line_fields[2]) + except ValueError: + spec = line_fields[2] + exp_idx = 0 + elif num_fields==4: + assert os.path.isfile(line_fields[2]) + spec = line_fields[2] + exp_idx = int(line_fields[3]) if self.params.ignore_existing: basename = os.path.splitext(os.path.basename(exp))[0] exists = False - for ii in [i_exp, 0]: - opt_exp = "%s_%s_%d.expt" % (self.params.tag, basename, ii) + for ii in [i_shot, 0]: + opt_exp = "%s_%s_%d_%d.expt" % (self.params.tag, basename, exp_idx, ii) opt_refl = opt_exp.replace(".expt", ".refl") if opt_exp in exp_names_already and opt_refl in refl_names_already: exists = True break if exists: - print("Found existing!! %d" % i_exp) + print("Found existing!! %d" % i_shot) continue best = None if best_models is not None: - best = best_models.query("exp_name=='%s'" % exp) - if len(best) == 0: - best = best_models.query("opt_exp_name=='%s'" % exp) + best = best_models.query("exp_name=='%s'" % exp).query("exp_idx==%d" % exp_idx) if len(best) != 1: raise ValueError("Should be 1 entry for exp %s in best pickle %s" % (exp, self.params.best_pickle)) self.params.simulator.spectrum.filename = spec Modeler = hopper_utils.DataModeler(self.params) Modeler.exper_name = exp + Modeler.exper_idx = exp_idx Modeler.refl_name = ref Modeler.rank = COMM.rank - Modeler.i_exp = i_exp + Modeler.i_shot = i_shot if self.params.load_data_from_refls: gathered = Modeler.GatherFromReflectionTable(exp, ref, sg_symbol=self.params.space_group) else: gathered = Modeler.GatherFromExperiment(exp, ref, remove_duplicate_hkl=self.params.remove_duplicate_hkl, - sg_symbol=self.params.space_group) + sg_symbol=self.params.space_group, + exp_idx=exp_idx) if not gathered: logging.warning("No refls in %s; CONTINUE; COMM.rank=%d" % (ref, COMM.rank)) continue @@ -213,20 +228,24 @@ def run(self): # best pickle is not supported yet for multiple crystals # also, if number of crystals is >1 , then the params.number_of_xtals flag will be overridden exp_list = ExperimentListFactory.from_json_file(exp, False) - xtals = exp_list.crystals() - if len(xtals) > 1: + xtals = exp_list.crystals() # TODO: fix as this is broken now that we allow multi image experiments + if self.params.consider_multicrystal_shots and len(xtals) > 1: assert best is None, "cannot pass best pickle if expt list has more than one crystal" assert self.params.number_of_xtals==1, "if expt list has more than one xtal, leave number_of_xtals as the default" self.params.number_of_xtals = len(xtals) MAIN_LOGGER.debug("Found %d xtals with unit cells:" %len(xtals)) for xtal in xtals: MAIN_LOGGER.debug("%.4f %.4f %.4f %.4f %.4f %.4f" % xtal.get_unit_cell().parameters()) + if self.params.record_device_timings and COMM.rank >0: + self.params.record_device_timings = False # only record for rank 0 otherwise there's too much output SIM = hopper_utils.get_simulator_for_data_modelers(Modeler) Modeler.set_parameters_for_experiment(best) - Modeler.Umatrices = [xtal.get_U() for xtal in xtals] - # TODO, move this to SimulatorFromExperiment + Modeler.Umatrices = [Modeler.E.crystal.get_U()] + + # TODO: move this to SimulatorFromExperiment + # TODO: fix multi crystal shot mode if best is not None and "other_spotscales" in list(best) and "other_Umats" in list(best): - Modeler.Umatrices[0] 
= self.E.get_U() + Modeler.Umatrices[0] = Modeler.E.get_U() assert len(xtals) == len(best.other_spotscales.values[0])+1 for i_xtal in range(1, len(xtals),1): scale_xt = best.other_spotscales.values[0][i_xtal] @@ -250,22 +269,31 @@ def run(self): nparam += SIM.Num_ASU*SIM.num_Fhkl_channels x0 = [1] * nparam tref = time.time() - MAIN_LOGGER.info("Beginning refinement of shot %d / %d" % (i_exp+1, len(input_lines))) + MAIN_LOGGER.info("Beginning refinement of shot %d / %d" % (i_shot+1, len(input_lines))) try: - x = Modeler.Minimize(x0, SIM, i_exp=i_exp) + x = Modeler.Minimize(x0, SIM, i_shot=i_shot) + for i_rep in range(self.params.filter_after_refinement.max_attempts): + final_sigz = Modeler.target.all_sigZ[-1] + niter = len(Modeler.target.all_sigZ) + too_few_iter = niter < self.params.filter_after_refinement.min_prev_niter + too_high_sigz = final_sigz > self.params.filter_after_refinement.max_prev_sigz + if too_few_iter or too_high_sigz: + Modeler.filter_pixels(self.params.filter_after_refinement.threshold) + x = Modeler.Minimize(x0, SIM, i_shot=i_shot) + except StopIteration: x = Modeler.target.x0 tref = time.time()-tref sigz = niter = None try: niter = len(Modeler.target.all_hop_id) - sigz = np.mean(Modeler.target.all_sigZ) + sigz = Modeler.target.all_sigZ[-1] except Exception: pass trefs.append(tref) print_s = "Finished refinement of shot %d / %d in %.4f sec. (rank mean t/im=%.4f sec.)" \ - % (i_exp+1, len(input_lines), tref, np.mean(trefs)) + % (i_shot+1, len(input_lines), tref, np.mean(trefs)) if sigz is not None and niter is not None: print_s += " Ran %d iterations. Final sigmaZ = %.1f," % (niter, sigz) if COMM.rank==0: @@ -275,11 +303,18 @@ def run(self): if self.params.profile: SIM.D.show_timings(COMM.rank) - Modeler.save_up(x,SIM, rank=COMM.rank, i_exp=i_exp) + dbg = self.params.debug_mode + shot_df = Modeler.save_up(x, SIM, rank=COMM.rank, i_shot=i_shot, + save_fhkl_data=dbg, save_refl=dbg, save_modeler_file=dbg, + save_sim_info=dbg, save_pandas=dbg, save_traces=dbg, save_expt=dbg) + this_rank_dfs.append(shot_df) if Modeler.params.refiner.debug_pixel_panelfastslow is not None: # TODO separate diffBragg logger utils.show_diffBragg_state(SIM.D, Modeler.params.refiner.debug_pixel_panelfastslow) + # TODO verify this works: + if SIM.D.record_timings: + SIM.D.show_timings(COMM.rank) Modeler.clean_up(SIM) del SIM.D # TODO: is this necessary ? 
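For reference, the widened exp_ref_spec input format handled in the hunk above accepts two, three, or four whitespace-separated fields per line: experiment, reflections, an optional spectrum file, and an optional experiment index. A minimal standalone sketch of that parsing rule (the helper name is illustrative and not part of the patch):

import os

def parse_exp_ref_spec_line(line):
    """Return (exp, ref, spec, exp_idx) from one exp_ref_spec line."""
    fields = line.strip().split()
    assert len(fields) in (2, 3, 4)
    exp, ref = fields[:2]
    spec, exp_idx = None, 0
    if len(fields) == 3:
        try:
            exp_idx = int(fields[2])  # third field is an experiment index if it parses as an integer
        except ValueError:
            spec = fields[2]          # otherwise it names a spectrum file
    elif len(fields) == 4:
        assert os.path.isfile(fields[2])
        spec, exp_idx = fields[2], int(fields[3])
    return exp, ref, spec, exp_idx

This mirrors the branch added to hopper.py above and matches the four-field lines ("%s %s %s %d") that simtbx.diffBragg.integrate now writes.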
@@ -294,6 +329,21 @@ def run(self): o.write("%s %s\n" % (e,r)) o.close() + if this_rank_dfs: + this_rank_dfs = pandas.concat(this_rank_dfs).reset_index(drop=True) + df_name = os.path.join(pd_dir, "hopper_results_rank%d.pkl" % COMM.rank) + this_rank_dfs.to_pickle(df_name) + + #MAIN_LOGGER.info("MPI-Gathering data frames across ranks") + #all_rank_dfs = COMM.gather(this_rank_dfs) + #if COMM.rank==0: + # all_rank_dfs = pandas.concat(all_rank_dfs) + # all_rank_dfs.reset_index(inplace=True, drop=True) + # all_df_name = os.path.join(self.params.outdir, "hopper_results.pkl") + # all_rank_dfs.to_pickle(all_df_name) + + + if __name__ == '__main__': from dials.util import show_mail_on_error @@ -311,6 +361,7 @@ def run(self): print("Install line_profiler in order to use logging: libtbx.python -m pip install line_profiler") with DeviceWrapper(script.dev) as _: + #with np.errstate(all='raise'): RUN() if lp is not None: diff --git a/simtbx/command_line/hopper_ensemble.py b/simtbx/command_line/hopper_ensemble.py index ca578a9214..49e0d2e449 100644 --- a/simtbx/command_line/hopper_ensemble.py +++ b/simtbx/command_line/hopper_ensemble.py @@ -7,7 +7,7 @@ parser.add_argument("input", type=str, help="combined pandas pickle") parser.add_argument("phil", type=str, help="user phil file used to run hopper (see simtbx/diffBragg/phil.py)") parser.add_argument("--outdir", type=str, default=None, help="output folder") -parser.add_argument("--exp", type=str, default="opt_exp_name", help="column name for input expeirments (default is opt_exp_name)") +parser.add_argument("--exp", type=str, default="exp_name", help="column name for input experiments (default is exp_name)") parser.add_argument("--refl", type=str, default="stage2_refls", help="column name for refls (default is stage2_refls)") parser.add_argument("--cmdlinePhil", nargs="+", default=None, type=str, help="command line phil params") parser.add_argument("--cell", nargs=6, type=float, default=None, help="unit cell to use when writing MTZ files. 
If not provided, average will be used") @@ -25,7 +25,7 @@ from simtbx.diffBragg.hopper_ensemble_utils import load_inputs from libtbx.mpi4py import MPI - +from simtbx.diffBragg.device import DeviceWrapper COMM= MPI.COMM_WORLD LOGGER = logging.getLogger("diffBragg.main") @@ -66,6 +66,8 @@ def write_commandline(params): if args.outdir is not None: params.outdir = args.outdir params.tag = args.saveTag + if params.record_device_timings and COMM.rank > 0: + params.record_device_timings = False # only record for rank 0 otherwise there's too much output # end of phil stuff ======== write_commandline(params) @@ -79,8 +81,8 @@ def write_commandline(params): if params.skip is not None: df = df.iloc[params.skip:] - if params.first_n is not None: - df = df.iloc[:params.first_n] + if params.max_process is not None: + df = df.iloc[:params.max_process] df.reset_index(inplace=True, drop=True) gather_dir=None @@ -91,16 +93,24 @@ def write_commandline(params): if not os.path.exists(gather_dir): os.makedirs(gather_dir) + for col in [args.exp, args.refl]: + if col not in list(df): + raise KeyError("Col %s is missing from dataframe" % col) + modelers = load_inputs(df, params, exper_key=args.exp, refls_key=args.refl, gather_dir=gather_dir) # note, we only go beyond this point if perImport flag was not passed modelers.cell_for_mtz = args.cell modelers.max_sigma = args.maxSigma modelers.outdir = args.outdir if args.outdir is not None else modelers.params.outdir modelers.save_freq = args.saveFreq + modelers.prep_for_refinement() - modelers.save_modeler_params = args.saveAll - # do all sanity checks up front before minimization - modelers.Minimize(save=True) + with DeviceWrapper(modelers.SIM.D.device_Id) as _: + modelers.alloc_max_pix_per_shot() + modelers.save_modeler_params = args.saveAll + + # do all sanity checks up front before minimization + modelers.Minimize(save=True) - LOGGER.debug("Done!") + LOGGER.debug("Done!") diff --git a/simtbx/command_line/hopper_process.py b/simtbx/command_line/hopper_process.py index f04d136b72..f170c89018 100644 --- a/simtbx/command_line/hopper_process.py +++ b/simtbx/command_line/hopper_process.py @@ -256,7 +256,7 @@ def integrate(self, experiments, indexed): ) if self.params.dispatch.coset: - from dials.algorithms.integration.sublattice_helper import integrate_coset + from xfel.util.sublattice_helper import integrate_coset integrate_coset(self, experiments, indexed) diff --git a/simtbx/command_line/integrate.py b/simtbx/command_line/integrate.py index c23e6011e9..e1d0cdd3c1 100644 --- a/simtbx/command_line/integrate.py +++ b/simtbx/command_line/integrate.py @@ -10,17 +10,29 @@ parser.add_argument("outdir", type=str, help="path to output refls") parser.add_argument("--cmdlinePhil", nargs="+", default=None, type=str, help="command line phil params") +parser.add_argument("--dialsInteg", action="store_true", help="Integrate new shoeboxes using dials and write *integrated.expt files") parser.add_argument("--numdev", type=int, default=1, help="number of GPUs (default=1)") parser.add_argument("--pklTag", type=str, help="optional suffix for globbing for pandas pickles (default .pkl)", default=".pkl") parser.add_argument("--loud", action="store_true", help="show lots of screen output") parser.add_argument("--hopInputName", default="preds_for_hopper", type=str, help="write exp_ref_spec file and best_pickle pointing to the preditction models, such that one can run predicted rois through simtbx.diffBragg.hopper (e.g. 
to fit per-roi scale factors)") parser.add_argument("--filterDupes", action="store_true", help="filter refls with same HKL") +parser.add_argument("--keepShoeboxes", action="store_true", help="Optionally keep shoeboxes present in the prediction refl tables (can lead to OOM errors)") +parser.add_argument("--scanWeakFracs", action="store_true", help="optionally stores a variety of inputs for stage2 based filtering different fractions of weak reflections") args = parser.parse_args() from mpi4py import MPI COMM = MPI.COMM_WORLD +import logging +if not args.loud: + logging.disable(logging.CRITICAL) +else: + if COMM.rank==0: + logger = logging.getLogger("diffBragg.main") + logger.setLevel(logging.DEBUG) + + def printR(*args, **kwargs): print("RANK %d" % COMM.rank, *args, **kwargs) def print0(*args, **kwargs): @@ -28,22 +40,40 @@ def print0(*args, **kwargs): print(*args, **kwargs) import numpy as np -from simtbx.diffBragg import utils +import json +from simtbx.diffBragg import hopper_utils, utils from simtbx.modeling import predictions from simtbx.diffBragg.hopper_utils import downsamp_spec_from_params import glob import pandas import os -import shutil from dials.algorithms.integration.stills_significance_filter import SignificanceFilter from dials.algorithms.indexing.stills_indexer import calc_2D_rmsd_and_displacements -import logging import sys -if not args.loud: - logging.disable(logging.CRITICAL) -else: - logging.basicConfig(level=logging.DEBUG) + +def filter_weak_reflections(refls, weak_fraction): + """ + :param pred: reflection table created by this script + :param weak_fraction: number from 0-1 (if 0, only strong spots are saved) + :return: new reflection table with weak reflections filtered according to weak_fraction + """ + new_refls = None + for idx in set(refls['id']): + pred = refls.select(refls['id']==idx) + weaks = pred.select(pred['is_weak']) + nweak = len(weaks) + weaks_sorted = np.argsort(weaks["scatter"])[::-1] + num_keep = int(nweak * weak_fraction) + weak_refl_inds_keep = set(np.array(weaks["refl_idx"])[weaks_sorted[:num_keep]]) + weak_sel = flex.bool([i in weak_refl_inds_keep for i in pred['refl_idx']]) + keeps = np.logical_or(pred['is_strong'], weak_sel) + pred = pred.select(flex.bool(keeps)) + if new_refls is None: + new_refls = deepcopy(pred) + else: + new_refls.extend(pred) + return new_refls # Note: these imports and following 3 methods will eventually be in CCTBX/simtbx/diffBragg/utils @@ -61,6 +91,7 @@ def print0(*args, **kwargs): from copy import deepcopy from collections import Counter +from simtbx.diffBragg.device import DeviceWrapper def filter_refls(R): vec3_dbl_keys = 'xyzcal.px', 'xyzcal.mm', 'xyzobs.px.value', 'xyzobs.px.value', 'rlp', 's1' @@ -338,148 +369,248 @@ def refls_from_sims(panel_imgs, detector, beam, thresh=0, filter=None, panel_ids os.makedirs(args.outdir) COMM.barrier() + #rank_outdir = os.path.join( args.outdir, "rank%d" % COMM.rank) + #if not os.path.exists(rank_outdir): + # os.makedirs(rank_outdir) + params = utils.get_extracted_params_from_phil_sources(args.predPhil, args.cmdlinePhil) - if os.path.isfile(args.inputGlob): - df_all = pandas.read_pickle(args.inputGlob) - df_all.reset_index(inplace=True, drop=True) - def df_iter(): - for i_f in range(len(df_all)): - if i_f % COMM.size != COMM.rank: - continue - df_i = df_all.iloc[i_f:i_f+1].copy().reset_index(drop=True) - yield i_f, df_i - Nf = len(df_all) - else: - if os.path.isdir(args.inputGlob): - glob_s = os.path.join(args.inputGlob, "pandas/rank*/*.pkl") - fnames = glob.glob(glob_s) + + # 
inputGlob can be a glob in strings, a single pandas file, or a hopper output folder + if os.path.isfile(args.inputGlob) or os.path.isdir(args.inputGlob): + if os.path.isfile(args.inputGlob): + fnames = [args.inputGlob] else: - fnames = glob.glob(args.inputGlob) - def df_iter(): - for i_f,f in enumerate(fnames): - if i_f % COMM.size != COMM.rank: - continue - df = pandas.read_pickle(f) - yield i_f, df - Nf = len(fnames) + dirname = args.inputGlob + fnames = glob.glob( os.path.join(dirname, "pandas/hopper_results_rank*.pkl")) + else: + fnames = glob.glob(args.inputGlob) + + if not fnames: + raise OSError("Found no filenames to load!") + Nf = 0 + shots_per_df = [] + print0("getting total number of shots") + for i_f, f in enumerate(fnames): + if i_f % COMM.size != COMM.rank: + continue + n = len(pandas.read_pickle(f)) + shots_per_df += [(f, str(x)) for x in range(n)] # note we cast to string because of mpi reduce + Nf += n + + shots_per_df = COMM.bcast(COMM.reduce( shots_per_df)) + Nf = COMM.bcast(COMM.reduce(Nf)) + print0("total num shots is %d" % Nf) + df_rows_per_rank = np.array_split(shots_per_df, COMM.size)[COMM.rank] + + print0("getting dataframe handles") + df_handles = {} + dfs = [] + if df_rows_per_rank.size: + dfs, _ = zip(*df_rows_per_rank) + for f in set(dfs): + df_handles[f] = pandas.read_pickle(f).reset_index(drop=True) + + def df_iter(): + for i_df, (df_name, row_idx) in enumerate(df_rows_per_rank): + row_idx = int(row_idx) + printR("Opening shot %d / %d" % (i_df+1, len(df_rows_per_rank))) + df = df_handles[df_name].iloc[row_idx: row_idx+1].copy() + yield i_df, df if params.predictions.verbose: params.predictions.verbose = COMM.rank==0 dev = COMM.rank % args.numdev - print0("Found %d input files" % Nf) - - all_dfs = [] - all_pred_names = [] - exp_ref_spec_lines = [] - for i_f, df in df_iter(): - printR("Shot %d / %d" % (i_f+1, Nf), flush=True) - - expt_name = df.opt_exp_name.values[0] - tag = os.path.splitext(os.path.basename(expt_name))[0] - new_expt_name = "%s/%s_%d_predicted.expt" % (args.outdir,tag, i_f) - new_expt_name = os.path.abspath(new_expt_name) - df["opt_exp_name"] = new_expt_name - - shutil.copyfile(expt_name, new_expt_name) - - data_exptList = ExperimentList.from_file(expt_name) - data_expt = data_exptList[0] - - try: - spectrum_override = None - if params.spectrum_from_imageset: - spectrum_override = downsamp_spec_from_params(params, data_expt) - pred = predictions.get_predicted_from_pandas( - df, params, strong=None, device_Id=dev, spectrum_override=spectrum_override) - if args.filterDupes: - pred = filter_refls(pred) - except ValueError: - os.remove(new_expt_name) - continue - - data = utils.image_data_from_expt(data_expt) - Rstrong = refls_from_sims(data, data_expt.detector, data_expt.beam, phil_file=args.procPhil ) - Rstrong['id'] = flex.int(len(Rstrong), 0) - num_panels = len(data_expt.detector) - if num_panels > 1: - assert params.predictions.label_weak_col == "rlp" - - Rstrong.centroid_px_to_mm(data_exptList) - Rstrong.map_centroids_to_reciprocal_space(data_exptList) - predictions.label_weak_predictions(pred, Rstrong, q_cutoff=params.predictions.qcut, col=params.predictions.label_weak_col ) - - pred['is_strong'] = flex.bool(np.logical_not(pred['is_weak'])) - strong_sel = np.logical_not(pred['is_weak']) - - pred["refl_idx"] = flex.int(np.arange(len(pred))) - weaks = pred.select(pred['is_weak']) - weaks_sorted = np.argsort(weaks["scatter"])[::-1] - nweak = len(weaks) - num_keep = int(nweak*params.predictions.weak_fraction) - weak_refl_inds_keep = 
set(np.array(weaks["refl_idx"])[weaks_sorted[:num_keep]]) - - weak_sel = flex.bool([i in weak_refl_inds_keep for i in pred['refl_idx']]) - keeps = np.logical_or( pred['is_strong'], weak_sel) - #printR("Sum keeps=%d; num_strong=%d, num_kept_weak=%d" % (sum(keeps), sum(strong_sel), sum(weak_sel))) - pred = pred.select(flex.bool(keeps)) - nstrong = np.sum(strong_sel) - printR("Will save %d refls (%d strong, %d weak)" % (len(pred), np.sum(strong_sel), np.sum(weak_sel))) - pred_file = os.path.abspath("%s/%s_%d_predicted.refl" % ( args.outdir, tag, i_f)) - pred.as_file(pred_file) - - Rindexed = Rstrong.select(Rstrong['indexed']) - if len(Rindexed)==0: - print("No strong indexed refls for shot %s" % new_expt_name) - continue + EXPT_DIRS = os.path.join(args.outdir, "expts_and_refls") + if COMM.rank==0: + utils.safe_makedirs(EXPT_DIRS) - utils.refls_to_hkl(Rindexed, data_expt.detector, data_expt.beam, data_expt.crystal, update_table=True) - try: - int_expt, int_refl = integrate(args.procPhil, data_exptList, Rindexed, pred) - int_expt_name = "%s/%s_%d_integrated.expt" % ( args.outdir,tag, i_f) - int_expt.as_file(int_expt_name) - int_refl['bbox'] = int_refl['shoebox'].bounding_boxes() - int_refl_name = int_expt_name.replace(".expt", ".refl") - int_refl.as_file(int_refl_name) - except RuntimeError as err: - print("Integration failed for %s because: '%s'" % (pred_file, str(err))) - - all_dfs.append(df) - all_pred_names.append(pred_file) - spec_name = df.spectrum_filename.values[0] - if spec_name is None: - spec_name = "" - exp_ref_spec_lines.append("%s %s %s\n" % (new_expt_name, pred_file, spec_name)) - - if all_dfs: - all_dfs = pandas.concat(all_dfs) - all_dfs["predicted_refls"] = all_pred_names - all_dfs["predictions"] = all_pred_names - else: - all_dfs = None + if args.scanWeakFracs and params.predictions.weak_fraction != 1: + print("WARNING: overriding weak_fracion because of scanWeakFracs") + params.predictions.weak_fraction=1 - all_dfs = COMM.gather(all_dfs) - exp_ref_spec_lines = COMM.reduce(exp_ref_spec_lines) - print0("\nReflections written to folder %s.\n" % args.outdir) - if COMM.rank==0: - hopper_input_name = os.path.abspath(os.path.join(args.outdir , "%s.txt" % args.hopInputName)) - o = open(hopper_input_name, "w") - for l in exp_ref_spec_lines: - o.write(l) - o.close() - all_dfs = [df for df in all_dfs if df is not None] - if not all_dfs: - raise ValueError("No dataframes to concat: prediction/integration failed for all shots..") - - all_dfs = pandas.concat([df for df in all_dfs if df is not None]) - all_dfs.reset_index(inplace=True, drop=True) - best_pkl_name = os.path.abspath(os.path.join(args.outdir , "%s.pkl" % args.hopInputName)) - all_dfs.to_pickle(best_pkl_name) - print("Wrote %s (best_pickle option for simtbx.diffBragg.hopper) and %s (exp_ref_spec option for simtbx.diffBragg.hopper). Use them to run the predictions through hopper. 
Use the centroid=cal option to specify the predictions" % (best_pkl_name, hopper_input_name)) - - with open(args.outdir +".exectution.txt", "w") as o: - o.write("integrate was run from folder: %s\n" % os.getcwd()) - o.write("The command line input was:\n") - o.write(" ".join(sys.argv)) - #TODO: write the diff phils here: + print0("Found %d input files" % Nf) + with DeviceWrapper(dev) as _: + all_dfs = [] + all_pred_names = [] + exp_ref_spec_lines = [] + all_rank_pred = None + all_rank_expt = None + + rank_shot_count = 0 + rank_pred_file = os.path.join(EXPT_DIRS, "rank%d_preds.refl" % COMM.rank) + rank_pred_file = os.path.abspath(rank_pred_file) + rank_expt_file = rank_pred_file.replace(".refl", ".expt") + for i_f, df in df_iter(): + + expt_name = df.exp_name.values[0] + expt_idx = df.exp_idx.values[0] + tag = os.path.splitext(os.path.basename(expt_name))[0] + + data_expt = hopper_utils.DataModeler.exper_json_single_file(expt_name, expt_idx) + data_exptList = ExperimentList() + data_exptList.append(data_expt) + + try: + spectrum_override = None + if params.spectrum_from_imageset: + spectrum_override = downsamp_spec_from_params(params, data_expt) + pred = predictions.get_predicted_from_pandas( + df, params, strong=None, device_Id=dev, spectrum_override=spectrum_override) + if args.filterDupes: + pred = filter_refls(pred) + except ValueError: + #os.remove(new_expt_name) + continue + + data = utils.image_data_from_expt(data_expt) + Rstrong = refls_from_sims(data, data_expt.detector, data_expt.beam, phil_file=args.procPhil ) + Rstrong['id'] = flex.int(len(Rstrong), 0) + num_panels = len(data_expt.detector) + if num_panels > 1: + assert params.predictions.label_weak_col == "rlp" + + Rstrong.centroid_px_to_mm(data_exptList) + Rstrong.map_centroids_to_reciprocal_space(data_exptList) + predictions.label_weak_predictions(pred, Rstrong, q_cutoff=params.predictions.qcut, col=params.predictions.label_weak_col ) + + pred['is_strong'] = flex.bool(np.logical_not(pred['is_weak'])) + strong_sel = np.logical_not(pred['is_weak']) + + pred["refl_idx"] = flex.int(np.arange(len(pred))) + + #weaks = pred.select(pred['is_weak']) + #weaks_sorted = np.argsort(weaks["scatter"])[::-1] + #nweak = len(weaks) + #num_keep = int(nweak*params.predictions.weak_fraction) + #weak_refl_inds_keep = set(np.array(weaks["refl_idx"])[weaks_sorted[:num_keep]]) + #weak_sel = flex.bool([i in weak_refl_inds_keep for i in pred['refl_idx']]) + #keeps = np.logical_or( pred['is_strong'], weak_sel) + #printR("Sum keeps=%d; num_strong=%d, num_kept_weak=%d" % (sum(keeps), sum(strong_sel), sum(weak_sel))) + #pred = pred.select(flex.bool(keeps)) + pred = filter_weak_reflections(pred, weak_fraction=params.predictions.weak_fraction) + + nstrong = np.sum(strong_sel) + printR("Will save %d refls (%d strong, %d weak)" % (len(pred), np.sum(pred["is_strong"]), np.sum(pred["is_weak"]))) + pred['id'] = flex.int(len(pred), rank_shot_count) + if 'shoebox' in list(pred) and not args.keepShoeboxes: + del pred['shoebox'] + if all_rank_pred is None: + all_rank_pred = deepcopy(pred) + else: + all_rank_pred.extend(pred) + + # Note, the simple append causes memory leak: + #all_rank_expt.append(data_expt) + if all_rank_expt is None: + all_rank_expt = deepcopy(data_exptList.to_dict()) + else: + Edict = data_exptList.to_dict() + for exp_key in 'beam', 'detector', 'crystal', 'imageset': + Edict['experiment'][0][exp_key] = rank_shot_count + for exp_key in 'experiment', 'beam', 'detector', 'crystal', 'imageset': + assert len( Edict[exp_key])==1 + all_rank_expt[exp_key] 
.append(Edict[exp_key][0]) + + Rindexed = Rstrong.select(Rstrong['indexed']) + if len(Rindexed)==0: + print("No strong indexed refls for shot %s" % expt_name) + continue + + utils.refls_to_hkl(Rindexed, data_expt.detector, data_expt.beam, data_expt.crystal, update_table=True) + if args.dialsInteg: + # TODO: save these files as multi-shot experiment/refls + try: + int_expt, int_refl = integrate(args.procPhil, data_exptList, Rindexed, pred) + int_expt_name = "%s/%s_%d_integrated.expt" % (rank_outdir, tag, i_f) + int_expt.as_file(int_expt_name) + int_refl['bbox'] = int_refl['shoebox'].bounding_boxes() + int_refl_name = int_expt_name.replace(".expt", ".refl") + int_refl.as_file(int_refl_name) + except RuntimeError: + print("Integration failed" ) + + df['old_exp_name'] = expt_name + df['old_exp_idx'] = expt_idx + df['exp_name'] = rank_expt_file + df['exp_idx'] = rank_shot_count + + df['predictions'] = rank_pred_file + df['predicted_refs'] = rank_pred_file + df['num_pred'] = len(pred) + + all_dfs.append(df) + rank_shot_count += 1 + + spec_name = df.spectrum_filename.values[0] + if spec_name is None: + spec_name = "" + exp_ref_spec_lines.append("%s %s %s %d\n" % (rank_expt_file, rank_pred_file, spec_name, rank_shot_count)) + + all_rank_pred.as_file(rank_pred_file) + # NOTE: all_rank_expt is a dictionary to avoid weird OOM, so we write a simple json + #all_rank_expt.as_file(rank_expt_file) + with open(rank_expt_file, "w") as file_O: + json.dump(all_rank_expt, file_O) + print0("Done with predictions, combining dataframes") + if all_dfs: + all_dfs = pandas.concat(all_dfs) + else: + all_dfs = None + + if args.scanWeakFracs and all_dfs is not None: + assert len(all_dfs.predictions.unique()) == 1 + pred_file = all_dfs.predictions.values[0] + n_total_weak = np.sum(all_rank_pred['is_weak']) + n_total = len(all_rank_pred) + weak_fracs = [.11,.22,.33,.44,.55,.66,.77,.88] + labels = [] + for i_frac, weak_frac in enumerate(weak_fracs): + filt_refls = filter_weak_reflections(all_rank_pred, weak_frac) + label="%dperc"%(weak_frac*100,) + labels.append(label) + new_pred_file = os.path.splitext(pred_file)[0]+"_%s.refl" % label + all_dfs['predictions_%s' % label] = new_pred_file + all_dfs['predicted_refs_%s' %label] = new_pred_file + num_preds = [] + for exp_id in all_dfs.exp_idx.values: + n = np.sum(filt_refls['id'] == int(exp_id)) + num_preds.append(n) + all_dfs['num_pred_%s' %label] = num_preds + filt_refls.as_file(new_pred_file) + printR("Saved %d/%d refls (%d strong, %d/%d weak) to %s" + % (len(filt_refls), n_total, np.sum(filt_refls["is_strong"]), np.sum(filt_refls["is_weak"]), n_total_weak, new_pred_file)) + # note this sanity check below requires that weaK_fracs be sorted + if sorted(weak_fracs) == weak_fracs: + # then as weak frac increases, there should be an increasing number of predictions + num_preds_per_frac = [all_dfs["num_pred_%s" % lab].sum() for lab in labels] + assert num_preds_per_frac == sorted(num_preds_per_frac) + + print0("MPI gather all_dfs") + all_dfs = COMM.gather(all_dfs) + print0("MPI reduce lines") + exp_ref_spec_lines = COMM.reduce(exp_ref_spec_lines) + if COMM.rank==0: + hopper_input_name = os.path.abspath(os.path.join(args.outdir , "%s.txt" % args.hopInputName)) + o = open(hopper_input_name, "w") + for l in exp_ref_spec_lines: + o.write(l) + o.close() + all_dfs = [df for df in all_dfs if df is not None] + if not all_dfs: + raise ValueError("No dataframes to concat: prediction/integration failed for all shots..") + + print("Concat frames") + all_dfs = pandas.concat([df for df in 
all_dfs if df is not None]) + all_dfs.reset_index(inplace=True, drop=True) + best_pkl_name = os.path.abspath(os.path.join(args.outdir , "%s.pkl" % args.hopInputName)) + all_dfs.to_pickle(best_pkl_name) + print("Wrote %s (best_pickle option for simtbx.diffBragg.hopper) and %s (exp_ref_spec option for simtbx.diffBragg.hopper). Use them to run the predictions through hopper (use phil centroid=cal) or simtbx.diffBragg.stage_two." % (best_pkl_name, hopper_input_name)) + + cmd_log_file = os.path.join(args.outdir, "cmdline_execution.txt") + with open(cmd_log_file, "w") as o: + o.write("integrate was run from folder: %s\n" % os.getcwd()) + o.write("The command line input was:\n") + o.write(" ".join(sys.argv) + "\n") + #TODO: write the diff phils here: diff --git a/simtbx/command_line/make_input_file.py b/simtbx/command_line/make_input_file.py index b63b9c4ac5..3229cd1b53 100644 --- a/simtbx/command_line/make_input_file.py +++ b/simtbx/command_line/make_input_file.py @@ -11,6 +11,7 @@ "split files will be written in same folders as their sources") parser.add_argument("--exptSuffix", type=str, default="refined.expt", help="find experiments with this suffix") parser.add_argument("--reflSuffix", type=str, default="indexed.refl", help="find reflection files with this suffix") +parser.add_argument("--write", action="store_true") args = parser.parse_args() @@ -25,10 +26,11 @@ from simtbx.diffBragg import hopper_io import hashlib + from libtbx.mpi4py import MPI COMM = MPI.COMM_WORLD -if COMM.rank==0: +if COMM.rank==0 and args.write: if args.splitDir is not None and not os.path.exists(args.splitDir): os.makedirs(args.splitDir) @@ -48,7 +50,7 @@ def get_idx_path(El): return idx, path -def split_stills_expts(expt_f, refl_f, split_dir): +def split_stills_expts(expt_f, refl_f, split_dir, write=False): El = ExperimentList.from_file(expt_f, False) R = flex.reflection_table.from_file(refl_f) expt_names = [] @@ -65,15 +67,16 @@ def split_stills_expts(expt_f, refl_f, split_dir): seen_isets[iset_id] += 1 tag = "%s-%d" % (os.path.basename(os.path.splitext(path)[0]), idx) new_expt_name = os.path.splitext(expt_f)[0] + "_%s_xtal%d.expt" % (tag, seen_isets[iset_id]) - if split_dir is not None: + if write and split_dir is not None: unique_tag = "shot_%s" % hash_name(new_expt_name) + ".expt" new_expt_name = os.path.join(split_dir, unique_tag) new_refl_name = new_expt_name.replace(".expt", ".refl") refls = R.select(R['id'] == i_expt) refls.reset_ids() - one_exp_El.as_file(new_expt_name) - refls.as_file(new_refl_name) + if write: + one_exp_El.as_file(new_expt_name) + refls.as_file(new_refl_name) expt_names.append(new_expt_name) refl_names.append(new_refl_name) orig_expt_names.append((apath(new_expt_name), (apath(expt_f), i_expt))) @@ -119,12 +122,20 @@ def split_stills_expts(expt_f, refl_f, split_dir): if COMM.rank==0: - print("Saving the input file for diffBragg") - hopper_io.save_expt_refl_file(args.filename, exp_names, ref_names, check_exists=True) - print("Saved %s" % args.filename) - - jname = args.filename + ".json" - jdat = {"expt": dict(orig_exp_names), "refl": dict(orig_ref_names)} - with open(jname, "w") as fp: - json.dump(jdat, fp, indent=1) - print("Wrote json %s, which maps the hashnames to the original expt files" % jname) + if args.write: + print("Saving the input file for diffBragg") + hopper_io.save_expt_refl_file(args.filename, exp_names, ref_names, check_exists=True) + print("Saved %s" % args.filename) + jname = args.filename + ".json" + jdat = {"expt": dict(orig_exp_names), "refl": dict(orig_ref_names)} + 
with open(jname, "w") as fp: + json.dump(jdat, fp, indent=1) + print("Wrote json %s, which maps the hashnames to the original expt files" % jname) + else: + _, exp_and_idx = zip(*orig_exp_names) + _, ref_and_idx = zip(*orig_ref_names) + exp, exp_idx = zip(*exp_and_idx) + ref, ref_idx = zip(*ref_and_idx) + assert exp_idx == ref_idx + hopper_io.save_expt_refl_file(args.filename, exp, ref, check_exists=True, + indices=exp_idx) diff --git a/simtbx/command_line/spectra.py b/simtbx/command_line/spectra.py new file mode 100644 index 0000000000..0ccfa78791 --- /dev/null +++ b/simtbx/command_line/spectra.py @@ -0,0 +1,99 @@ +from __future__ import division + +# LIBTBX_SET_DISPATCHER_NAME diffBragg.spectra + +#TODO: add text entry boxes for other spectrum filter params, position entry boxes sensibly + +from simtbx.diffBragg import hopper_utils +from pylab import * +import dxtbx +from simtbx.diffBragg.phil import philz, hopper_phil +from libtbx.phil import parse +from matplotlib.widgets import TextBox + +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument("image_file", type=str, help="path to a diffBragg modeler file (output from hopper, see the imgs folder in the outdir)") +parser.add_argument("--filt_freq", default=0.07, type=float) +parser.add_argument("--filt_order", default=3, type=float) +parser.add_argument("--tail", default=50, type=int) +parser.add_argument("--delta_en", default=0.5, type=float) +parser.add_argument("--skip", action="store_true") +args = parser.parse_args() + +FIG,ax0 = subplots(nrows=1,ncols=1) +FIG.set_size_inches((5,3)) + +class P: + + def __init__(self, params, imgset, ax0, FIG): + self.params = params + self.imgset = imgset + self.ax0 = ax0 + self.FIG= FIG + + def entry(self, text): + delta_en = float(text) + self.params.downsamp_spec.delta_en = delta_en + print(delta_en) + self.update_plot(self.imgset) + + def update_plot(self, i_img): + raw_spec = self.imgset.get_spectrum(i_img) + raw_en = raw_spec.get_energies_eV() + raw_wt = raw_spec.get_weights() + + spec = hopper_utils.downsamp_spec_from_params(self.params, imgset=self.imgset, i_img=i_img) + en, wt = map(np.array, zip(*spec)) + en = hopper_utils.utils.ENERGY_CONV / en + + self.ax0.clear() + self.ax0.plot( raw_en, raw_wt, lw=2, label="raw spec (%d chan)" % len(raw_en)) + self.ax0.plot( en, wt, '--', lw=1, label="filt spec (%d chan)" % len(en)) + self.FIG.suptitle("image %d" % (i_img, ), fontsize=14) + self.ax0.set_xlabel("Energy (eV)", fontsize=12) + self.ax0.tick_params(labelsize=11) + self.ax0.legend(prop={"size":11}, loc="upper right") + draw() + + +def press(event): + if event.key == 'right': + FIG.loop_counter += 1 + elif event.key=="left": + FIG.loop_counter = FIG.loop_counter -1 + FIG.loop_counter = max(FIG.loop_counter,0) + FIG.loop_counter = min(FIG.loop_counter, FIG.nspots-1) + + if event.key=="escape": + FIG.loop_counter = FIG.nspots + +FIG.loop_counter = 0 +loader = dxtbx.load(args.image_file) +imgset = loader.get_imageset(loader.get_image_file()) +FIG.nspots = len(imgset) +FIG.canvas.mpl_connect('key_press_event', press) + +phil_scope = parse(philz + hopper_phil) +params = phil_scope.fetch(sources=[phil_scope]).extract() + +params.downsamp_spec.filt_freq = args.filt_freq +params.downsamp_spec.filt_order = args.filt_order +params.downsamp_spec.tail = args.tail +params.downsamp_spec.delta_en = args.delta_en +params.downsamp_spec.skip = args.skip + +Pinst = P(params, imgset, ax0, FIG) +axbox = plt.axes([0.25, 0.75, 0.1, 0.075]) +text_box = TextBox(axbox, 'delta_en', 
initial="%.2f" % args.delta_en) +text_box.on_submit(Pinst.entry) + +while FIG.loop_counter < len(imgset): + i_img = FIG.loop_counter + + Pinst.update_plot(i_img) + + waitforbuttonpress() + i_img += 1 + +plt.close() diff --git a/simtbx/command_line/stage_two.py b/simtbx/command_line/stage_two.py index bb72c9a541..0787a9867f 100644 --- a/simtbx/command_line/stage_two.py +++ b/simtbx/command_line/stage_two.py @@ -3,13 +3,13 @@ # LIBTBX_SET_DISPATCHER_NAME simtbx.diffBragg.stage_two from libtbx.mpi4py import MPI -from simtbx.kokkos import gpu_instance -kokkos_run = gpu_instance(deviceId = 0) from simtbx.command_line.hopper import hopper_phil import time import logging from simtbx.diffBragg import mpi_logger +from simtbx.diffBragg.device import DeviceWrapper + COMM = MPI.COMM_WORLD if COMM.rank > 0: @@ -28,10 +28,13 @@ pandas_table = None .type = str .help = path to an input pandas table (usually output by simtbx.diffBragg.predictions) -prep_time = 60 +refls_key = predictions + .type = str + .help = name of the predicted refls column in the pandas table input +max_sigz = 10. .type = float - .help = Time spent optimizing order of input dataframe to better divide shots across ranks - .help = Unit is seconds, 1-2 minutes of prep might save a lot of time during refinement! + .help = Maximum allowed value of sigz in the input pandas table (dataframe) + .help = (high sigz above 10 usually indicates failed stage 1 refinement) """ philz = script_phil + philz + hopper_phil @@ -85,6 +88,7 @@ def run(self): if LineProfiler is not None and script.params.profile: lp = LineProfiler() lp.add_function(ensemble_refine_launcher.RefineLauncher.launch_refiner) + lp.add_function(ensemble_refine_launcher.RefineLauncher.load_inputs) lp.add_function(stage_two_refiner.StageTwoRefiner._compute_functional_and_gradients) lp.add_function(stage_two_refiner.StageTwoRefiner._run_diffBragg_current) lp.add_function(stage_two_refiner.StageTwoRefiner._update_Fcell) @@ -108,7 +112,9 @@ def run(self): else: mpi_logger.setup_logging_from_params(script.params) - RUN() + dev = COMM.rank % script.params.refiner.num_devices + with DeviceWrapper(dev) as _: + RUN() if lp is not None: stats = lp.get_stats() @@ -116,4 +122,4 @@ def run(self): hopper_utils.print_profile(stats, ["launch_refiner", "_compute_functional_and_gradients", "_run_diffBragg_current", "_update_Fcell", "_scale_pixel_data", "_Fcell_derivatives", "_mpi_aggregation", - "GatherFromExperiment", "_setup"]) + "GatherFromExperiment", "_setup", "load_inputs"]) diff --git a/simtbx/command_line/update_stage1_phil.py b/simtbx/command_line/update_stage1_phil.py new file mode 100644 index 0000000000..869d1fe32f --- /dev/null +++ b/simtbx/command_line/update_stage1_phil.py @@ -0,0 +1,82 @@ + +from __future__ import division +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument("dirname", help="stage1 output folder with pandas and diff.phil", type=str) +parser.add_argument("newPhil", default=None, help="new stage 1 phil file", type=str) +parser.add_argument("--setGlim", action="store_true", help="Use unrestrained stage1 to set bounds on G") +#parser.add_argument("--njobs", type=int, default=5, help="number of jobs (only runs on single node, no MPI)") +#parser.add_argument("--plot", action="store_true", help="show a histogram at the end") +args = parser.parse_args() + +# LIBTBX_SET_DISPATCHER_NAME diffBragg.update_stage1_phil + +import os +import pandas +import numpy as np +import glob + +glob_s = os.path.join(args.dirname, "pandas/*pkl") +fnames = 
glob.glob(glob_s) + + # read in the pandas pickles + df1 = pandas.concat( [pandas.read_pickle(f) for f in fnames]).reset_index(drop=True) + + # unit cell phil + a,b,c,al,be,ga = df1[['a', 'b', 'c', 'al', 'be', 'ga']].median() + + # Ncells abc and Nvol + na, nb, nc = np.vstack(df1.ncells).T + nvol = na*nb*nc + nvol = np.median(na*nb*nc) + na, nb, nc = map(np.median, (na, nb, nc)) + + # eta + ea, eb, ec = map( np.median, np.vstack(df1.eta_abc).T) + + # spot scale (G) + Gmed = df1.spot_scales.median() + Gmin = df1.spot_scales.min()/100 + Gmax = df1.spot_scales.max()*100 + + update_phil = """ +init {{ + G = {G} + Nabc = [{na},{nb},{nc}] + eta_abc = [{ea},{eb},{ec}] +}} +centers {{ + Nvol = {nvol} + ucell_a = {a} + ucell_b = {b} + ucell_c = {c} + ucell_alpha = {al} + ucell_beta = {be} + ucell_gamma = {ga} +}} +betas {{ + Nvol = 1e-2 + ucell_a = 1e-7 + ucell_b = 1e-7 + ucell_c = 1e-7 + ucell_alpha = 1e-7 + ucell_beta = 1e-7 + ucell_gamma = 1e-7 +}} +use_restraints = True +""".format(G=Gmed,na=na, nb=nb, nc=nc, ea=ea, eb=eb, ec=ec,a=a,b=b,c=c,al=al,be=be,ga=ga, nvol=nvol) + +Gmin_Gmax=""" +mins.G={Gmin} +maxs.G={Gmax}\n""".format(Gmin=Gmin, Gmax=Gmax) + +if args.setGlim: + update_phil += Gmin_Gmax + +diff_phil_name = os.path.join(args.dirname, "diff.phil") +assert os.path.exists(diff_phil_name) +diff_phil = open(diff_phil_name, "r").read() +with open(args.newPhil, "w") as o: + o.write(diff_phil + "\n"+ update_phil) + +print("Done. Added \n%s\n to %s and saved the result to %s" % (update_phil, diff_phil_name, args.newPhil) ) diff --git a/simtbx/diffBragg/ensemble_refine_launcher.py b/simtbx/diffBragg/ensemble_refine_launcher.py index bb43493782..e84f5c3e9b 100644 --- a/simtbx/diffBragg/ensemble_refine_launcher.py +++ b/simtbx/diffBragg/ensemble_refine_launcher.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, print_function from simtbx.diffBragg.stage_two_utils import PAR_from_params +from collections import Counter +from itertools import chain import os import sys from libtbx.mpi4py import MPI @@ -11,11 +13,10 @@ except ImportError: print("Pandas is required. Install using 'libtbx.python -m pip install pandas'") exit() -from xfel.merging.application.utils.memory_usage import get_memory_usage from simtbx.diffBragg.refiners.stage_two_refiner import StageTwoRefiner from simtbx.diffBragg import utils from simtbx.diffBragg import hopper_utils -from dxtbx.model.experiment_list import ExperimentListFactory +from dxtbx.model.experiment_list import ExperimentList, ExperimentListFactory from simtbx.diffBragg.prep_stage2_input import prep_dataframe from cctbx import miller, crystal import logging @@ -28,11 +29,20 @@ def global_refiner_from_parameters(params): # TODO read on each rank, or read and broadcast ? 
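Before the work distribution is prepared, the input dataframe is filtered: shots whose stage 1 sigz is not below max_sigz are dropped (see the query just below). A minimal pandas sketch of that filter on a toy table (column values are made up):

    import pandas

    df = pandas.DataFrame({"exp_name": ["a.expt", "b.expt"], "sigz": [1.2, 25.0]})
    max_sigz = 10.0
    df = df.query("sigz < %f" % max_sigz).reset_index(drop=True)
    assert len(df) == 1  # the sigz=25 shot is removed before refinement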
LOGGER.info("EVENT: read input pickle") pandas_table = pandas.read_pickle(params.pandas_table) + if params.max_sigz is not None and "sigz" in list(pandas_table): + Nframe = len(pandas_table) + pandas_table = pandas_table.query("sigz < %f" % params.max_sigz) + pandas_table.reset_index(drop=True, inplace=True) + LOGGER.info("Removed %d / %d dataframes due to max_sigz=%.2f filter" + % (Nframe - len(pandas_table), Nframe, params.max_sigz)) + if params.max_process > 0: + pandas_table = pandas_table.iloc[:params.max_process] LOGGER.info("EVENT: BEGIN prep dataframe") - if params.prep_time > 0: - work_distribution = prep_dataframe(pandas_table) + if "exp_idx" not in list(pandas_table): + pandas_table["exp_idx"] = 0 + work_distribution = prep_dataframe(pandas_table, res_ranges_string=params.refiner.res_ranges) LOGGER.info("EVENT: DONE prep dataframe") - return launcher.launch_refiner(pandas_table, work_distribution=work_distribution) + return launcher.launch_refiner(pandas_table, work_distribution=work_distribution, refls_key=params.refls_key) class RefineLauncher: @@ -63,7 +73,7 @@ def check_parameter_integrity(params): raise ValueError("Cannot refine because params.refiner.max_calls is empty") if os.environ.get("DIFFBRAGG_CUDA") is not None: - params.refiner.use_cuda = True + params.refiner.use_gpu = True return params @@ -115,8 +125,8 @@ def _check_experiment_integrity(expt): if not hasattr(expt, model): raise ValueError("No %s in experiment, exiting. " % model) - def launch_refiner(self, pandas_table, miller_data=None, work_distribution=None): - self.load_inputs(pandas_table, miller_data=miller_data, work_distribution=work_distribution) + def launch_refiner(self, pandas_table, miller_data=None, work_distribution=None, refls_key="predictions"): + self.load_inputs(pandas_table, miller_data=miller_data, work_distribution=work_distribution, refls_key=refls_key) LOGGER.info("EVENT: launch refiner") self._launch() return self.RUC @@ -128,7 +138,7 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re the pandas table is expected to have been written by diffBragg.hopper or diffBragg.hopper_process . See method save_to_pandas in simtbx/command_line/hopper.py For example, if the outputdir of diffBragg.hopper was set to `all_shots`, then - there should be a golder all_shots/pandas created which contains all of the per-shot pandas + there should be a folder all_shots/pandas created which contains all of the per-shot pandas dataframes. 
They should be concatenated as follows, forming a suitable argument for this method >> import glob,pandas >> fnames = glob.glob("all_shots/pandas/rank*/*pkl") @@ -145,6 +155,8 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re """ COMM.Barrier() num_exp = len(pandas_table) + if "exp_idx" not in list(pandas_table): + pandas_table["exp_idx"] = 0 first_exper_file = pandas_table.exp_name.values[0] detector = ExperimentListFactory.from_json_file(first_exper_file, check_format=False)[0].detector if detector is None and self.params.refiner.reference_geom is None: @@ -169,51 +181,58 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re shot_idx = 0 # each rank keeps index of the shots local to it rank_panel_groups_refined = set() exper_names = pandas_table.exp_name - assert len(exper_names) == len(set(exper_names)) + exper_ids = pandas_table.exp_idx.values + shot_ids = list(zip(exper_names, exper_ids)) + assert len(shot_ids) == len(set(shot_ids)) # TODO assert all exper are single-file, probably way before this point if work_distribution is None: worklist = range(COMM.rank, len(exper_names), COMM.size) else: worklist = work_distribution[COMM.rank] LOGGER.info("EVENT: begin loading inputs") - for i_exp in worklist: - exper_name = exper_names[i_exp] + + # load the Fhkl model once here to check which hkl are missing (and filter from the refls below) + first_exper = ExperimentList.from_file(exper_names[0], check_format=False)[0] + Fhkl_model = utils.load_Fhkl_model_from_params_and_expt(self.params, first_exper) + self.symbol = Fhkl_model.space_group().info().type().lookup_symbol() + if self.params.refiner.force_symbol is not None: + self.symbol = self.params.refiner.force_symbol + LOGGER.info("Will use space group symbol %s" % self.symbol) + Fhkl_model_p1 = Fhkl_model.expand_to_p1().generate_bijvoet_mates() + Fhkl_model_p1_indices = set(Fhkl_model_p1.indices()) + + for i_work, i_df in enumerate(worklist): + exper_name = exper_names[i_df] + exper_id = int(exper_ids[i_df]) LOGGER.info("EVENT: BEGIN loading experiment list") - expt_list = ExperimentListFactory.from_json_file(exper_name, check_format=self.params.refiner.check_expt_format) + # TODO: test that the diffBragg_benchmarks is not broken + expt = hopper_utils.DataModeler.exper_json_single_file(exper_name, exper_id) + expt_list = ExperimentList() + expt_list.append(expt) LOGGER.info("EVENT: DONE loading experiment list") - if len(expt_list) != 1: - print("Input experiments need to have length 1, %s does not" % exper_name) - expt = expt_list[0] expt.detector = detector # in case of supplied ref geom self._check_experiment_integrity(expt) - exper_dataframe = pandas_table.query("exp_name=='%s'" % exper_name) + exper_dataframe = pandas_table.query("exp_name=='%s'" % exper_name).query("exp_idx==%d" % exper_id) refl_name = exper_dataframe[refls_key].values[0] refls = flex.reflection_table.from_file(refl_name) - # FIXME need to remove (0,0,0) bboxes + refls = refls.select(refls['id'] == exper_id) try: - good_sel = flex.bool([h != (0, 0, 0) for h in list(refls["miller_index"])]) - refls = refls.select(good_sel) + miller_inds = list(refls["miller_index"]) + is_not_000 = [h != (0, 0, 0) for h in miller_inds] + is_in_Fhkl_model = [h in Fhkl_model_p1_indices for h in miller_inds] + LOGGER.debug("Only refining %d/%d refls whose HKL are in structure factor model" % ( + np.sum(is_in_Fhkl_model), len(refls))) + refl_sel = flex.bool(np.logical_and(is_not_000, is_in_Fhkl_model)) + refls = 
refls.select(refl_sel) except KeyError: pass - #UcellMan = utils.manager_from_crystal(expt.crystal) opt_uc_param = exper_dataframe[["a","b","c","al","be","ga"]].values[0] UcellMan = utils.manager_from_params(opt_uc_param) - if self.symbol is None: - if self.params.refiner.force_symbol is not None: - self.symbol = self.params.refiner.force_symbol - else: - self.symbol = expt.crystal.get_space_group().type().lookup_symbol() - LOGGER.info("Set space group symbol: %s" % self.symbol) - else: - if self.params.refiner.force_symbol is None: - if expt.crystal.get_space_group().type().lookup_symbol() != self.symbol: - raise ValueError("Crystals should all have the same space group symmetry") - if shot_idx == 0: # each rank initializes a simulator only once if self.params.simulator.init_scale != 1: print("WARNING: For stage_two , it is assumed that total scale is stored in the pandas dataframe") @@ -226,31 +245,16 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re self.Fref = utils.open_mtz(self.params.refiner.stage_two.Fref_mtzname, self.params.refiner.stage_two.Fref_mtzcol) - if "miller_index" in list(refls.keys()): - is_allowed = flex.bool(len(refls), True) - allowed_hkls = set(self.SIM.crystal.miller_array.indices()) - uc = self.SIM.crystal.dxtbx_crystal.get_unit_cell().parameters() - symb = self.SIM.crystal.space_group_info.type().lookup_symbol() - sym = crystal.symmetry(uc, symb) - mset = miller.set(sym, refls['miller_index'],True) - op = mset.change_of_basis_op_to_primitive_setting() - mset_p = mset.change_basis(op) - refl_hkls_p1 = mset_p.indices() - - for i_ref in range(len(refls)): - hkl_p1 = refl_hkls_p1[i_ref] - if hkl_p1 not in allowed_hkls: - is_allowed[i_ref] = False - refls = refls.select(is_allowed) - LOGGER.info("EVENT: LOADING ROI DATA") shot_modeler = hopper_utils.DataModeler(self.params) shot_modeler.exper_name = exper_name + shot_modeler.exper_idx = exper_id shot_modeler.refl_name = refl_name shot_modeler.rank = COMM.rank if self.params.refiner.load_data_from_refl: gathered = shot_modeler.GatherFromReflectionTable(expt, refls, sg_symbol=self.symbol) else: + # Note: no need to pass exper_id here because expt and refls have already been sliced out gathered = shot_modeler.GatherFromExperiment(expt, refls, sg_symbol=self.symbol) if not gathered: raise IOError("Failed to gather data from experiment %s", exper_name) @@ -279,7 +283,6 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re assert np.allclose(new_Modeler.roi_id, all_roi_id) LOGGER.info("Gathered file approved!") - self.Hi[shot_idx] = shot_modeler.Hi self.Hi_asu[shot_idx] = shot_modeler.Hi_asu @@ -322,19 +325,21 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re shot_modeler.originZ_init = exper_dataframe.detz_shift_mm.values[0]*1e-3 else: shot_modeler.originZ_init = 0 + # TODO: is there a reason these 3 attribs are set once more after being set above? 
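The selection built above keeps only reflections whose Miller index is neither (0,0,0) nor absent from the P1-expanded structure-factor model. A library-free sketch of the same membership test, with toy indices standing in for Fhkl_model_p1_indices and the reflection table:

    model_hkls = {(1, 0, 0), (0, 1, 0), (1, 1, 1)}            # stand-in for Fhkl_model_p1_indices
    refl_hkls = [(1, 0, 0), (0, 0, 0), (2, 2, 2), (1, 1, 1)]  # stand-in for refls["miller_index"]
    keep = [h != (0, 0, 0) and h in model_hkls for h in refl_hkls]
    kept = [h for h, ok in zip(refl_hkls, keep) if ok]
    assert kept == [(1, 0, 0), (1, 1, 1)]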
shot_modeler.exper_name = exper_name + shot_modeler.exper_idx = exper_id shot_modeler.refl_name = refl_name shot_panel_groups_refined = self.determine_refined_panel_groups(shot_modeler.pids) rank_panel_groups_refined = rank_panel_groups_refined.union(set(shot_panel_groups_refined)) shot_idx += 1 - if COMM.rank == 0: - self._mem_usage() - print("Finished loading image %d / %d" % (i_exp+1, len(exper_names)), flush=True) + LOGGER.info(utils.memory_report('Process memory usage')) + LOGGER.info("Finished loading image %d / %d (%d / %d)" + % (i_df+1, len(exper_names), i_work+1, len(worklist))) #, flush=True) shot_modeler.PAR = PAR_from_params(self.params, expt, best=exper_dataframe) - self.Modelers[i_exp] = shot_modeler + self.Modelers[i_df] = shot_modeler # TODO: verify that i_df as a key is ok everywhere LOGGER.info("DONE LOADING DATA; ENTER BARRIER") COMM.Barrier() @@ -349,16 +354,18 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re for set_of_panels in all_refined_groups: panel_groups_refined = panel_groups_refined.union(set_of_panels) self.panel_groups_refined = list(COMM.bcast(panel_groups_refined)) + LOGGER.info(utils.memory_report('Mem after panel groups')) LOGGER.info("EVENT: Gathering global HKL information") try: self._gather_Hi_information() - except TypeError: + except TypeError: # TODO: should we siltently fail here ? pass LOGGER.info("EVENT: FINISHED gather global HKL information") if self.params.roi.cache_dir_only: print("Done creating cache directory and cache_dir_only=True, so goodbye.") sys.exit() + LOGGER.info(utils.memory_report('Mem after gather Hi info')) # in case of GPU LOGGER.info("BEGIN DETERMINE MAX PIX") @@ -369,15 +376,10 @@ def load_inputs(self, pandas_table, miller_data=None, work_distribution=None, re n = max(n) self.NPIX_TO_ALLOC = COMM.bcast(n) LOGGER.info("DONE DETERMINE MAX PIX") + LOGGER.info(utils.memory_report('Mem after determine max num pix')) self.DEVICE_ID = COMM.rank % self.params.refiner.num_devices - self._mem_usage() - - def _mem_usage(self): - memMB = get_memory_usage() - import socket - host = socket.gethostname() - print("Rank 0 reporting memory usage: %f GB on Rank 0 node %s" % (memMB / 1e3, host)) + LOGGER.info(utils.memory_report('Mem after load_inputs')) def determine_refined_panel_groups(self, pids): refined_groups = [] @@ -394,8 +396,8 @@ def _determine_per_rank_max_num_pix(self): x1, x2, y1, y2 = map(np.array, zip(*modeler.rois)) npix = np.sum((x2-x1)*(y2-y1)) max_npix = max(npix, max_npix) - print("Rank %d, shot %d has %d pixels" % (COMM.rank, i_shot+1, npix)) - print("Rank %d, max pix to be modeled: %d" % (COMM.rank, max_npix)) + #print("Rank %d, shot %d has %d pixels" % (COMM.rank, i_shot+1, npix)) + LOGGER.info("Rank %d, max pix to be modeled: %d" % (COMM.rank, max_npix)) return max_npix def _try_loading_spectrum_filelist(self): @@ -407,50 +409,37 @@ def _try_loading_spectrum_filelist(self): return file_list def _gather_Hi_information(self): - nshots_on_this_rank = len(self.Hi) # aggregate all miller indices - self.Hi_all_ranks, self.Hi_asu_all_ranks = [], [] - # TODO assert list types are stored in Hi and Hi_asu - for i_shot in self.Hi: #range(nshots_on_this_rank): - self.Hi_all_ranks += self.Hi[i_shot] - self.Hi_asu_all_ranks += self.Hi_asu[i_shot] - self.Hi_all_ranks = COMM.reduce(self.Hi_all_ranks) - self.Hi_all_ranks = COMM.bcast(self.Hi_all_ranks) - self.Hi_asu_all_ranks = COMM.reduce(self.Hi_asu_all_ranks) - self.Hi_asu_all_ranks = COMM.bcast(self.Hi_asu_all_ranks) + self.hiasu = HiAsu(self) - 
marr_unique_h = self._get_unique_Hi() + # TODO Restore this diagnostics step within the scope of `HiAsu` class + # Hi_asu_all_ranks used to be a list of all Hi_asu from all ranks, + # (None of ranks > 0) but was removed when moving to new `HiAsu` class. + # marr_unique_h = self._get_unique_Hi(Hi_asu_all_ranks) - # this will map the measured miller indices to their index in the LBFGS parameter array self.x - self.idx_from_asu = {h: i for i, h in enumerate(set(self.Hi_asu_all_ranks))} - # we will need the inverse map during refinement to update the miller array in diffBragg, so we cache it here - self.asu_from_idx = {i: h for i, h in enumerate(set(self.Hi_asu_all_ranks))} + # TODO: I think this code does absolutely nothing, but might be useful (it was used for B-factor modeling, and maybe more..) + # fres = marr_unique_h.d_spacings() + # self.res_from_asu = {h: res for h, res in zip(fres.indices(), fres.data())} + # TODO: End of code I think does absolutely nothing - self.num_hkl_global = len(self.idx_from_asu) + def get_first_modeller_symmetry(self): + uc = next(iter(self.Modelers.values())).ucell_man + params = uc.a, uc.b, uc.c, uc.al * 180 / np.pi, uc.be * 180 / np.pi, uc.ga * 180 / np.pi + if self.params.refiner.force_unit_cell is not None: + params = self.params.refiner.force_unit_cell + return crystal.symmetry(unit_cell=params, space_group_symbol=self.symbol) - fres = marr_unique_h.d_spacings() - self.res_from_asu = {h: res for h, res in zip(fres.indices(), fres.data())} - - def _get_unique_Hi(self): - COMM.barrier() + def _get_unique_Hi(self, Hi_asu_all_ranks): if COMM.rank == 0: - from cctbx.crystal import symmetry - from cctbx import miller from cctbx.array_family import flex as cctbx_flex - ii = list(self.Modelers.keys())[0] - uc = self.Modelers[ii].ucell_man - params = uc.a, uc.b, uc.c, uc.al * 180 / np.pi, uc.be * 180 / np.pi, uc.ga * 180 / np.pi - if self.params.refiner.force_unit_cell is not None: - params = self.params.refiner.force_unit_cell - symm = symmetry(unit_cell=params, space_group_symbol=self.symbol) - hi_asu_flex = cctbx_flex.miller_index(self.Hi_asu_all_ranks) + symm = self.get_first_modeller_symmetry() + hi_asu_flex = cctbx_flex.miller_index(Hi_asu_all_ranks) mset = miller.set(symm, hi_asu_flex, anomalous_flag=True) marr = miller.array(mset) binner = marr.setup_binner(d_max=self.params.refiner.stage_two.d_max, d_min=self.params.refiner.stage_two.d_min, n_bins=self.params.refiner.stage_two.n_bin) - from collections import Counter print("Average multiplicities:") print("<><><><><><><><><><><><>") for i_bin in range(self.params.refiner.stage_two.n_bin - 1): @@ -462,15 +451,15 @@ def _get_unique_Hi(self): print("\t %d refls with multi %d" % (sum(multi_in_bin == ii), ii)) print("Overall completeness\n<><><><><><><><>") - symm = symmetry(unit_cell=params, space_group_symbol=self.symbol) - hi_flex_unique = cctbx_flex.miller_index(list(set(self.Hi_asu_all_ranks))) + unique_Hi_asu = set(Hi_asu_all_ranks) + hi_flex_unique = cctbx_flex.miller_index(list(unique_Hi_asu)) mset = miller.set(symm, hi_flex_unique, anomalous_flag=True) self.binner = mset.setup_binner(d_min=self.params.refiner.stage_two.d_min, d_max=self.params.refiner.stage_two.d_max, n_bins=self.params.refiner.stage_two.n_bin) mset.completeness(use_binning=True).show() marr_unique_h = miller.array(mset) - print("Rank %d: total miller vars=%d" % (COMM.rank, len(set(self.Hi_asu_all_ranks)))) + print("Rank %d: total miller vars=%d" % (COMM.rank, len(unique_Hi_asu))) else: marr_unique_h = None @@ -486,6 +475,7 @@ def 
_launch(self): x_init = None nmacro = self.params.refiner.num_macro_cycles n_trials = len(self.params.refiner.max_calls) + for i_trial in range(n_trials*nmacro): self.RUC = StageTwoRefiner(self.Modelers, self.symbol, self.params) @@ -511,8 +501,7 @@ def _launch(self): self.RUC.log_fcells = True self.RUC.request_diag_once = False self.RUC.trad_conv = True - self.RUC.idx_from_asu = self.idx_from_asu - self.RUC.asu_from_idx = self.asu_from_idx + self.RUC.hiasu = self.hiasu self.RUC.S = self.SIM self.RUC.restart_file = self.params.refiner.io.restart_file @@ -527,7 +516,9 @@ def _launch(self): self.RUC.S.update_nanoBragg_instance('verbose', self.params.refiner.verbose) LOGGER.info("_launch run setup") + LOGGER.info(utils.memory_report('Mem usage before _setup')) self.RUC.run(setup_only=True) + LOGGER.info(utils.memory_report('Mem usage after _setup')) LOGGER.info("_launch done run setup") # for debug purposes: #if not self.params.refiner.quiet: @@ -552,9 +543,10 @@ def _launch(self): # more_sel_flags[i_shot] = [flag1 and flag2 for flag1,flag2 in zip(sel_flags, res_flags)] # self.RUC.selection_flags = more_sel_flags - LOGGER.info("_launcher runno setup") + LOGGER.info("_launcher running optimization") + self.RUC.run(setup=False) - LOGGER.info("_launcher done runno setup") + LOGGER.info("_launcher done running optimization") if self.RUC.hit_break_to_use_curvatures: self.RUC.fix_params_with_negative_curvature = False self.RUC.num_positive_curvatures = 0 @@ -576,8 +568,81 @@ def _launch(self): if self.params.profile: self.RUC.S.D.show_timings(self.RUC.rank) - if os.environ.get("DIFFBRAGG_USE_CUDA") is not None: + if os.environ.get("DIFFBRAGG_USE_CUDA") is not None or os.environ.get("DIFFBRAGG_USE_KOKKOS") is not None: self.RUC.S.D.gpu_free() def will_refine(self, param): return param is not None and any(param) + + +class HiAsu(object): + """ + Object which stores possible & present Miller Indices, their counts, + counters, maps between them and their integer indexes etc. + + Within `HiAsu`, the following notation is used for parameter names: + * no trailing `_`: global parameter - total or the same for all ranks. + * with trailing `_`: local parameter – value is unique for each rank. + Example: `counts_` would be specific to rank, but `counts` would be global. 
+ """ + def __init__(self, refine_launcher): + self.rl = refine_launcher + self.possible = self.get_possible() + self.possible_len = len(self.possible) + self.possible_counts = self.get_counts() + self.present_len = len(list(self.present_zip)) + self.from_idx, self.to_idx = self._get_dicts() + + def get_possible(self): + if COMM.rank == 0: + sym = self.rl.get_first_modeller_symmetry() + res_ranges_str = self.rl.params.refiner.res_ranges + if res_ranges_str: + res_ranges = utils.parse_reso_string(res_ranges_str) + d_min = min([d_min for d_min, _ in res_ranges]) + else: + expt = next(iter(self.rl.Modelers.values())).E + det, s0 = expt.detector, expt.beam.get_s0() + d_min = min([p.get_max_resolution_at_corners(s0) for p in det]) + d_min *= 0.8 # accommodate variations in uc or det across expts + mset_full = sym.build_miller_set(anomalous_flag=True, d_min=d_min) + possible = list(mset_full.indices()) + else: + possible = None + return COMM.bcast(possible, root=0) + + def get_counts(self): + hi_asu_ = chain.from_iterable(self.rl.Hi_asu.values()) + hi_asu_counter_ = Counter(hi_asu_) + hi_asu_possible_counts_ = [hi_asu_counter_[k] for k in self.possible] + hi_asu_possible_counts_ = np.array(hi_asu_possible_counts_, dtype=np.uint16) + hi_asu_possible_counts = np.zeros_like(hi_asu_possible_counts_, dtype=np.uint16) + COMM.Allreduce(hi_asu_possible_counts_, hi_asu_possible_counts, op=MPI.SUM) + return hi_asu_possible_counts + + @property + def present(self): + return (p for p, c in self.present_zip) + + @property + def present_counts(self): + return (c for p, c in self.present_zip) + + @property + def present_counter(self): + return Counter({p: c for p, c in self.present_zip}) + + @property + def present_idx_counter(self): + return Counter({self.to_idx[p]: c for p, c in self.present_zip}) + + @property + def present_zip(self): + return ((p, c) for p, c in zip(self.possible, self.possible_counts) if c) + + def _get_dicts(self): + """from_idx maps miller indices to index in LBFGS par. 
array self.x; + to_ids is an inverse map during refinement to update diffBragg m.arr""" + from_idx = {i: h for i, h in enumerate(self.present)} + to_idx = {h: i for i, h in enumerate(self.present)} + return from_idx, to_idx diff --git a/simtbx/diffBragg/hopper_ensemble_utils.py b/simtbx/diffBragg/hopper_ensemble_utils.py index 0eba59a7c3..f15f608d10 100644 --- a/simtbx/diffBragg/hopper_ensemble_utils.py +++ b/simtbx/diffBragg/hopper_ensemble_utils.py @@ -15,7 +15,6 @@ from cctbx import miller, crystal, sgtbx from dials.array_family import flex from dxtbx.model import ExperimentList -from xfel.merging.application.utils.memory_usage import get_memory_usage COMM = MPI.COMM_WORLD @@ -80,9 +79,12 @@ def __call__(self, x, *args, **kwargs): min_info = "it=%d | t/it=%.4fs | F=%10.7g | sigZ=%10.7g" \ % (self.niter,self.ave_t_per_iter, f, ave_zscore_sig) if COMM.rank==0: - print(min_info, flush=True) + #print(min_info, flush=True) + MAIN_LOGGER.info(min_info) if modelers.save_freq is not None and self.niter % modelers.save_freq == 0: modelers.save_up(self.x0, ref_iter=self.niter) + if modelers.SIM.D.record_timings: + modelers.SIM.D.show_timings() return f @@ -106,12 +108,13 @@ def target_func(x, modelers): g_fhkl = np.zeros(num_fhkl_params) zscore_sigs = [] fcell_params = x[-num_fhkl_params:] - for i_shot in modelers: + for ii, i_shot in enumerate(modelers): shot_modeler = modelers[i_shot] shot_x_slice = modelers.x_slices[i_shot] per_shot_params = x[shot_x_slice] x_for_shot = np.hstack((per_shot_params, fcell_params)) - model_bragg, Jac = hopper_utils.model(x_for_shot, shot_modeler, modelers.SIM, compute_grad=True, update_spectrum=True) + model_bragg, Jac = hopper_utils.model(x_for_shot, shot_modeler, modelers.SIM, compute_grad=True, update_spectrum=True, + update_Fhkl_scales=ii==0) model_pix = model_bragg + shot_modeler.all_background @@ -367,6 +370,8 @@ def prep_for_refinement(self): self._set_mtz_data() self.set_device_id() + + def alloc_max_pix_per_shot(self): self._mpi_set_allocation_volume() def _get_fhkl_vary_flags(self): @@ -387,10 +392,13 @@ def _get_fhkl_vary_flags(self): all_nominal_hkl = set() for mod in self.data_modelers.values(): all_nominal_hkl = all_nominal_hkl.union(mod.hi_asu_perpix) + #TODO : is this memory intensive? 
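The per-rank sets of nominal HKLs are combined with a gather, a set union on rank 0, and a broadcast (the gather is the next statement). A minimal sketch of that pattern, assuming plain mpi4py (the patch itself goes through the libtbx.mpi4py wrapper):

    from mpi4py import MPI

    COMM = MPI.COMM_WORLD
    local_hkl = {(COMM.rank, 0, 0)}    # toy per-rank set of Miller indices
    gathered = COMM.gather(local_hkl)  # list of sets on rank 0, None elsewhere
    if COMM.rank == 0:
        all_hkl = set().union(*gathered)
    else:
        all_hkl = None
    all_hkl = COMM.bcast(all_hkl)      # every rank now holds the union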
all_nominal_hkl = COMM.gather(all_nominal_hkl) if COMM.rank == 0: + # TODO: all_nominal_hkl is P1, asu_map_int is non-P1 all_nominal_hkl = set(all_nominal_hkl[0]).union(*all_nominal_hkl[1:]) - asu_inds_to_vary = [self.SIM.asu_map_int[h] for h in all_nominal_hkl] + all_nominal_hkl_sym = utils.map_hkl_list(all_nominal_hkl, True, self.SIM.crystal.symbol) + asu_inds_to_vary = [self.SIM.asu_map_int[h] for h in all_nominal_hkl_sym] else: asu_inds_to_vary = None asu_inds_to_vary = set(COMM.bcast(asu_inds_to_vary)) @@ -564,13 +572,6 @@ def save_up(self, x, ref_iter=None): mod.params.tag = temp -def mem_usage(rank): - if COMM.rank == rank: - memMB = get_memory_usage() - host = socket.gethostname() - MAIN_LOGGER.info("Rank %d reporting memory usage: %f GB on Rank 0 node %s" % (COMM.rank, memMB / 1e3, host)) - - def get_gather_name(exper_name, gather_dir): gathered_name = os.path.splitext(os.path.basename(exper_name))[0] gathered_name += "_withData.refl" @@ -579,9 +580,10 @@ def get_gather_name(exper_name, gather_dir): def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictions', - gather_dir=None): + gather_dir=None, exper_idx_key="exp_idx"): - work_distribution = prep_dataframe(pandas_table, refls_key) + work_distribution = prep_dataframe(pandas_table, refls_key, + res_ranges_string=params.refiner.res_ranges) COMM.barrier() num_exp = len(pandas_table) first_exper_file = pandas_table[exper_key].values[0] @@ -600,7 +602,9 @@ def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictio % (COMM.size, num_exp, num_exp)) exper_names = pandas_table[exper_key] - assert len(exper_names) == len(set(exper_names)) + exper_ids = pandas_table[exper_idx_key] + name_ids = list(zip(exper_names, exper_ids)) + assert len(name_ids) == len(set(name_ids)) worklist = work_distribution[COMM.rank] MAIN_LOGGER.info("EVENT: begin loading inputs") @@ -609,20 +613,22 @@ def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictio Fhkl_model = Fhkl_model.expand_to_p1().generate_bijvoet_mates() Fhkl_model_indices = set(Fhkl_model.indices()) shot_modelers = hopper_ensemble_utils.DataModelers() - for ii, i_exp in enumerate(worklist): - exper_name = exper_names[i_exp] + for ii, i_df in enumerate(worklist): + exper_name = exper_names[i_df] + exper_id = int(exper_ids[i_df]) MAIN_LOGGER.info("EVENT: BEGIN loading experiment list") - expt_list = ExperimentList.from_file(exper_name, check_format=params.refiner.check_expt_format) + check_format = not params.refiner.load_data_from_refl + expt = hopper_utils.DataModeler.exper_json_single_file(exper_name, exper_id, check_format) + expt_list = ExperimentList() + expt_list.append(expt) MAIN_LOGGER.info("EVENT: DONE loading experiment list") - if len(expt_list) != 1: - MAIN_LOGGER.critical("Input experiments need to have length 1, %s does not" % exper_name) - expt = expt_list[0] expt.detector = detector # in case of supplied ref geom - exper_dataframe = pandas_table.query("%s=='%s'" % (exper_key, exper_name)) + exper_dataframe = pandas_table.query("%s=='%s'" % (exper_key, exper_name)).query("%s==%d" % (exper_idx_key, exper_id)) refl_name = exper_dataframe[refls_key].values[0] refls = flex.reflection_table.from_file(refl_name) + refls = refls.select(refls['id'] == exper_id) miller_inds = list( refls['miller_index']) is_not_000 = [h != (0, 0, 0) for h in miller_inds] @@ -632,7 +638,7 @@ def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictio refls = refls.select(refl_sel) exp_cry_sym = 
expt.crystal.get_space_group().type().lookup_symbol() - if exp_cry_sym.replace(" ", "") != params.space_group: + if params.space_group is not None and exp_cry_sym.replace(" ", "") != params.space_group: gr = sgtbx.space_group_info(params.space_group).group() expt.crystal.set_space_group(gr) #raise ValueError("Crystals should all have the same space group symmetry") @@ -640,6 +646,7 @@ def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictio MAIN_LOGGER.info("EVENT: LOADING ROI DATA") shot_modeler = hopper_utils.DataModeler(params) shot_modeler.exper_name = exper_name + shot_modeler.exper_idx = exper_id shot_modeler.refl_name = refl_name shot_modeler.rank = COMM.rank if params.refiner.load_data_from_refl: @@ -678,13 +685,13 @@ def load_inputs(pandas_table, params, exper_key="exp_name", refls_key='predictio continue shot_modeler.set_parameters_for_experiment(best=exper_dataframe) - shot_modeler.set_spectrum() + shot_modeler.set_spectrum(spectra_file=exper_dataframe.spectrum_filename.values[0]) MAIN_LOGGER.info("Will simulate %d energy channels" % len(shot_modeler.nanoBragg_beam_spectrum)) # verify this shot_modeler.Umatrices = [shot_modeler.E.crystal.get_U()] - mem_usage(0) + MAIN_LOGGER.info(utils.memory_report('Rank 0 reporting memory usage')) if COMM.rank==0: print("Finished loading image %d / %d" % (ii + 1, len(worklist)), flush=True) diff --git a/simtbx/diffBragg/hopper_io.py b/simtbx/diffBragg/hopper_io.py index da9316e37e..5285c68de1 100644 --- a/simtbx/diffBragg/hopper_io.py +++ b/simtbx/diffBragg/hopper_io.py @@ -9,7 +9,7 @@ import numpy as np -def save_expt_refl_file(filename, expts, refls, specs=None, check_exists=False): +def save_expt_refl_file(filename, expts, refls, specs=None, check_exists=False, indices=None): """ Save an input file for bg_and_probOri (the EMC initializer script) expt and refl names will be given absolute paths @@ -18,12 +18,15 @@ def save_expt_refl_file(filename, expts, refls, specs=None, check_exists=False): :param refls: list of reflection tables :param specs: optional list of spectrum .lam files :param check_exists: ensure files actually exist + :param indices: experiment indices if multiple images per experiment :return: """ if specs is None: specs = [None]*len(expts) + if indices is None: + indices = [None] *len(expts) with open(filename, "w") as o: - for expt, refl, spec in zip(expts, refls, specs): + for expt, refl, spec, idx in zip(expts, refls, specs, indices): expt = os.path.abspath(expt) refl = os.path.abspath(refl) if spec is not None: @@ -33,10 +36,15 @@ def save_expt_refl_file(filename, expts, refls, specs=None, check_exists=False): assert os.path.exists(refl) if spec is not None: assert os.path.exists(spec) - if spec is not None: - o.write("%s %s %s\n" % (expt, refl, spec)) + if spec is None: + spec = "" + else: + spec = " %s" %spec + if idx is None: + idx = "" else: - o.write("%s %s\n" % (expt, refl)) + idx = " %d" % idx + o.write("%s %s%s%s\n" % (expt, refl, spec, idx)) def make_rank_outdir(root, subfolder, rank=0): @@ -59,11 +67,31 @@ def diffBragg_Umat(rotX, rotY, rotZ, U): return U -def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_refls, stg1_img_path, - rank=0): +def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_refls, stg1_img_path=None, + rank=0, write_expt=True, write_pandas=True, exp_idx=0): + """ + + :param x: the optimized parameters used by hopper (output of Minimize) + :param Mod: the instance of the hopper_utils.DataModeler that was used by 
hopper + :param SIM: the instance of nanoBragg/sim_data.SimData that was used by hopper + :param orig_exp_name: the name of the experiment list that was input to hopper + :param params: the diffBragg hopper parameters + :param expt: the data modeler experiment + :param rank_exp_idx: order this shot was processed by this MPI rank #TODO rename this + :param stg1_refls: path to the refls that were input to hopper + :param stg1_img_path: leave as None, no longer used + :param rank: MPI rank + :param write_expt: whether to write the single shot experiment + :param write_pandas: whether to write the single shot dataframe + :param exp_idx: the index of the experiment within the experiment list (orig_exp_name) + :return: the single shot dataframe + """ LOGGER = logging.getLogger("refine") - rank_exper_outdir = make_rank_outdir(params.outdir, "expers",rank) - rank_pandas_outdir = make_rank_outdir(params.outdir, "pandas",rank) + opt_exp_path = None + basename = os.path.splitext(os.path.basename(orig_exp_name))[0] + if write_expt: + rank_exper_outdir = make_rank_outdir(params.outdir, "expers",rank) + opt_exp_path = os.path.join(rank_exper_outdir, "%s_%s_%d.expt" % (params.tag, basename, rank_exp_idx)) scale, rotX, rotY, rotZ, Na, Nb, Nc, Nd, Ne, Nf,\ diff_gam_a, diff_gam_b, diff_gam_c, diff_sig_a, \ @@ -109,9 +137,6 @@ def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_ rotY_xt = par['rotY'] rotZ_xt = par['rotZ'] U_xt = diffBragg_Umat(rotX_xt, rotY_xt, rotZ_xt, SIM.Umatrices[i_xtal]) - #cryst_temp = deepcopy(new_cryst) - #cryst_temp.set_U(U_xt) - #Amat_xt = cryst_temp.get_A() other_Umats.append(U_xt) other_spotscales.append(scale_xt) @@ -128,9 +153,6 @@ def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_ lam_coefs.append([val]) lam_coefs = tuple(lam_coefs) - basename = os.path.splitext(os.path.basename(orig_exp_name))[0] - opt_exp_path = os.path.join(rank_exper_outdir, "%s_%s_%d.expt" % (params.tag, basename, rank_exp_idx)) - pandas_path = os.path.join(rank_pandas_outdir, "%s_%s_%d.pkl" % (params.tag, basename, rank_exp_idx)) new_expt = Experiment() new_expt.crystal = new_cryst new_expt.detector = expt.detector @@ -140,8 +162,9 @@ def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_ # expt.detector = refiner.get_optimized_detector() new_exp_list = ExperimentList() new_exp_list.append(new_expt) - new_exp_list.as_file(opt_exp_path) - LOGGER.debug("saved opt_exp %s with wavelength %f" % (opt_exp_path, expt.beam.get_wavelength())) + if write_expt: + new_exp_list.as_file(opt_exp_path) + LOGGER.debug("saved opt_exp %s with wavelength %f" % (opt_exp_path, expt.beam.get_wavelength())) _,flux_vals = zip(*SIM.beam.spectrum) df = single_expt_pandas(xtal_scale=scale, Amat=Amat, @@ -166,14 +189,21 @@ def save_to_pandas(x, Mod, SIM, orig_exp_name, params, expt, rank_exp_idx, stg1_ opt_det=params.opt_det, stg1_refls=stg1_refls, stg1_img_path=stg1_img_path, ncells_init=Nabc_init, spot_scales_init=scale_init, - other_Umats = other_Umats, other_spotscales = other_spotscales) + other_Umats = other_Umats, other_spotscales = other_spotscales, + num_mosaicity_samples=params.simulator.crystal.num_mosaicity_samples) + + df['exp_idx'] = exp_idx + if hasattr(Mod, "sigz"): df['sigz'] = [Mod.sigz] if hasattr(Mod, "niter"): df['niter'] = [Mod.niter] df['phi_deg'] = SIM.D.phi_deg df['osc_deg'] = SIM.D.osc_deg - df.to_pickle(pandas_path) + if write_pandas: + rank_pandas_outdir = make_rank_outdir(params.outdir, "pandas",rank) + pandas_path = 
os.path.join(rank_pandas_outdir, "%s_%s_%d.pkl" % (params.tag, basename, rank_exp_idx)) + df.to_pickle(pandas_path) return df @@ -183,7 +213,7 @@ def single_expt_pandas(xtal_scale, Amat, ncells_abc, ncells_def, eta_abc, spec_file, spec_stride,flux, beamsize_mm, orig_exp_name, opt_exp_name, spec_from_imageset, oversample, opt_det, stg1_refls, stg1_img_path, ncells_init=None, spot_scales_init = None, - other_Umats=None, other_spotscales=None): + other_Umats=None, other_spotscales=None, num_mosaicity_samples=None): """ :param xtal_scale: @@ -212,6 +242,7 @@ def single_expt_pandas(xtal_scale, Amat, ncells_abc, ncells_def, eta_abc, :param opt_det: :param stg1_refls: :param stg1_img_path: + :num_mosaicity_samples: :return: """ if other_Umats is None: @@ -253,11 +284,16 @@ def single_expt_pandas(xtal_scale, Amat, ncells_abc, ncells_def, eta_abc, df["other_spotscales"] = [tuple(other_spotscales)] if other_Umats: df["other_Umats"] = [tuple(map(tuple, other_Umats))] + if num_mosaicity_samples is not None: + df['num_mosaicity_samples'] = num_mosaicity_samples df["total_flux"] = flux df["beamsize_mm"] = beamsize_mm df["exp_name"] = os.path.abspath(orig_exp_name) - df["opt_exp_name"] = os.path.abspath(opt_exp_name) + + if opt_exp_name is not None: + opt_exp_name = os.path.abspath(opt_exp_name) + df["opt_exp_name"] = opt_exp_name df["spectrum_from_imageset"] = spec_from_imageset df["oversample"] = oversample if opt_det is not None: diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py index 1b29754898..5acd3ced80 100644 --- a/simtbx/diffBragg/hopper_utils.py +++ b/simtbx/diffBragg/hopper_utils.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function import time import os +import json from dials.algorithms.shoebox import MaskCode from copy import deepcopy from dials.model.data import Shoebox @@ -14,7 +15,10 @@ from scipy.ndimage import binary_dilation from dxtbx.model.experiment_list import ExperimentListFactory from dxtbx.model import Spectrum -from serialtbx.detector.jungfrau import get_pedestalRMS_from_jungfrau +try: # TODO keep backwards compatibility until we close the nxmx_writer_experimental branch + from serialtbx.detector.jungfrau import get_pedestalRMS_from_jungfrau +except ModuleNotFoundError: + from xfel.util.jungfrau import get_pedestalRMS_from_jungfrau from simtbx.nanoBragg.utils import downsample_spectrum from dials.array_family import flex from simtbx.diffBragg import utils @@ -137,6 +141,8 @@ def __init__(self, params): self.all_freq = None # flag for the h,k,l frequency of the observed pixel self.best_model = None # best model value at each pixel self.best_model_includes_background = False # whether the best model includes the background scattering estimate + self.all_nominal_hkl_p1 = None # nominal p1 hkl at each pixel + self.all_nominal_hkl = None # nominal hkl at each pixel self.all_data =None # data at each pixel (photon units) self.all_sigma_rdout = None # this is either a float or an array. 
if the phil param use_perpixel_dark_rms=True, then these are different per pixel, per shot self.all_gain = None # gain value per pixel (used during diffBragg/refiners/stage_two_refiner) @@ -146,6 +152,7 @@ def __init__(self, params): self.all_fast =None # fast-scan coordinate per pixel self.all_slow =None # slow-scan coordinate per pixel self.all_pid = None # panel id per pixel + self.all_zscore = None # the estimated z-score values for each pixel, updated each iteration in the Target class self.rois=None # region of interest (per spot) self.pids=None # panel id (per spot) self.tilt_abc=None # background plane constants (per spot), a,b are fast,slow scan components, c is offset @@ -158,6 +165,7 @@ def __init__(self, params): self.exper_name = None # optional name specifying where dxtbx.model.Experiment was loaded from self.refl_name = None # optional name specifying where dials.array_family.flex.reflection_table refls were loaded from self.spec_name = None # optional name specifying spectrum file(.lam) + self.exper_idx = 0 # optional number specifying the index of the experiment in the experiment list self.rank = 0 # in case DataModelers are part of an MPI program, have a rank attribute for record keeping self.Hi = None # miller index (P1) @@ -171,6 +179,49 @@ def __init__(self, params): "Hi", "Hi_asu", "roi_id", "params", "all_pid", "all_fast", "all_slow", "best_model_includes_background", "all_q_perpix", "all_sigma_rdout"] + def filter_pixels(self, thresh): + assert self.roi_id is not None + assert self.all_trusted is not None + assert self.all_zscore is not None + + if not hasattr(self, 'roi_id_slices') or self.roi_id_slices is None: + self.set_slices('roi_id') + + ntrust = self.all_trusted.sum() + + sigz_per_shoebox = [] + for roi_id in self.roi_id_unique: + slcs = self.roi_id_slices[roi_id] + Zs = [] + for slc in slcs: + trusted = self.all_trusted[slc] + Zs += list(self.all_zscore[slc][trusted]) + if not Zs: + sigz = np.nan + else: + sigz = np.std(Zs) + sigz_per_shoebox.append(sigz) + if np.all(np.isnan(sigz_per_shoebox)): + MAIN_LOGGER.debug("All shoeboxes are nan, nothing to filter") + return + med_sigz = np.median([sigz for sigz in sigz_per_shoebox if not np.isnan(sigz)]) + sigz_per_shoebox = np.nan_to_num(sigz_per_shoebox, nan=med_sigz) + shoebox_is_bad = utils.is_outlier(sigz_per_shoebox, thresh) + nbad_pix = 0 + for i_roi, roi_id in enumerate(self.roi_id_unique): + if shoebox_is_bad[i_roi]: + for slc in self.roi_id_slices[roi_id]: + nbad_pix += slc.stop - slc.start + self.all_trusted[slc] = False + + #inds = np.arange(len(self.all_trusted)) + #Zs = self.all_zscore[self.all_trusted] + #bad = utils.is_outlier(Zs, thresh=thresh) + #inds_trusted = inds[self.all_trusted] + #self.all_trusted[inds_trusted[bad]] = False + MAIN_LOGGER.debug("Added %d pixels from %d shoeboxes to the untrusted list (%d / %d trusted pixels remain)" + % (nbad_pix, shoebox_is_bad.sum(), self.all_trusted.sum(), len(self.all_trusted))) + def set_spectrum(self, spectra_file=None, spectra_stride=None, total_flux=None): # note , the following 3 settings will only be used if spectrum_from_imageset is False and gause_spec is False @@ -211,11 +262,11 @@ def at_minimum(self, x, f, accept): #self.target.minima.append((f,self.target.x0,accept)) self.target.lowest_x = x try: - # TODO get SIM and i_exp in here so we can save_up each new global minima! + # TODO get SIM and i_shot in here so we can save_up each new global minima! 
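filter_pixels (added above) marks whole shoeboxes as untrusted when their per-shoebox sigma-Z is an outlier; utils.is_outlier does the real test in diffBragg. A rough numpy sketch of the idea using a simple median/MAD criterion (the numbers and the 0.6745 scaling are illustrative, not the actual implementation):

    import numpy as np

    sigz_per_shoebox = np.array([1.0, 1.1, 0.9, 8.0, 1.05])
    med = np.median(sigz_per_shoebox)
    mad = np.median(np.abs(sigz_per_shoebox - med)) or 1.0
    robust_z = 0.6745 * (sigz_per_shoebox - med) / mad  # modified z-score
    shoebox_is_bad = robust_z > 5.0                     # analogous to the thresh argument
    assert shoebox_is_bad.tolist() == [False, False, False, True, False]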
if f < self.target.lowest_f: self.target.lowest_f = f MAIN_LOGGER.info("New minimum found!") - self.save_up(self.target.x0, SIM, self.rank, i_exp=i_exp) + self.save_up(self.target.x0, SIM, self.rank, i_shot=i_shot) except NameError: pass @@ -275,9 +326,48 @@ def sigma_rdout(self): def clean_up(self, SIM): free_SIM_mem(SIM) - def set_experiment(self, exp, load_imageset=True): + @staticmethod + def exper_json_single_file(exp_file, i_exp=0, check_format=True): + """ + load a single experiment from an exp_file + If working with large combined experiment files, we only want to load + one image at a time on each MPI rank, otherwise at least one rank would need to + load the entire file into memory. + :param exp_file: experiment list file + :param i_exp: experiment id + :param check_format: bool, verifies the format class of the experiment, set to False if loading data from refls + :return: + """ + exper_json = json.load(open(exp_file)) + nexper = len(exper_json["experiment"]) + assert 0 <= i_exp < nexper + + this_exper = exper_json["experiment"][i_exp] + + new_json = {'__id__': "ExperimentList", "experiment": [deepcopy(this_exper)]} + + for model in ['beam', 'detector', 'crystal', 'imageset', 'profile', 'scan', 'goniometer', 'scaling_model']: + if model in this_exper: + model_index = this_exper[model] + new_json[model] = [exper_json[model][model_index]] + new_json["experiment"][0][model] = 0 + else: + new_json[model] = [] + explist = ExperimentListFactory.from_dict(new_json, check_format=check_format) + assert len(explist) == 1 + return explist[0] + + def set_experiment(self, exp, load_imageset=True, exp_idx=0): + """ + :param exp: experiment or filename + :param load_imageset: whether to load the imageset (usually True) + :param exp_idx: index corresponding to experiment in experiment list + """ if isinstance(exp, str): - self.E = ExperimentListFactory.from_json_file(exp, load_imageset)[0] + if not load_imageset: + self.E = ExperimentListFactory.from_json_file(exp, False)[exp_idx] + else: + self.E = self.exper_json_single_file(exp, exp_idx) else: self.E = exp if self.params.opt_det is not None: @@ -290,9 +380,15 @@ def set_experiment(self, exp, load_imageset=True): self.E.beam = opt_beam_E.beam MAIN_LOGGER.info("Set the optimal beam from %s" % self.params.opt_beam) - def load_refls(self, ref): + def load_refls(self, ref, exp_idx=0): + """ + :param ref: reflection table or filename + :param exp_idx: index corresponding to experiment in experiment list + """ if isinstance(ref, str): refls = flex.reflection_table.from_file(ref) + # TODO: is this the proper way to select the id ? + refls = refls.select(refls['id']==exp_idx) else: # assert is a reflection table. .. 
refls = ref @@ -311,6 +407,7 @@ def is_duplicate_hkl(self, refls): return is_duplicate def GatherFromReflectionTable(self, exp, ref, sg_symbol=None): + self.set_experiment(exp, load_imageset=False) self.refls = self.load_refls(ref) nref = len(self.refls) @@ -360,7 +457,8 @@ def GatherFromReflectionTable(self, exp, ref, sg_symbol=None): else: sb_trust = np.logical_or(mask==fg_code, mask==bg_code) - below_zero = sb_bkgrnd <= 0 + # below_zero = sb_bkgrnd <= 0 + below_zero = sb_bkgrnd < 0 if np.any(below_zero): nbelow = np.sum(below_zero) ntot = sb_bkgrnd.size @@ -390,10 +488,19 @@ def GatherFromReflectionTable(self, exp, ref, sg_symbol=None): self.data_to_one_dim(img_data, is_trusted, background) return True - def GatherFromExperiment(self, exp, ref, remove_duplicate_hkl=True, sg_symbol=None): - self.set_experiment(exp, load_imageset=True) + def GatherFromExperiment(self, exp, ref, remove_duplicate_hkl=True, sg_symbol=None, exp_idx=0): + """ + + :param exp: experiment list filename , or experiment object + :param ref: reflection table filename, or reflection table instance + :param remove_duplicate_hkl: search for miller index duplicates and remove + :param sg_symbol: space group lookup symbol P43212 + :param exp_idx: index of the experiment in the experiment list + :return: + """ + self.set_experiment(exp, load_imageset=True, exp_idx=exp_idx) - refls = self.load_refls(ref) + refls = self.load_refls(ref, exp_idx=exp_idx) if len(refls)==0: MAIN_LOGGER.warning("no refls loaded!") return False @@ -471,6 +578,9 @@ def GatherFromExperiment(self, exp, ref, remove_duplicate_hkl=True, sg_symbol=No self.tilt_abc = [abc for i_roi, abc in enumerate(self.tilt_abc) if self.selection_flags[i_roi]] self.pids = [pid for i_roi, pid in enumerate(self.pids) if self.selection_flags[i_roi]] self.tilt_cov = [cov for i_roi, cov in enumerate(self.tilt_cov) if self.selection_flags[i_roi]] + self.Hi =[hi for i_roi, hi in enumerate(self.Hi) if self.selection_flags[i_roi]] + self.Hi_asu =[hi_asu for i_roi, hi_asu in enumerate(self.Hi_asu) if self.selection_flags[i_roi]] + if not self.no_rlp_info: self.Q = [np.linalg.norm(refls[i_roi]["rlp"]) for i_roi in range(len(refls)) if self.selection_flags[i_roi]] @@ -560,8 +670,10 @@ def data_to_one_dim(self, img_data, is_trusted, background): if not self.no_rlp_info: all_q_perpix += [self.Q[i_roi]]*npix if self.Hi is not None: - self.all_nominal_hkl += [tuple(self.Hi[self.refls_idx[i_roi]])]*npix - self.hi_asu_perpix += [self.Hi_asu[self.refls_idx[i_roi]]] * npix + self.all_nominal_hkl += [tuple(self.Hi[i_roi])]*npix + self.hi_asu_perpix += [self.Hi_asu[i_roi]] * npix + #self.all_nominal_hkl += [tuple(self.Hi[i_roi])]*npix + #self.hi_asu_perpix += [self.Hi_asu[i_roi]] * npix if self.params.roi.mask_outside_trusted_range: MAIN_LOGGER.debug("Found %d pixels outside of trusted range" % numOutOfRange) @@ -640,37 +752,32 @@ def dump_gathered_to_refl(self, output_name, do_xyobs_sanity_check=False): shoeboxes.append(sb) R['shoebox'] = flex.shoebox(shoeboxes) + R['id'] = flex.int(len(R), 0) R.as_file(output_name) def set_parameters_for_experiment(self, best=None): + if self.params.symmetrize_Flatt and not self.params.fix.eta_abc: + if not self.params.simulator.crystal.has_isotropic_mosaicity: + raise NotImplementedError("if fix.eta_abc=False and symmetrize_Flatt=True, then eta must be isotropic. 
Set simulator.crystal.has_isotropic_mosaicity=True") ParameterTypes = {"ranged": RangedParameter, "positive": PositiveParameter} ParameterType = RangedParameter # most params currently only this type + if self.params.centers.Nvol is not None: + assert self.params.betas.Nvol is not None + if best is not None: # set the crystal Umat (rotational displacement) and Bmat (unit cell) # Umatrix # NOTE: just set the best Amatrix here - if self.params.apply_best_crystal_model: - xax = col((-1, 0, 0)) - yax = col((0, -1, 0)) - zax = col((0, 0, -1)) - rotX, rotY, rotZ = best[["rotX", "rotY", "rotZ"]].values[0] - RX = xax.axis_and_angle_as_r3_rotation_matrix(rotX, deg=False) - RY = yax.axis_and_angle_as_r3_rotation_matrix(rotY, deg=False) - RZ = zax.axis_and_angle_as_r3_rotation_matrix(rotZ, deg=False) - M = RX * RY * RZ - U = M * sqr(self.E.crystal.get_U()) - self.E.crystal.set_U(U) - - # Bmatrix: - ucparam = best[["a","b","c","al","be","ga"]].values[0] - ucman = utils.manager_from_params(ucparam) - self.E.crystal.set_B(ucman.B_recipspace) + #C = deepcopy(self.E.crystal) + #crystal = self.E.crystal + #self.E.crystal = crystal ## TODO , currently need this anyway ucparam = best[["a","b","c","al","be","ga"]].values[0] ucman = utils.manager_from_params(ucparam) self.E.crystal.set_B(ucman.B_recipspace) + self.E.crystal.set_A(best.Amats.values[0]) # mosaic block self.params.init.Nabc = tuple(best.ncells.values[0]) @@ -693,9 +800,6 @@ def set_parameters_for_experiment(self, best=None): maxs = self.params.maxs centers = self.params.centers betas = self.params.betas - if not self.params.use_restraints or self.params.fix.ucell: - centers.ucell = [1,1,1,1,1,1] - betas.ucell = [1,1,1,1,1,1] fix = self.params.fix types = self.params.types P = Parameters() @@ -707,7 +811,8 @@ def set_parameters_for_experiment(self, best=None): p = ParameterType(init=0, sigma=sigma.RotXYZ[ii], minval=mins.RotXYZ[ii], maxval=maxs.RotXYZ[ii], fix=fix.RotXYZ, name="RotXYZ%d_xtal%d" % (ii,i_xtal), - center=centers.RotXYZ[ii], beta=betas.RotXYZ) + center=0 if betas.RotXYZ is not None else None, + beta=betas.RotXYZ) P.add(p) p = ParameterTypes[types.G](init=init.G + init.G*0.01*i_xtal, sigma=sigma.G, @@ -746,33 +851,38 @@ def set_parameters_for_experiment(self, best=None): p = ParameterTypes[types.Nabc](init=init.Nabc[ii], sigma=sigma.Nabc[ii], minval=mins.Nabc[ii], maxval=maxs.Nabc[ii], fix=fix_Nabc[ii], name="Nabc%d" % (ii,), - center=centers.Nabc[ii], beta=betas.Nabc[ii]) + center=centers.Nabc[ii] if centers.Nabc is not None else None, + beta=betas.Nabc[ii] if betas.Nabc is not None else None) P.add(p) p = ParameterType(init=init.Ndef[ii], sigma=sigma.Ndef[ii], minval=mins.Ndef[ii], maxval=maxs.Ndef[ii], fix=fix.Ndef, name="Ndef%d" % (ii,), - center=centers.Ndef[ii], beta=betas.Ndef[ii]) + center=centers.Ndef[ii] if centers.Ndef is not None else None, + beta=betas.Ndef[ii] if betas.Ndef is not None else None) P.add(p) # diffuse gamma and sigma p = ParameterTypes[types.diffuse_gamma](init=init.diffuse_gamma[ii], sigma=sigma.diffuse_gamma[ii], minval=mins.diffuse_gamma[ii], maxval=maxs.diffuse_gamma[ii], fix=fix_difgam[ii], name="diffuse_gamma%d" % (ii,), - center=centers.diffuse_gamma[ii], beta=betas.diffuse_gamma[ii]) + center=centers.diffuse_gamma[ii] if centers.diffuse_gamma is not None else None, + beta=betas.diffuse_gamma[ii] if betas.diffuse_gamma is not None else None) P.add(p) p = ParameterTypes[types.diffuse_sigma](init=init.diffuse_sigma[ii], sigma=sigma.diffuse_sigma[ii], minval=mins.diffuse_sigma[ii], 
maxval=maxs.diffuse_sigma[ii], fix=fix_difsig[ii], name="diffuse_sigma%d" % (ii,), - center=centers.diffuse_sigma[ii], beta=betas.diffuse_sigma[ii]) + center=centers.diffuse_sigma[ii] if centers.diffuse_sigma is not None else None, + beta=betas.diffuse_sigma[ii] if betas.diffuse_sigma is not None else None) P.add(p) # mosaic spread (mosaicity) p = ParameterType(init=init.eta_abc[ii], sigma=sigma.eta_abc[ii], minval=mins.eta_abc[ii], maxval=maxs.eta_abc[ii], fix=fix_eta[ii], name="eta_abc%d" % (ii,), - center=centers.eta_abc[ii], beta=betas.eta_abc[ii]) + center=centers.eta_abc[ii] if centers.eta_abc is not None else None, + beta=betas.eta_abc[ii] if betas.eta_abc is not None else None) P.add(p) ucell_man = utils.manager_from_crystal(self.E.crystal) @@ -781,40 +891,29 @@ def set_parameters_for_experiment(self, best=None): if "Ang" in name: minval = val - ucell_vary_perc * val maxval = val + ucell_vary_perc * val - if centers.ucell is not None: - cent = centers.ucell[i_uc] - beta = betas.ucell[i_uc] + if name == 'a_Ang': + cent = centers.ucell_a + beta = betas.ucell_a + elif name== 'b_Ang': + cent = centers.ucell_b + beta = betas.ucell_b else: - if name == 'a_Ang': - cent = centers.ucell_a - beta = betas.ucell_a - elif name== 'b_Ang': - cent = centers.ucell_b - beta = betas.ucell_b - else: - cent = centers.ucell_c - beta = betas.ucell_c - assert cent is not None, "Set the center restraints properly!" - assert beta is not None + cent = centers.ucell_c + beta = betas.ucell_c else: val_in_deg = val * 180 / np.pi minval = (val_in_deg - self.params.ucell_ang_abs) * np.pi / 180. maxval = (val_in_deg + self.params.ucell_ang_abs) * np.pi / 180. - if centers.ucell is not None: - cent = centers.ucell[i_uc]*np.pi / 180. - beta = betas.ucell[i_uc] + if name=='alpha_rad': + cent = centers.ucell_alpha + beta = betas.ucell_alpha + elif name=='beta_rad': + cent = centers.ucell_beta + beta = betas.ucell_beta else: - if name=='alpha_rad': - cent = centers.ucell_alpha - beta = betas.ucell_alpha - elif name=='beta_rad': - cent = centers.ucell_beta - beta = betas.ucell_beta - else: - cent = centers.ucell_gamma - beta = betas.ucell_gamma - assert cent is not None - assert beta is not None + cent = centers.ucell_gamma + beta = betas.ucell_gamma + if cent is not None: cent = cent*np.pi / 180. 
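Because center and beta may now individually be None, the loop added a few lines below enforces that a restraint defines both of them or neither, raising a RuntimeError otherwise. A small sketch of that pairing rule with made-up parameter names and values:

    # (center, beta) pairs; the second entry violates the rule
    restraints = {"Nabc0": (100.0, 1e-3), "ucell_a": (78.9, None)}
    for name, (center, beta) in restraints.items():
        if (center is None) != (beta is None):
            # the real code raises RuntimeError at this point
            print("To use restraints, must specify both center and beta for param %s" % name)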
p = ParameterType(init=val, sigma=sigma.ucell[i_uc], @@ -860,11 +959,13 @@ def set_parameters_for_experiment(self, best=None): # two parameters for optimizing the spectrum p = RangedParameter(init=self.params.init.spec[0], sigma=self.params.sigmas.spec[0], minval=mins.spec[0], maxval=maxs.spec[0], fix=fix.spec, - name="lambda_offset", center=centers.spec[0], beta=betas.spec[0]) + name="lambda_offset", center=centers.spec[0] if centers.spec is not None else None, + beta=betas.spec[0] if betas.spec is not None else None) P.add(p) p = RangedParameter(init=self.params.init.spec[1], sigma=self.params.sigmas.spec[1], minval=mins.spec[1], maxval=maxs.spec[1], fix=fix.spec, - name="lambda_scale", center=centers.spec[1], beta=betas.spec[1]) + name="lambda_scale", center=centers.spec[1] if centers.spec is not None else None, + beta=betas.spec[1] if betas.spec is not None else None) P.add(p) # iterating over this dict is time-consuming when refinine Fhkl, so we split up the names here: @@ -872,6 +973,11 @@ def set_parameters_for_experiment(self, best=None): self.scale_roi_names = [name for name in P if name.startswith("scale_roi")] self.P = P + for name in self.P: + p = self.P[name] + if (p.beta is not None and p.center is None) or (p.center is not None and p.beta is None): + raise RuntimeError("To use restraints, must specify both center and beta for param %s" % name) + def get_data_model_pairs(self, reorder=False): if self.best_model is None: raise ValueError("cannot get the best model, there is no best_model attribute") @@ -923,7 +1029,7 @@ def get_data_model_pairs(self, reorder=False): return ret_subimgs - def Minimize(self, x0, SIM, i_exp=0): + def Minimize(self, x0, SIM, i_shot=0): self.target = target = TargetFunc(SIM=SIM, niter_per_J=self.params.niter_per_J, profile=self.params.profile) # set up the refinement flags @@ -957,7 +1063,7 @@ def Minimize(self, x0, SIM, i_exp=0): assert self.params.hopper_save_freq is None at_min = self.at_minimum - callback_kwargs = {"SIM":SIM, "i_exp": i_exp, "save_freq": self.params.hopper_save_freq} + callback_kwargs = {"SIM":SIM, "i_shot": i_shot, "save_freq": self.params.hopper_save_freq} callback = lambda x: self.callback(x, callback_kwargs) target.terminate_after_n_converged_iterations = self.params.terminate_after_n_converged_iter target.percent_change_of_converged = self.params.converged_param_percent_change @@ -1035,13 +1141,13 @@ def Minimize(self, x0, SIM, i_exp=0): def callback(self, x, kwargs): save_freq = kwargs["save_freq"] - i_exp = kwargs["i_exp"] + i_shot = kwargs["i_shot"] SIM = kwargs["SIM"] target = self.target if save_freq is not None and target.iteration % save_freq==0 and target.iteration> 0: xall = target.x0.copy() xall[target.vary] = x - self.save_up(xall, SIM, rank=self.rank, i_exp=i_exp) + self.save_up(xall, SIM, rank=self.rank, i_shot=i_shot) return rescaled_vals = np.zeros_like(xall) @@ -1094,30 +1200,34 @@ def callback(self, x, kwargs): # at this point prev_iter_vals are the converged parameters! raise StopIteration() # Refinement has reached convergence! - def save_up(self, x, SIM, rank=0, i_exp=0, + def save_up(self, x, SIM, rank=0, i_shot=0, save_fhkl_data=True, save_modeler_file=True, save_refl=True, - save_sim_info=True): + save_sim_info=True, + save_traces=True, + save_pandas=True, save_expt=True): """ - :param x: - :param SIM: - :param rank: - :param i_exp: - :param save_fhkl_data: - :param save_modeler_file: - :param save_refl: - :param save_sim_info: - :return: + :param x: l-bfgs refinement parameters (reparameterized, e.g. 
unbounded) + :param SIM: sim_data.SimData instance + :param rank: MPI rank Id + :param i_shot: shot index for this rank (assuming each rank processes more than one shot, this should increment) + :param save_fhkl_data: whether to write mtz files + :param save_modeler_file: whether to write the DataModeler .npy file (a pickle file) + :param save_refl: whether to write a reflection table for this shot with updated xyzcal.px from diffBragg models + :param save_sim_info: whether to write a text file showing the diffBragg state + :param save_traces: whether to write a text file showing the refinement target functional and sigmaZ per iter + :param save_pandas: whether to write a single-shot pandas dataframe containing optimized diffBragg params + :param save_expt: whether to save a single-shot experiment file for this shot with optimized crystal model + :return: returns the single shot pandas dataframe (whether or not it was written) """ - # TODO optionally create directories assert self.exper_name is not None assert self.refl_name is not None Modeler = self LOGGER = logging.getLogger("refine") Modeler.best_model, _ = model(x, Modeler, SIM, compute_grad=False) Modeler.best_model_includes_background = False - LOGGER.info("Optimized values for i_exp %d:" % i_exp) + LOGGER.info("Optimized values for i_shot %d:" % i_shot) basename = os.path.splitext(os.path.basename(self.exper_name))[0] @@ -1170,26 +1280,30 @@ def save_up(self, x, SIM, rank=0, i_exp=0, scale_facs.append(scale_fac) scale_vars.append(scale_var) is_nominal_hkl.append(asu in all_nominal_hkl) - scale_fname = os.path.join(fhkl_scale_dir, "%s_%s_%d_channel%d_scale.npz"\ - % (Modeler.params.tag, basename, i_exp, i_chan)) + scale_fname = os.path.join(fhkl_scale_dir, "%s_%s_%d_%d_channel%d_scale.npz"\ + % (Modeler.params.tag, basename, i_shot, self.exper_idx, i_chan)) np.savez(scale_fname, asu_hkl=asu_hkls, scale_fac=scale_facs, scale_var=scale_vars, is_nominal_hkl=is_nominal_hkl) # TODO: pretty formatting ? 
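# A minimal sketch (illustration only) of reading one of the per-channel
# *_scale.npz files written just above; the array keys follow the np.savez
# call in save_up, and the filename is a hypothetical example.
import numpy as np
d = np.load("tag_basename_0_0_channel0_scale.npz")
sel = d["is_nominal_hkl"]             # True for HKLs assigned to reflections on this shot
asu_hkl = d["asu_hkl"][sel]           # (N, 3) ASU Miller indices
scale_fac = d["scale_fac"][sel]       # optimized per-HKL scale factors
scale_var = d["scale_var"][sel]       # variance estimates for those scales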
if Modeler.target is not None: - rank_trace_outdir = hopper_io.make_rank_outdir(Modeler.params.outdir, "traces", rank) - trace_path = os.path.join(rank_trace_outdir, "%s_%s_%d_traces.txt" % (Modeler.params.tag, basename, i_exp)) # hop number, gradient descent index (resets with each new hop), target functional trace0, trace1, trace2 = Modeler.target.all_hop_id, Modeler.target.all_f, Modeler.target.all_sigZ - trace_data = np.array([trace0, trace1, trace2]).T - np.savetxt(trace_path, trace_data, fmt="%s") + + if save_traces: + rank_trace_outdir = hopper_io.make_rank_outdir(Modeler.params.outdir, "traces", rank) + trace_path = os.path.join(rank_trace_outdir, "%s_%s_%d_%d_traces.txt" + % (Modeler.params.tag, basename, i_shot, self.exper_idx)) + np.savetxt(trace_path, trace_data, fmt="%s") Modeler.niter = len(trace0) Modeler.sigz = trace2[-1] - hopper_io.save_to_pandas(x, Modeler, SIM, self.exper_name, Modeler.params, Modeler.E, i_exp, self.refl_name, None, rank) + shot_df = hopper_io.save_to_pandas(x, Modeler, SIM, self.exper_name, Modeler.params, Modeler.E, i_shot, + self.refl_name, None, rank, write_expt=save_expt, write_pandas=save_pandas, + exp_idx=self.exper_idx) if isinstance(Modeler.all_sigma_rdout, np.ndarray): data_subimg, model_subimg, trusted_subimg, bragg_subimg, sigma_rdout_subimg = Modeler.get_data_model_pairs() @@ -1208,7 +1322,8 @@ def save_up(self, x, SIM, rank=0, i_exp=0, if save_refl: rank_refls_outdir = hopper_io.make_rank_outdir(Modeler.params.outdir, "refls", rank) - new_refls_file = os.path.join(rank_refls_outdir, "%s_%s_%d.refl" % (Modeler.params.tag, basename, i_exp)) + new_refls_file = os.path.join(rank_refls_outdir, "%s_%s_%d_%d.refl" + % (Modeler.params.tag, basename, i_shot, self.exper_idx)) new_refls = deepcopy(Modeler.refls) has_xyzcal = 'xyzcal.px' in list(new_refls.keys()) if has_xyzcal: @@ -1268,19 +1383,24 @@ def save_up(self, x, SIM, rank=0, i_exp=0, if save_modeler_file: rank_imgs_outdir = hopper_io.make_rank_outdir(Modeler.params.outdir, "imgs", rank) modeler_file = os.path.join(rank_imgs_outdir, - "%s_%s_%d_modeler.npy" % (Modeler.params.tag, basename, i_exp)) + "%s_%s_%d_%d_modeler.npy" + % (Modeler.params.tag, basename, i_shot, self.exper_idx)) np.save(modeler_file, Modeler) if save_sim_info: spectrum_file = os.path.join(rank_imgs_outdir, - "%s_%s_%d_spectra.lam" % (Modeler.params.tag, basename, i_exp)) + "%s_%s_%d_%d_spectra.lam" + % (Modeler.params.tag, basename, i_shot, self.exper_idx)) rank_SIMlog_outdir = hopper_io.make_rank_outdir(Modeler.params.outdir, "simulator_state", rank) - SIMlog_path = os.path.join(rank_SIMlog_outdir, "%s_%s_%d.txt" % (Modeler.params.tag, basename, i_exp)) + SIMlog_path = os.path.join(rank_SIMlog_outdir, "%s_%s_%d_%d.txt" + % (Modeler.params.tag, basename, i_shot, self.exper_idx)) write_SIM_logs(SIM, log=SIMlog_path, lam=spectrum_file) if Modeler.params.refiner.debug_pixel_panelfastslow is not None: # TODO separate diffBragg logger utils.show_diffBragg_state(SIM.D, Modeler.params.refiner.debug_pixel_panelfastslow) + return shot_df + def convolve_model_with_psf(model_pix, J, mod, SIM, PSF=None, psf_args=None, roi_id_slices=None, roi_id_unique=None): @@ -1339,7 +1459,24 @@ def convolve_model_with_psf(model_pix, J, mod, SIM, PSF=None, psf_args=None, return model_pix, J -def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_spectrum=False): +def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_spectrum=False, + update_Fhkl_scales=True): + + if Mod.params.logging.parameters: + 
val_s = "" + for p in Mod.P.values(): + if p.name.startswith("Fhkl_"): + continue + if p.refine: + xval = x[p.xpos] + val = p.get_val(xval) + name = p.name + if name == "detz_shift": + val = val * 1e3 + name = p.name + "_mm" + val_s += "%s=%.3f, " % (name, val) + MAIN_LOGGER.debug(val_s) + pfs = Mod.pan_fast_slow @@ -1351,7 +1488,7 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s if Mod.Fhkl_channel_ids is not None: SIM.D.update_Fhkl_channels(Mod.Fhkl_channel_ids) - if SIM.refining_Fhkl: # once per iteration + if SIM.refining_Fhkl and update_Fhkl_scales: # once per iteration nscales = SIM.Num_ASU*SIM.num_Fhkl_channels current_Fhkl_xvals = x[-nscales:] SIM.Fhkl_scales = SIM.Fhkl_scales_init * np.exp( Mod.params.sigmas.Fhkl *(current_Fhkl_xvals-1)) @@ -1436,6 +1573,7 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s J = np.zeros((nparam-SIM.Num_ASU*SIM.num_Fhkl_channels, npix)) # gradients model_pix = None + #TODO check roiScales mode and if its broken, git rid of it! model_pix_noRoi = None # extract the scale factors per ROI, these might correspond to structure factor intensity scale factors, and quite possibly might result in overfits! @@ -1462,6 +1600,16 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s SIM.D.set_value(ROTY_ID, rotY) SIM.D.set_value(ROTZ_ID, rotZ) + if Mod.params.symmetrize_Flatt: + RXYZU = hopper_io.diffBragg_Umat(rotX, rotY, rotZ, SIM.D.Umatrix) + Cryst = deepcopy(SIM.crystal.dxtbx_crystal) + A = RXYZU * Mod.ucell_man.B_realspace + A_recip = A.inverse().transpose() + Cryst.set_A(A_recip) + symbol = SIM.crystal.space_group_info.type().lookup_symbol() + SIM.D.set_mosaic_blocks_sym(Cryst, symbol , Mod.params.simulator.crystal.num_mosaicity_samples, + refining_eta=not Mod.params.fix.eta_abc) + G = Mod.P["G_xtal%d" % i_xtal] scale = G.get_val(x[G.xpos]) @@ -1663,7 +1811,15 @@ def __call__(self, x, *args, **kwargs): self.all_x.append(self.x0) mod, SIM, compute_grad = args - f, g, modelpix, J, sigZ, debug_s = target_func(self.x0, update_terms, mod, SIM, compute_grad) + f, g, modelpix, J, sigZ, debug_s, zscore_perpix = target_func(self.x0, update_terms, mod, SIM, compute_grad, + return_all_zscores=True) + mod.all_zscore = zscore_perpix + + # filter during refinement? 
+ if mod.params.filter_during_refinement.enable and self.iteration > 0: + if self.iteration % mod.params.filter_during_refinement.after_n == 0: + mod.filter_pixels(thresh=mod.params.filter_during_refinement.threshold) + self.t_per_iter.append(time.time()) if len(self.t_per_iter) > 2: @@ -1683,7 +1839,7 @@ def __call__(self, x, *args, **kwargs): return f -def target_func(x, udpate_terms, mod, SIM, compute_grad=True): +def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores=False): pfs = mod.pan_fast_slow data = mod.all_data sigma_rdout = mod.all_sigma_rdout @@ -1757,15 +1913,17 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True): fLogLike = fLogLike[trusted].sum() # negative log Likelihood target # width of z-score should decrease as refinement proceeds - zscore_sigma = np.std( (resid / np.sqrt(V))[trusted]) + zscore_per = resid/np.sqrt(V) + zscore_sigma = np.std(zscore_per[trusted]) restraint_terms = {} if params.use_restraints: # scale factor restraint for name in mod.non_fhkl_params: p = mod.P[name] - val = p.get_restraint_val(x[p.xpos]) - restraint_terms[name] = val + if p.beta is not None: + val = p.get_restraint_val(x[p.xpos]) + restraint_terms[name] = val if params.centers.Nvol is not None: na,nb,nc = SIM.D.Ncells_abc_aniso @@ -1833,14 +1991,16 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True): # update gradients according to restraints for name in mod.non_fhkl_params: p = mod.P[name] - g[p.xpos] += p.get_restraint_deriv(x[p.xpos]) + if p.beta is not None: + g[p.xpos] += p.get_restraint_deriv(x[p.xpos]) - if not params.fix.perRoiScale: + if not params.fix.perRoiScale: # deprecated ? for name in mod.scale_roi_names: p = mod.P[name] - g[p.xpos] += p.get_restraint_deriv(x[p.xpos]) + if p.beta is not None: + g[p.xpos] += p.get_restraint_deriv(x[p.xpos]) - if params.centers.Nvol is not None: + if params.betas.Nvol is not None: Nmat_inv = np.linalg.inv(Nmat) dVol_dN_vals = [] for i_N in range(6): @@ -1900,7 +2060,11 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True): debug_s = "F=%10.7g sigZ=%10.7g (Fracs of F: %s), |g|=%10.7g" \ % (f, zscore_sigma, restraint_debug_s, gnorm) - return f, g, model_bragg, Jac, zscore_sigma, debug_s + + return_data = f, g, model_bragg, Jac, zscore_sigma, debug_s + if return_all_zscores: + return_data += (zscore_per,) + return return_data def refine(exp, ref, params, spec=None, gpu_device=None, return_modeler=False, best=None, free_mem=True): @@ -2117,13 +2281,30 @@ def generate_gauss_spec(central_en=9500, fwhm=10, res=1, nchan=20, total_flux=1e else: return ens, wt +def downsamp_spec_from_params(params, expt=None, imgset=None, i_img=0): + """ + + :param params: hopper phil params extracted + :param expt: a dxtbx experiment (optional) + :param imgset: an dxtbx imageset (optional) + :param i_img: index of the image in the imageset (only matters if imgset is not None) + :return: dxtbx spectrum with parameters applied + """ + if expt is not None: + dxtbx_spec = expt.imageset.get_spectrum(0) + starting_wave = expt.beam.get_wavelength() + else: + assert imgset is not None + dxtbx_spec = imgset.get_spectrum(i_img) + starting_wave = imgset.get_beam(i_img).get_wavelength() -def downsamp_spec_from_params(params, expt): - dxtbx_spec = expt.imageset.get_spectrum(0) spec_en = dxtbx_spec.get_energies_eV() spec_wt = dxtbx_spec.get_weights() if params.downsamp_spec.skip: spec_wave = utils.ENERGY_CONV / spec_en.as_numpy_array() + stride=params.simulator.spectrum.stride + spec_wave = spec_wave[::stride] + 
spec_wt = spec_wt[::stride] spectrum = list(zip(spec_wave, spec_wt)) else: spec_en = dxtbx_spec.get_energies_eV() @@ -2149,11 +2330,12 @@ def downsamp_spec_from_params(params, expt): downsamp_wave = utils.ENERGY_CONV / downsamp_en spectrum = list(zip(downsamp_wave, downsamp_wt)) # the nanoBragg beam has an xray_beams property that is used internally in diffBragg - starting_wave = expt.beam.get_wavelength() waves, specs = map(np.array, zip(*spectrum)) ave_wave = sum(waves*specs) / sum(specs) - expt.beam.set_wavelength(ave_wave) - MAIN_LOGGER.debug("Shifting wavelength from %f to %f" % (starting_wave, ave_wave)) + MAIN_LOGGER.debug("Starting wavelength=%f. Spectrum ave wavelength=%f" % (starting_wave, ave_wave)) + if expt is not None: + expt.beam.set_wavelength(ave_wave) + MAIN_LOGGER.debug("Shifting expt wavelength from %f to %f" % (starting_wave, ave_wave)) MAIN_LOGGER.debug("USING %d ENERGY CHANNELS" % len(spectrum)) return spectrum @@ -2165,6 +2347,9 @@ def downsamp_spec(SIM, params, expt, return_and_dont_set=False): spec_wt = SIM.dxtbx_spec.get_weights() if params.downsamp_spec.skip: spec_wave = utils.ENERGY_CONV / spec_en.as_numpy_array() + stride = params.simulator.spectrum.stride + spec_wave = spec_wave[::stride] + spec_wt = spec_wt[::stride] SIM.beam.spectrum = list(zip(spec_wave, spec_wt)) else: spec_en = SIM.dxtbx_spec.get_energies_eV() @@ -2195,6 +2380,7 @@ def downsamp_spec(SIM, params, expt, return_and_dont_set=False): ave_wave = sum(waves*specs) / sum(specs) expt.beam.set_wavelength(ave_wave) MAIN_LOGGER.debug("Shifting wavelength from %f to %f" % (starting_wave, ave_wave)) + MAIN_LOGGER.debug("Using %d energy channels" % len(SIM.beam.spectrum)) if return_and_dont_set: return SIM.beam.spectrum else: @@ -2234,12 +2420,12 @@ def set_gauss_spec(SIM=None, params=None, E=None): def sanity_test_input_lines(input_lines): for line in input_lines: - line_fields = line.strip().split() - if len(line_fields) not in [2, 3]: + line_items = line.strip().split() + if len(line_items) not in [2, 3, 4]: raise IOError("Input line %s is not formatted properly" % line) - for fname in line_fields: - if not os.path.exists(fname): - raise FileNotFoundError("File %s does not exist" % fname) + for item in line_items: + if os.path.isfile(item) and not os.path.exists(item): + raise FileNotFoundError("File %s does not exist" % item) def full_img_pfs(img_sh): @@ -2304,10 +2490,19 @@ def get_simulator_for_data_modelers(data_modeler): if self.params.diffuse_stencil_size > 0: SIM.D.stencil_size = self.params.diffuse_stencil_size MAIN_LOGGER.debug("Set diffuse stencil size: %d" % SIM.D.stencil_size) + if self.params.diffuse_orientation == 1: + ori = (1,0,0,0,1,0,0,0,1) + else: + a = 1/np.sqrt(2) + ori = a, a, 0.0, a, a, 0.0, 0.0, 0.0, 1.0 + SIM.D.set_rotate_principal_axes(ori) SIM.D.gamma_miller_units = self.params.gamma_miller_units SIM.isotropic_diffuse_gamma = self.params.isotropic.diffuse_gamma SIM.isotropic_diffuse_sigma = self.params.isotropic.diffuse_sigma + if self.params.record_device_timings: + SIM.D.record_timings = True + # TODO: use data_modeler.set_spectrum instead if self.params.spectrum_from_imageset: downsamp_spec(SIM, self.params, self.E) elif self.params.gen_gauss_spec: diff --git a/simtbx/diffBragg/mpi_logger.py b/simtbx/diffBragg/mpi_logger.py index 8900ea8d54..c81cffd4d9 100644 --- a/simtbx/diffBragg/mpi_logger.py +++ b/simtbx/diffBragg/mpi_logger.py @@ -20,7 +20,7 @@ LEVELS = {"low": logging.WARNING, "normal": logging.INFO, "high": logging.DEBUG} DETAILED_FORMAT = 'RANK%d:%s | ' % 
(COMM.rank, HOST) + '%(asctime)s | %(filename)s:%(funcName)s >> %(message)s' -SIMPLE_FORMAT = "RANK%04d "%(COMM.rank)+"%(message)s" +SIMPLE_FORMAT = "RANK%04d "%(COMM.rank)+"| %(asctime)s | %(message)s" from simtbx.diffBragg import utils @@ -61,14 +61,16 @@ def setup_logging_from_params(params): utils.safe_makedirs(params.outdir) COMM.barrier() main_level = LEVELS[params.logging.logfiles_level] + logfile_name = params.logging.log_hostname*(HOST+"-") + params.logging.logname main_logger = _make_logger("diffBragg.main", - os.path.join(params.outdir, HOST+"-"+params.logging.logname), + os.path.join(params.outdir, logfile_name), level=main_level, overwrite=params.logging.overwrite, formatter=logging.Formatter(DETAILED_FORMAT)) + profile_name = params.logging.log_hostname*(HOST+"-") + params.profile_name _make_logger("diffBragg.profile", - os.path.join(params.outdir, HOST+"-"+params.profile_name), + os.path.join(params.outdir, profile_name), level=logging.INFO, overwrite=params.logging.overwrite, formatter=logging.Formatter(SIMPLE_FORMAT)) diff --git a/simtbx/diffBragg/phil.py b/simtbx/diffBragg/phil.py index 806780f47c..b6558f77fb 100644 --- a/simtbx/diffBragg/phil.py +++ b/simtbx/diffBragg/phil.py @@ -6,6 +6,53 @@ #''' hopper_phil = """ + +filter_during_refinement { + enable = False + .type = bool + .help = if True, filtering will occur each N iterations, controlled by parameter after_n + after_n = 50 + .type = int + .help = refiner will pause and check for outliers every after_n iterations + threshold = 20 + .type = float + .help = outliers are detected by looking at the distribution of per shoebox sigmaZ + .help = and then using a median absolute deviation filter. Lower values of threshold will flag more pixels as outliers +} + +filter_after_refinement { + enable = False + .type = bool + .help = if True, filter, then rerun refinement if certain conditions are met (e.g. too few refinement iterations) + max_attempts = 2 + .type = int + .help = how many additional times to run hopper + min_prev_niter = 50 + .type = int + .help = only repeat if the previous refinement was fewer than this many iterations + max_prev_sigz = 10 + .type = float + .help = only repeat if the previous refinement had sigma Z more than this + threshold = 20 + .type = float + .help = outliers are detected by looking at the distribution of per shoebox sigmaZ + .help = and then using a median absolute deviation filter. Lower values of threshold will flag more pixels as outliers +} + +symmetrize_Flatt = False + .type = bool + .help = If True, add 3-fold symmetric mosaic blocks to the calculation of F_latt +record_device_timings = False + .type = bool + .help = Record the execution times of diffBragg host-dev copies and kernel executions + .help = the results will be printed to the terminal +consider_multicrystal_shots = False + .type = bool + .help = If True, and if there are multiple crystals in the experiment list, + .help = then try to model all crystals for a given shot. +debug_mode = False + .type = bool + .help = If True, many output files are written to explore the diffBragg models in great detail nominal_Fhkl_only = True .type = bool .help = if refining Fhkls, only refine the ones that are assigned to a reflection table... 
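The threshold values in the filter_during_refinement and filter_after_refinement scopes above are expressed in units of a median-absolute-deviation cut on the per-shoebox sigmaZ distribution. The actual filter_pixels implementation is not part of this hunk, so the helper below is only a sketch of that criterion (the 1.4826 factor converts MAD to a robust sigma, and the function name is hypothetical); lower thresholds flag more shoeboxes.

    import numpy as np

    def flag_outlier_shoeboxes(sigZ_per_shoebox, thresh=20):
        """Return a boolean mask of shoeboxes whose sigmaZ deviates from the
        median by more than `thresh` robust standard deviations (MAD filter)."""
        sigZ = np.asarray(sigZ_per_shoebox, dtype=float)
        med = np.median(sigZ)
        robust_sigma = 1.4826 * np.median(np.abs(sigZ - med))
        return np.abs(sigZ - med) > thresh * robust_sigma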
@@ -121,12 +168,6 @@ .help = final resolution of downsampled spectrum in eV .expert_level=0 } -apply_best_crystal_model = False - .type = bool - .help = depending on what experiments in the exper refl file, one may want - .help = to apply the optimal crystal transformations (this parameter only matters - .help = if params.best_pickle is not None) - .expert_level=10 filter_unpredicted_refls_in_output = True .type = bool .help = filter reflections in the output refl table for which there was no model bragg peak @@ -192,41 +233,41 @@ ucell_gamma = None .type = float .help = restraint variance for unit cell gamma angle - Nvol = 1e8 + Nvol = None .type = float .help = tightness of the Nabc volume contraint - detz_shift = 1e8 + detz_shift = None .type = float .help = restraint variance for detector shift target - ucell = [1e8,1e8,1e8,1e8,1e8,1e8] + ucell = None .type = floats .help = DEPRECATED: use e.g. betas.ucell_a instead .help = variances for unit cell constants in order determined by unit cell manager class (see diffBragg/refiners/crystal_systems) - RotXYZ = 1e8 + RotXYZ = None .type = float .help = restraint factor for the rotXYZ restraint - Nabc = [1e8,1e8,1e8] + Nabc = None .type = floats(size=3) .help = restraint factor for the ncells abc - Ndef = [1e8,1e8,1e8] + Ndef = None .type = floats(size=3) .help = restraint factor for the ncells def - diffuse_sigma = 1e8,1e8,1e8 + diffuse_sigma = None .type = floats(size=3) .help = restraint factor for diffuse sigma - diffuse_gamma = 1e8,1e8,1e8 + diffuse_gamma = None .type = floats(size=3) .help = restraint factor for diffuse gamma - G = 1e8 + G = None .type = float .help = restraint factor for the scale G - B = 1e8 + B = None .type = float .help = restraint factor for Bfactor - eta_abc = [1e8,1e8,1e8] + eta_abc = None .type = floats(size=3) .help = restrain factor for mosaic spread angles - spec = [1e8,1e8] + spec = None .type = floats(size=2) .help = restraint factor for spectrum coefs Fhkl = None @@ -276,38 +317,34 @@ Nvol = None .type = float .help = if provided, constrain the product Na*Nb*Nc to this value - detz_shift = 0 + detz_shift = None .type = float .help = restraint target for detector shift along z-direction - ucell = [63.66, 28.87, 35.86, 1.8425] - .type = floats - .help = DEPRECATED: use e.g. 
betas.ucell_a instead - .help = centers for unit cell constants in order determined by unit cell manager class (see diffBragg/refiners/crystal_systems) - RotXYZ = [0,0,0] + RotXYZ = None .type = floats(size=3) .help = restraint target for Umat rotations - Nabc = [100,100,100] + Nabc = None .type = floats(size=3) .help = restraint target for Nabc - Ndef = [0,0,0] + Ndef = None .type = floats(size=3) .help = restraint target for Ndef - diffuse_sigma = [1,1,1] + diffuse_sigma = None .type = floats(size=3) .help = restraint target for diffuse sigma - diffuse_gamma = [1,1,1] + diffuse_gamma = None .type = floats(size=3) .help = restraint target for diffuse gamma - G = 100 + G = None .type = float .help = restraint target for scale G - B = 0 + B = None .type = float .help = restraint target for Bfactor - eta_abc = [0,0,0] + eta_abc = None .type = floats(size=3) .help = restraint target for mosaic spread angles in degrees - spec = [0,1] + spec = None .type = floats(size=2) .help = restraint target for specturm correction (0 + 1*Lambda ) } @@ -402,9 +439,9 @@ diffuse_gamma = [1,1,1] .type = floats(size=3) .help = sensitivity for diffuse gamma - RotXYZ = [1,1,1] + RotXYZ = [1e-3,1e-3,1e-3] .type = floats(size=3) - .help = sensitivity for RotXYZ + .help = sensitivity for RotXYZ in radians G = 1 .type = float .help = sensitivity for scale factor @@ -451,7 +488,7 @@ .help = init for diffuse gamma RotXYZ = [0,0,0] .type = floats(size=3) - .help = init for RotXYZ + .help = init for RotXYZ in radians G = 1 .type = float .help = init for scale factor @@ -482,9 +519,9 @@ diffuse_gamma = [0,0,0] .type = floats(size=3) .help = min for diffuse gamma - RotXYZ = [-1,-1,-1] + RotXYZ = [-3.1415926, -3.1415926, -3.1415926] .type = floats(size=3) - .help = min for rotXYZ in degrees + .help = min for rotXYZ in radians G = 0 .type = float .help = min for scale G @@ -520,12 +557,12 @@ diffuse_sigma = [20,20,20] .type = floats(size=3) .help = max diffuse sigma - diffuse_gamma = [1000,1000,1000] + diffuse_gamma = [10000,10000,10000] .type = floats(size=3) .help = max for diffuse gamma - RotXYZ = [1,1,1] + RotXYZ = [3.1415926, 3.1415926, 3.1415926] .type = floats(size=3) - .help = max for rotXYZ in degrees + .help = max for rotXYZ in radians G = 1e12 .type = float .help = max for scale G @@ -582,7 +619,7 @@ ucell = False .type = bool .help = fix the unit cell during refinement - detz_shift = False + detz_shift = True .type = bool .help = fix the detector distance shift during refinement } @@ -611,6 +648,9 @@ .type = int .help = Increase to add accuracy to diffuse scattering models, at the expense of longer computations .help = Best to increment by values of 1 when testing +diffuse_orientation = 1 + .type = int + .help = orient the diffuse scattering features. 
0 is along (a-b, a+b, c), 1 is along (a,b,c) symmetrize_diffuse = True .type = bool .help = use the laue group rotation operators to symmetrize diffuse signals @@ -718,6 +758,9 @@ .type = str .help = if logfiles=True, then write the log to this file, stored in the folder specified by outdir .help = if None, then defaults to main_stage1.log for hopper, main_pred.log for prediction, main_stage2.log for stage_two + log_hostname = True + .type = bool + .help = prefix logfiles with host name } profile = False .type = bool diff --git a/simtbx/diffBragg/prep_stage2_input.py b/simtbx/diffBragg/prep_stage2_input.py index 205128234f..2c08657a5c 100644 --- a/simtbx/diffBragg/prep_stage2_input.py +++ b/simtbx/diffBragg/prep_stage2_input.py @@ -4,6 +4,7 @@ import numpy as np from dials.array_family import flex from libtbx.mpi4py import MPI +from simtbx.diffBragg import utils COMM = MPI.COMM_WORLD import logging @@ -27,20 +28,59 @@ def get_equal_partition(weights, partitions): load[lightest] += weights[idx] return distribution -def prep_dataframe(df, refls_key="predictions"): +def prep_dataframe(df, refls_key="predictions", res_ranges_string=None): + """ + + :param df: input pandas dataframe for stage2 + :param refls_key: column in df containing the reflection filenames + :param res_ranges_string: optional res_ranges_string phil param (params.refiner.res_ranges) + :return: + """ # TODO make sure all pred files exist + + res_ranges = None + if res_ranges_string is not None: + res_ranges = utils.parse_reso_string(res_ranges_string) + + if refls_key not in list(df): + raise KeyError("Dataframe has no key %s" % refls_key) nshots = len(df) - refls_names = df[refls_key] + df.reset_index(drop=True, inplace=True) + df['index'] = df.index.values + refl_info = df[["index", refls_key, "exp_idx"]].values + + # sort and split such that each rank will read many refls from same file + sorted_names_and_ids = sorted( + refl_info, + key=lambda x: x[1]) # sort by name + df_idx, refl_names, expt_ids = np.array_split(sorted_names_and_ids, COMM.size)[COMM.rank].T + refls_per_shot = [] if COMM.rank==0: LOGGER.info("Loading nrefls per shot") - for i_shot, name in enumerate(refls_names): - if i_shot % COMM.size != COMM.rank: - continue - R = flex.reflection_table.from_file(name) - if len(R)==0: - LOGGER.critical("Reflection %s has 0 reflections !" % (name, len(R))) - refls_per_shot.append((i_shot, len(R))) + + prev_name = "" # keep track of the most recently read refl table file + Rall = None + for (i_shot, name, expt_id) in zip(df_idx, refl_names, expt_ids): + if Rall is None or name != prev_name: + Rall = flex.reflection_table.from_file(name) + prev_name = name + + R = Rall.select(Rall['id'] == int(expt_id)) + if res_ranges is not None: + num_ref = 0 + if 'rlp' not in set(R.keys()): + raise KeyError("Cannot filter res ranges if rlp column not in refl tables") + d = 1. / np.linalg.norm(R["rlp"], axis=1) # resolution per refl + for d_fine, d_coarse in res_ranges: + d_sel = np.logical_and(d >= d_fine, d < d_coarse) + num_ref += d_sel.sum() + else: + num_ref = len(R) + + if num_ref==0: + LOGGER.critical("Reflection %s id=%d has 0 reflections !" 
% (name, expt_id, num_ref)) + refls_per_shot.append((i_shot, num_ref)) refls_per_shot = COMM.reduce(refls_per_shot, root=0) work_distribution = None diff --git a/simtbx/diffBragg/refiners/base_refiner.py b/simtbx/diffBragg/refiners/base_refiner.py index c74cd44436..f5179e95fa 100644 --- a/simtbx/diffBragg/refiners/base_refiner.py +++ b/simtbx/diffBragg/refiners/base_refiner.py @@ -210,12 +210,14 @@ def run(self, setup=True, setup_only=False): else: try: self.diag_mode = None + self.minimizer = scitbx.lbfgs.run( target_evaluator=self, core_params=self._core_param, exception_handling_params=self._handler, termination_params=self._terminator, gradient_only=self.gradient_only) + except BreakToUseCurvatures: self.hit_break_to_use_curvatures = True pass diff --git a/simtbx/diffBragg/refiners/stage_two_refiner.py b/simtbx/diffBragg/refiners/stage_two_refiner.py index b877c40f70..3e7beb41fa 100644 --- a/simtbx/diffBragg/refiners/stage_two_refiner.py +++ b/simtbx/diffBragg/refiners/stage_two_refiner.py @@ -11,6 +11,8 @@ import warnings import signal import logging +from copy import deepcopy +from simtbx.diffBragg import hopper_io LOGGER = logging.getLogger("diffBragg.main") warnings.filterwarnings("ignore") @@ -53,7 +55,6 @@ class Bcolors: from simtbx.diffBragg.refiners import BreakBecauseSignal, BreakToUseCurvatures from dials.array_family import flex from simtbx.diffBragg.refiners import BaseRefiner -from collections import Counter from cctbx import miller, sgtbx from simtbx.diffBragg.refiners.parameters import RangedParameter @@ -74,11 +75,10 @@ def __init__(self, shot_modelers, sgsymbol, params): self.save_model_freq = self.params.refiner.stage_two.save_model_freq self.use_nominal_h = self.params.refiner.stage_two.use_nominal_hkl - self.saveZ_freq = self.params.refiner.stage_two.save_Z_freq # save Z-score data every N iterations + self.saveZ_freq = self.params.refiner.stage_two.save_Z_freq # save Z-score data every N function calls self.break_signal = None # check for this signal during refinement, and break refinement if signal is received (see python signal module) TODO: make work with MPI self.save_model = False # whether to save the model - self.idx_from_asu = {} # maps global fcell index to asu hkl - self.asu_from_idx = {} # maps asu hkl to global fcell index + self.hiasu = None # stores Hi_asu, counts, maps to and from fcell indices self.rescale_params = True # whether to rescale parameters during refinement # TODO this will always be true, so remove the ability to disable self.request_diag_once = False # LBFGS refiner property self.min_multiplicity = self.params.refiner.stage_two.min_multiplicity @@ -91,6 +91,7 @@ def __init__(self, shot_modelers, sgsymbol, params): self.use_curvatures_threshold = 7 # how many positive curvature iterations required before breaking, after which simulation can be restart with use_curvatures=True self.verbose = True # whether to print during iterations self.iterations = 0 # iteration counter , used internally + self.target_eval_count = 0 # target function evaluation counter, used internally self.shot_ids = None # for global refinement , self.log2pi = np.log(np.pi*2) @@ -151,7 +152,7 @@ def n(self): @property def n_global_fcell(self): - return len(self.idx_from_asu) + return self.hiasu.present_len @property def image_shape(self): @@ -190,16 +191,20 @@ def make_output_dir(self): self.Zdir = os.path.join(self.output_dir, "Z") self.model_dir = os.path.join(self.output_dir, "model") for dirname in (self.Zdir, self.model_dir): - if self.I_AM_ROOT and not 
os.path.exists(dirname): + if self.params.debug_mode and self.I_AM_ROOT and not os.path.exists(dirname): os.makedirs(dirname) COMM.barrier() def _setup(self): # Here we go! https://youtu.be/7VvkXA6xpqI + if not self.params.debug_mode: + LOGGER.info("Disabling saveZ and save_model because debug_mode=False") + self.saveZ_freq = None + self.save_model_freq = None LOGGER.info("Setup begins!") - if self.refine_Fcell and not self.asu_from_idx: + if self.refine_Fcell and not self.hiasu.from_idx: raise ValueError("Need to supply a non empty asu from idx map") - if self.refine_Fcell and not self.idx_from_asu: # # TODO just derive from its inverse + if self.refine_Fcell and not self.hiasu.to_idx: raise ValueError("Need to supply a non empty idx from asu map") self.make_output_dir() @@ -240,18 +245,12 @@ def _setup(self): self._setup_ncells_refinement_parameters() self._track_num_times_pixel_was_modeled() - self.hkl_totals = [] - if self.refine_Fcell: - for i_shot in self.shot_ids: - for i_h, h in enumerate(self.Modelers[i_shot].Hi_asu): - self.hkl_totals.append(self.idx_from_asu[h]) - self.hkl_totals = self._MPI_reduce_broadcast(self.hkl_totals) - + self._setup_nominal_hkl_p1() self._MPI_setup_global_params() self._MPI_sync_fcell_parameters() # reduce then broadcast fcell LOGGER.info("--combining parameters across ranks") - self._MPI_sync_hkl_freq() + self._MPI_sync_hkl_freq() # FIXME does this do absolutely anything? if self.x_init is not None: LOGGER.info("Initializing with provided x_init array") @@ -270,7 +269,7 @@ def _setup(self): for sid in self.shot_ids: Modeler = self.Modelers[sid] - Modeler.all_fcell_global_idx = np.array([self.idx_from_asu[h] for h in Modeler.hi_asu_perpix]) + Modeler.all_fcell_global_idx = np.array([self.hiasu.to_idx[h] for h in Modeler.hi_asu_perpix]) Modeler.unique_i_fcell = set(Modeler.all_fcell_global_idx) Modeler.i_fcell_slices = self._get_i_fcell_slices(Modeler) self.Modelers[sid] = Modeler # TODO: VERIFY IF THIS IS NECESSARY ? 
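For readers following the refactor from the old idx_from_asu / asu_from_idx dictionaries to the single self.hiasu object: this refiner only reads four of its attributes (to_idx, from_idx, present_len and present_idx_counter). The real container is defined outside this patch, so the class below is only a minimal stand-in built from the per-shot ASU indices, assuming those four attributes are all that matter.

    from collections import Counter

    class HiAsuMaps:
        """Minimal stand-in exposing the attributes stage_two_refiner reads."""
        def __init__(self, hi_asu_per_shot):
            # hi_asu_per_shot: iterable over shots, each an iterable of ASU hkl tuples
            all_hkl = [h for shot in hi_asu_per_shot for h in shot]
            unique_hkl = sorted(set(all_hkl))
            self.from_idx = {i: h for i, h in enumerate(unique_hkl)}  # fcell index -> ASU hkl
            self.to_idx = {h: i for i, h in enumerate(unique_hkl)}    # ASU hkl -> fcell index
            self.present_len = len(unique_hkl)                        # backs n_global_fcell
            # multiplicity per fcell index, used above as self.hkl_frequency
            self.present_idx_counter = Counter(self.to_idx[h] for h in all_hkl)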
@@ -349,7 +348,7 @@ def _make_p1_equiv_mapping(self): self.num_equivs_for_i_fcell = {} self.update_indices = [] for i_fcell in range(self.n_global_fcell): - hkl_asu = self.asu_from_idx[i_fcell] + hkl_asu = self.hiasu.from_idx[i_fcell] equivs = [i.h() for i in miller.sym_equiv_indices(self.space_group, hkl_asu).indices()] self.num_equivs_for_i_fcell[i_fcell] = len(equivs) @@ -360,27 +359,45 @@ def _MPI_setup_global_params(self): if self.I_AM_ROOT: LOGGER.info("--2 Setting up global parameters") if self.output_dir is not None: - np.save(os.path.join(self.output_dir, "f_asu_map"), self.asu_from_idx) + np.save(os.path.join(self.output_dir, "f_asu_map"), self.hiasu.from_idx) self._setup_fcell_params() + def _setup_nominal_hkl_p1(self): + Omatrix = np.reshape(self.S.crystal.Omatrix.elems, [3, 3]) + for i_shot in self.Modelers: + MOD = self.Modelers[i_shot] + nom_h = MOD.all_nominal_hkl + nom_h_p1 = np.dot(nom_h, Omatrix).astype(np.int32) + nom_h_p1 = list(map(tuple, nom_h_p1)) + self.Modelers[i_shot].all_nominal_hkl_p1 = nom_h_p1 + def _setup_fcell_params(self): if self.refine_Fcell: LOGGER.info("----loading fcell data") # this is the number of observations of hkl (accessed like a dictionary via global_fcell_index) LOGGER.info("---- -- counting hkl totes") LOGGER.info("compute HKL multiplicity") - self.hkl_frequency = Counter(self.hkl_totals) + self.hkl_frequency = self.hiasu.present_idx_counter LOGGER.info("save HKL multiplicity") np.save(os.path.join(self.output_dir, "f_asu_multi"), self.hkl_frequency) LOGGER.info("Done ") LOGGER.info("local refiner symbol=%s ; nanoBragg crystal symbol: %s" % (self.symbol, self.S.crystal.symbol)) - ma = self.S.crystal.miller_array_high_symmetry.map_to_asu() + self.fcell_init_from_i_fcell = [] + ma = self.S.crystal.miller_array LOGGER.info("make an Fhkl map") ma_map = {h: d for h,d in zip(ma.indices(), ma.data())} - LOGGER.info("make fcell_init") - self.fcell_init_from_i_fcell = np.array([ma_map[self.asu_from_idx[i_fcell]] for i_fcell in range(self.n_global_fcell)]) + Omatrix = np.reshape(self.S.crystal.Omatrix.elems,[3,3]) + + # TODO: Vectorize + for i_fcell in range(self.n_global_fcell): + asu_hkl = self.hiasu.from_idx[i_fcell] # high symmetry + P1_hkl = tuple(np.dot(Omatrix, asu_hkl).astype(int)) + fcell_val = ma_map[P1_hkl] + self.fcell_init_from_i_fcell.append(fcell_val) + self.fcell_init_from_i_fcell = np.array(self.fcell_init_from_i_fcell) + self.fcell_sigmas_from_i_fcell = self.params.sigmas.Fhkl LOGGER.info("DONE make fcell_init") @@ -436,7 +453,9 @@ def _get_ncells_abc(self, i_shot): return vals def _get_eta(self, i_shot): - pass + # NOTE: refinement of eta not supported in this script + vals = [self.Modelers[i_shot].PAR.eta[i_eta].init for i_eta in range(3)] + return vals def _get_spot_scale(self, i_shot): xval = self.x[self.spot_scale_xpos[i_shot]] @@ -465,8 +484,8 @@ def _run_diffBragg_current(self): LOGGER.info("run diffBragg for shot %d" % self._i_shot) pfs = self.Modelers[self._i_shot].pan_fast_slow if self.use_nominal_h: - nom_h = self.Modelers[self._i_shot].all_nominal_hkl - self.D.add_diffBragg_spots(pfs, nom_h) + nom_h_p1 = self.Modelers[self._i_shot].all_nominal_hkl_p1 + self.D.add_diffBragg_spots(pfs, nom_h_p1) else: self.D.add_diffBragg_spots(pfs) LOGGER.info("finished diffBragg for shot %d" % self._i_shot) @@ -506,7 +525,30 @@ def _update_spectra_coefficients(self): pass def _update_eta(self): - pass + + if self.S.umat_maker is not None: + eta_vals = self._get_eta(self._i_shot) + + if not self.D.has_anisotropic_mosaic_spread: + assert 
self.S.Umats_method == 2 + assert len(set(eta_vals))==1 + eta_vals = eta_vals[0] + + LOGGER.info("eta=%f" % eta_vals) + self.S.update_umats_for_refinement(eta_vals) + + def _symmetrize_Flatt(self): + if self.params.symmetrize_Flatt: + # NOTE: RotXYZ refinement disabled for this script, so offsets always 0,0,0 + RXYZU = hopper_io.diffBragg_Umat(0,0,0,self.D.Umatrix) + Cryst = deepcopy(self.S.crystal.dxtbx_crystal) + B_realspace = self.get_refined_Bmatrix(self._i_shot, recip=False) + A = RXYZU * B_realspace + A_recip = A.inverse().transpose() + Cryst.set_A(A_recip) + symbol = self.S.crystal.space_group_info.type().lookup_symbol() + self.D.set_mosaic_blocks_sym(Cryst, symbol , self.params.simulator.crystal.num_mosaicity_samples, + refining_eta=False) # NOTE:no eta refinement in this stage 2 script (possible in ens.hopper) def _set_background_plane(self): self.tilt_plane = self.Modelers[self._i_shot].all_background[self.roi_sel] @@ -626,11 +668,11 @@ def compute_functional_and_gradients(self): t = time.time() out = self._compute_functional_and_gradients() t = time.time()-t - LOGGER.info("TOok %.4f sec to compute functional and grad" % t) + LOGGER.info("Took %.4f sec to compute functional and grad" % t) return out def _compute_functional_and_gradients(self): - LOGGER.info(Bcolors.OKBLUE+"BEGIN FUNC GRAD ; iteration %d" % self.iterations+Bcolors.ENDC) + LOGGER.info(Bcolors.OKBLUE+"BEGIN FUNC GRAD ; Eval %d" % self.target_eval_count+Bcolors.ENDC) #if self.verbose: # self._print_iteration_header() @@ -654,15 +696,15 @@ def _compute_functional_and_gradients(self): LOGGER.info("Iterate over %d shots" % len(self.shot_ids)) self._shot_Zscores = [] - save_model = self.save_model_freq is not None and self.iterations % self.save_model_freq == 0 + save_model = self.save_model_freq is not None and self.target_eval_count % self.save_model_freq == 0 if save_model: - self._save_model_dir = os.path.join(self.model_dir, "iter%d" % self.iterations) + self._save_model_dir = os.path.join(self.model_dir, "eval%d" % self.target_eval_count) - if COMM.rank == 0 and not os.path.exists(self._save_model_dir): + if self.params.debug_mode and COMM.rank == 0 and not os.path.exists(self._save_model_dir): os.makedirs(self._save_model_dir) COMM.barrier() - if self.iterations % self.params.refiner.save_gain_freq == 0: + if self.target_eval_count % self.params.refiner.save_gain_freq == 0: self._save_optimized_gain_map() self.all_sigZ = [] @@ -684,6 +726,7 @@ def _compute_functional_and_gradients(self): self._update_ncells_def() self._update_rotXYZ() self._update_eta() # mosaic spread + self._symmetrize_Flatt() self._update_dxtbx_detector() self._update_sausages() @@ -702,7 +745,7 @@ def _compute_functional_and_gradients(self): self._derivative_convenience_factors() - if self.iterations % self.saveZ_freq == 0: + if self.saveZ_freq is not None and self.target_eval_count % self.saveZ_freq == 0: MOD = self.Modelers[self._i_shot] self._spot_Zscores = [] for i_fcell in MOD.unique_i_fcell: @@ -772,7 +815,7 @@ def _compute_functional_and_gradients(self): tsave = time.time()-tsave LOGGER.info("Time to dump param and Zscore data: %.4f" % tsave) - self.iterations += 1 + self.target_eval_count += 1 self.f_vals.append(self.target_functional) if self.calc_curvatures and not self.use_curvatures: @@ -782,21 +825,24 @@ def _compute_functional_and_gradients(self): LOGGER.info("DONE WITH FUNC GRAD") return self._f, self._g + def callback_after_step(self, minimizer): + self.iterations = minimizer.iter() + def _save_model(self, model_info): 
LOGGER.info("SAVING MODEL FOR SHOT %d" % self._i_shot) df = pandas.DataFrame(model_info) df["shot_id"] = self._i_shot outdir = self._save_model_dir - outname = os.path.join(outdir, "rank%d_shot%d_ITER%d.pkl" % (COMM.rank, self._i_shot, self.iterations)) + outname = os.path.join(outdir, "rank%d_shot%d_EVAL%d_ITER%d.pkl" % (COMM.rank, self._i_shot, self.target_eval_count, self.iterations)) df.to_pickle(outname) def _save_Zscore_data(self): - if not self.iterations % self.saveZ_freq == 0: + if self.saveZ_freq is None or not self.target_eval_count % self.saveZ_freq == 0: return outdir = os.path.join(self.Zdir, "rank%d_Zscore" % self.rank) if not os.path.exists(outdir): os.makedirs(outdir) - fname = os.path.join(outdir, "sigZ_iter%d_rank%d" % (self.iterations, self.rank)) + fname = os.path.join(outdir, "sigZ_eval%d_iter%d_rank%d" % (self.target_eval_count, self.iterations, self.rank)) np.save(fname, np.array(self._shot_Zscores, object)) def _sanity_check_grad(self): @@ -827,6 +873,7 @@ def _Fcell_derivatives(self): if not self.refine_Fcell: return MOD = self.Modelers[self._i_shot] + dumps = [] for i_fcell in MOD.unique_i_fcell: multi = self.hkl_frequency[i_fcell] @@ -836,6 +883,7 @@ def _Fcell_derivatives(self): xpos = self.fcell_xstart + i_fcell Famp = self._fcell_at_i_fcell[i_fcell] sig = 1 + for slc in MOD.i_fcell_slices[i_fcell]: self.fcell_dI_dtheta = self.fcell_deriv[slc] @@ -852,7 +900,9 @@ def _Fcell_derivatives(self): trust = MOD.all_trusted[slc] # NOTE : no need to normalize Fhkl gradients by the overlap rate - they should arise from different HKLs #freq = MOD.all_freq[slc] # pixel frequency (1 is no overlaps) - self.grad[xpos] += (g_accum[trust].sum())*.5 + dump = (g_accum[trust].sum())*.5 + self.grad[xpos] += dump + dumps.append(dump) if self.calc_curvatures: raise NotImplementedError("No curvature for Fcell refinement") @@ -985,11 +1035,11 @@ def _print_iteration_header(self): if self.use_curvatures: LOGGER.info( - "%s%s%s%s\nTrial%d (%s): Compute functional and gradients Iter %d %s(Using Curvatures)%s\n%s%s%s%s" - % (Bcolors.HEADER, border,border,border, self.trial_id + 1, refine_str, self.iterations + 1, Bcolors.OKGREEN, Bcolors.HEADER, border,border,border, Bcolors.ENDC)) + "%s%s%s%s\nTrial%d (%s): Compute functional and gradients eval %d %s(Using Curvatures)%s\n%s%s%s%s" + % (Bcolors.HEADER, border,border,border, self.trial_id + 1, refine_str, self.target_eval_count + 1, Bcolors.OKGREEN, Bcolors.HEADER, border,border,border, Bcolors.ENDC)) else: - LOGGER.info("%s%s%s%s\n, Trial%d (%s): Compute functional and gradients Iter %d PosCurva %d\n%s%s%s%s" - % (Bcolors.HEADER, border, border, border, self.trial_id + 1, refine_str, self.iterations + 1, self.num_positive_curvatures, border, border,border, Bcolors.ENDC)) + LOGGER.info("%s%s%s%s\n, Trial%d (%s): Compute functional and gradients eval %d PosCurva %d\n%s%s%s%s" + % (Bcolors.HEADER, border, border, border, self.trial_id + 1, refine_str, self.target_eval_count + 1, self.num_positive_curvatures, border, border,border, Bcolors.ENDC)) def _save_optimized_gain_map(self): if not self.params.refiner.refine_gain_map: @@ -1006,7 +1056,7 @@ def _save_optimized_gain_map(self): def _MPI_save_state_of_refiner(self): if self.I_AM_ROOT and self.output_dir is not None and self.refine_Fcell: - outf = os.path.join(self.output_dir, "_fcell_trial%d_iter%d" % (self.trial_id, self.iterations)) + outf = os.path.join(self.output_dir, "_fcell_trial%d_eval%d_iter%d" % (self.trial_id, self.target_eval_count, self.iterations)) np.savez(outf, 
fvals=self._fcell_at_i_fcell) def _target_accumulate(self): @@ -1071,8 +1121,11 @@ def _evaluate_log_averageI_plus_sigma_readout(self): self.log_v = np.log(v) self.log_v[v <= 0] = 0 # but will I ever negative_model ? - def get_refined_Bmatrix(self, i_shot): - return self.Modelers[i_shot].PAR.ucell_man.B_recipspace + def get_refined_Bmatrix(self, i_shot, recip=False): + if recip: + return self.Modelers[i_shot].PAR.ucell_man.B_recipspace + else: + return self.Modelers[i_shot].PAR.ucell_man.B_realspace def curvatures(self): return self.curv diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp index 8d2915db35..f07b7e99c2 100644 --- a/simtbx/diffBragg/src/diffBragg.cpp +++ b/simtbx/diffBragg/src/diffBragg.cpp @@ -537,6 +537,7 @@ void diffBragg::update_dxtbx_geoms( double panel_rot_angS, double panel_offsetX, double panel_offsetY, double panel_offsetZ, bool force){ + db_cu_flags.update_detector = true; int old_verbose = verbose; verbose = 0; @@ -714,12 +715,18 @@ void diffBragg::update_dxtbx_geoms( SCITBX_ASSERT(close_distance > 0); verbose = old_verbose; set_close_distances(); + db_cu_flags.update_panel_deriv_vecs=true; + db_cu_flags.update_detector=true; } void diffBragg::shift_originZ(const dxtbx::model::Detector& detector, double shiftZ){ - for (int pid=0; pid< detector.size(); pid++) - db_det.pix0_vectors[pid*3 + 2] = detector[pid].get_origin()[2]/1000.0 + shiftZ; - set_close_distances(); + if (shiftZ != prev_shiftZ){ + for (int pid=0; pid< detector.size(); pid++) + db_det.pix0_vectors[pid*3 + 2] = detector[pid].get_origin()[2]/1000.0 + shiftZ; + set_close_distances(); + db_cu_flags.update_detector=true; + prev_shiftZ = shiftZ; + } } void diffBragg::init_raw_pixels_roi(){ @@ -749,7 +756,7 @@ void diffBragg::initialize_managers(){ pan_orig = boost::dynamic_pointer_cast(panels[1+i_pan_orig]); if (pan_orig->refine_me){ pan_orig->initialize(Npix_total, compute_curvatures); - update_detector_on_device=true; + db_cu_flags.update_detector=true; } } @@ -757,7 +764,8 @@ void diffBragg::initialize_managers(){ fcell_managers[0]->initialize(Npix_total, compute_curvatures); //fcell_managers[1]->initialize(Npix_total, compute_curvatures); //fcell_managers[2]->initialize(Npix_total, compute_curvatures); - update_Fhkl_on_device = true; + db_cu_flags.update_Fhkl = true; + db_cu_flags.update_Fhkl_scales = true; } for (int i_eta=0; i_eta<3; i_eta++){ @@ -785,7 +793,7 @@ void diffBragg::initialize_managers(){ pan_rot = boost::dynamic_pointer_cast(panels[manager_idx]); if (pan_rot->refine_me){ update_panel_deriv_vecs_on_device=true; - update_detector_on_device=true; + db_cu_flags.update_detector=true; pan_rot->initialize(Npix_total, compute_curvatures); } } @@ -797,6 +805,20 @@ void diffBragg::initialize_managers(){ } +af::shared +diffBragg::get_mosaic_blocks_prime() { + af::shared result; + int num_blocks = mosaic_domains; + // TODO if refining aniso eta, then num_blocks for the primes is 3*mosaic_domains, address this case + for(mos_tic=0;mos_tic 0){ @@ -1000,12 +1022,13 @@ void diffBragg::refine(int refine_id){ boost::shared_ptr pan_orig = boost::dynamic_pointer_cast(panels[1]); pan_orig->refine_me=true; pan_orig->initialize(Npix_total, compute_curvatures); - update_detector_on_device=true; + db_cu_flags.update_detector=true; } else if(refine_id==11){ fcell_managers[0]->refine_me=true; fcell_managers[0]->initialize(Npix_total, compute_curvatures); - update_Fhkl_on_device = true; + db_cu_flags.update_Fhkl = true; + db_cu_flags.update_Fhkl_scales = true; } else if 
(refine_id==12 || refine_id==13){ @@ -1020,7 +1043,7 @@ void diffBragg::refine(int refine_id){ pan_rot->refine_me=true; rotate_fs_ss_vecs_3D(0,0,0); pan_rot->initialize(Npix_total, compute_curvatures); - update_detector_on_device = true; + db_cu_flags.update_detector = true; update_panel_deriv_vecs_on_device = true; } @@ -1028,21 +1051,21 @@ void diffBragg::refine(int refine_id){ boost::shared_ptr pan_orig = boost::dynamic_pointer_cast(panels[2]); pan_orig->refine_me=true; pan_orig->initialize(Npix_total, compute_curvatures); - update_detector_on_device = true; + db_cu_flags.update_detector = true; } else if (refine_id==16){ boost::shared_ptr pan_orig = boost::dynamic_pointer_cast(panels[3]); pan_orig->refine_me=true; pan_orig->initialize(Npix_total, compute_curvatures); - update_detector_on_device = true; + db_cu_flags.update_detector = true; } else if (refine_id==17){ boost::shared_ptr pan_rot = boost::dynamic_pointer_cast(panels[4]); pan_rot->refine_me=true; rotate_fs_ss_vecs_3D(0,0,0); pan_rot->initialize(Npix_total, compute_curvatures); - update_detector_on_device = true; + db_cu_flags.update_detector = true; update_panel_deriv_vecs_on_device = true; } else if (refine_id==18){ @@ -1050,7 +1073,7 @@ void diffBragg::refine(int refine_id){ pan_rot->refine_me=true; rotate_fs_ss_vecs_3D(0,0,0); pan_rot->initialize(Npix_total, compute_curvatures); - update_detector_on_device = true; + db_cu_flags.update_detector = true; update_panel_deriv_vecs_on_device = true; } else if (refine_id==19){ @@ -1131,6 +1154,7 @@ void diffBragg::update_Fhkl_channels(np::ndarray& channels){ if (verbose) printf("source=%d channel_id=%d\n", i, channel_id); } + db_cu_flags.update_Fhkl_channels=true; } boost::python::list diffBragg::get_Fhkl_channels(){ @@ -1198,6 +1222,7 @@ void diffBragg::quick_Fcell_update(boost::python::tuple const& value){ } //update_linear_Fhkl(); if(verbose) printf("done with quick update of Fhkl:\n"); + db_cu_flags.update_Fhkl=true; } @@ -1797,6 +1822,7 @@ void diffBragg::update_Fhkl_scale_factors(np::ndarray& scale_factors, int num_Fh bool init_scales=first_deriv_imgs.Fhkl_scale.empty(); db_flags.Fhkl_have_scale_factors = true; db_cryst.num_Fhkl_channels=num_Fhkl_channels; + db_cu_flags.update_Fhkl_scales=true; double* scale_ptr = reinterpret_cast(scale_factors.get_data()); for(int i_chan=0; i_chan< num_Fhkl_channels; i_chan++){ @@ -1843,41 +1869,55 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows int pan_rot_ids[3] = {0,4,5}; int pan_orig_ids[3] = {1,2,3}; - db_flags.refine_panel_rot.resize(3, false); - db_flags.refine_panel_origin.resize(3,false); - db_flags.refine_lambda.resize(2,false); - db_flags.refine_Bmat.resize(6,false); - db_flags.refine_Umat.resize(3,false); - db_flags.refine_Ncells.resize(3,false); + if (db_flags.refine_panel_rot.empty()){ // if one is empty, they're all empty + db_flags.refine_panel_rot.resize(3, false); + db_flags.refine_panel_origin.resize(3,false); + db_flags.refine_lambda.resize(2,false); + db_flags.refine_Bmat.resize(6,false); + db_flags.refine_Umat.resize(3,false); + db_flags.refine_Ncells.resize(3,false); + db_cu_flags.update_refine_flags=true; + } for(int i_pan=0;i_pan < 3; i_pan++){ int i_pan_rot = pan_rot_ids[i_pan]; int i_pan_orig = pan_orig_ids[i_pan]; - if (panels[i_pan_rot]->refine_me) - db_flags.refine_panel_rot[i_pan] = true; - if (panels[i_pan_orig]-> refine_me) - db_flags.refine_panel_origin[i_pan] = true; + if (panels[i_pan_rot]->refine_me != db_flags.refine_panel_rot[i_pan]){ + db_flags.refine_panel_rot[i_pan] = 
panels[i_pan_rot]->refine_me; + db_cu_flags.update_refine_flags=true; + } + if (panels[i_pan_orig]-> refine_me != db_flags.refine_panel_origin[i_pan_orig]){ + db_flags.refine_panel_origin[i_pan] = panels[i_pan_orig]->refine_me; + db_cu_flags.update_refine_flags=true; + } } for (int i_uc = 0; i_uc < 6; i_uc++){ - if (ucell_managers[i_uc]->refine_me) - db_flags.refine_Bmat[i_uc] = true; + if (ucell_managers[i_uc]->refine_me != db_flags.refine_Bmat[i_uc]){ + db_flags.refine_Bmat[i_uc] = ucell_managers[i_uc]->refine_me; + db_cu_flags.update_refine_flags=true; + } } for (int i_rot =0; i_rot< 3; i_rot ++){ - if (rot_managers[i_rot]->refine_me) - db_flags.refine_Umat[i_rot] = true; + if (rot_managers[i_rot]->refine_me != db_flags.refine_Umat[i_rot]){ + db_flags.refine_Umat[i_rot] = rot_managers[i_rot]->refine_me; + db_cu_flags.update_refine_flags=true; + } } for (int i_lam=0; i_lam< 2; i_lam++){ - if (lambda_managers[i_lam]->refine_me) - db_flags.refine_lambda[i_lam] = true; + if (lambda_managers[i_lam]->refine_me != db_flags.refine_lambda[i_lam]){ + db_flags.refine_lambda[i_lam] = lambda_managers[i_lam]->refine_me; + db_cu_flags.update_refine_flags=true; + } } - if (Ncells_managers[0]->refine_me){ - db_flags.refine_Ncells[0] = true; - if (! isotropic_ncells){ - db_flags.refine_Ncells[1] = true; - db_flags.refine_Ncells[2] = true; + for (int i_nc=0; i_nc<3; i_nc++){ + if (isotropic_ncells && i_nc >0) + continue; + if (Ncells_managers[i_nc]->refine_me != db_flags.refine_Ncells[i_nc]){ + db_flags.refine_Ncells[i_nc] = Ncells_managers[i_nc]->refine_me; + db_cu_flags.update_refine_flags=true; } } @@ -2026,7 +2066,7 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows //fudge = 1.1013986013; // from manuscript computation gettimeofday(&t1,0 ); - if ((! use_cuda && getenv("DIFFBRAGG_USE_CUDA")==NULL && getenv("DIFFBRAGG_USE_KOKKOS")==NULL ) || force_cpu){ + if ((! 
use_gpu && getenv("DIFFBRAGG_USE_CUDA")==NULL && getenv("DIFFBRAGG_USE_KOKKOS")==NULL ) || force_cpu){ diffBragg_sum_over_steps( Npix_to_model, panels_fasts_slows_vec, image, @@ -2044,16 +2084,18 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows db_cu_flags.device_Id = device_Id; db_cu_flags.update_step_positions = update_step_positions_on_device; db_cu_flags.update_panels_fasts_slows = update_panels_fasts_slows_on_device; - db_cu_flags.update_sources = update_sources_on_device; db_cu_flags.update_umats = update_umats_on_device; db_cu_flags.update_dB_mats = update_dB_matrices_on_device; db_cu_flags.update_rotmats = update_rotmats_on_device; - db_cu_flags.update_Fhkl = update_Fhkl_on_device; - db_cu_flags.update_detector = update_detector_on_device; - db_cu_flags.update_refine_flags = update_refine_flags_on_device; + //db_cu_flags.update_Fhkl = update_Fhkl_on_device; db_cu_flags.update_panel_deriv_vecs = update_panel_deriv_vecs_on_device; db_cu_flags.Npix_to_allocate = Npix_to_allocate; + bool use_cuda = false; +#ifdef DIFFBRAGG_HAVE_CUDA + use_cuda = use_gpu; +#endif + if (use_cuda || getenv("DIFFBRAGG_USE_CUDA")!=NULL){ #ifdef DIFFBRAGG_HAVE_CUDA diffBragg_sum_over_steps_cuda( @@ -2076,7 +2118,7 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows SCITBX_ASSERT(DIFFBRAGG_USE_CUDA_flag_unsupported); #endif } - else { + else if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){ #ifdef DIFFBRAGG_HAVE_KOKKOS if (!diffBragg_runner) { diffBragg_runner = std::make_shared(); @@ -2097,10 +2139,10 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows if (verbose) printf("Ran the Kokkos kernel\n"); #else - bool DIFFBRAGG_USE_KOKKOS_flag_unsupported=false; - SCITBX_ASSERT(DIFFBRAGG_USE_KOKKOS_flag_unsupported); + bool DIFFBRAGG_USE_KOKKOS_flag_unsupported=false; + SCITBX_ASSERT(DIFFBRAGG_USE_KOKKOS_flag_unsupported); #endif - } + } last_kernel_on_GPU=true; #else bool DIFFBRAGG_USE_KOKKOS_and_DIFFBRAGG_USE_CUDA_flags_unsupported=false; @@ -2116,7 +2158,7 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows printf("Nsteps=%d\noversample=%d\ndet_thick_steps=%d\nsources=%d\nphisteps=%d\nmosaic_domains=%d\n", db_steps.Nsteps,oversample,detector_thicksteps,sources,phisteps,mosaic_domains); printf("DIFFBRAGG isotropic Ncells=%d\n", isotropic_ncells); - if(use_cuda || getenv("DIFFBRAGG_USE_CUDA")!= NULL) + if(use_gpu || getenv("DIFFBRAGG_USE_CUDA")!= NULL || getenv("DIFFBRAGG_USE_KOKKOS")!= NULL) printf("TIME TO RUN DIFFBRAGG -GPU- (%llu iterations): %3.10f ms \n",n_total_iter, time); else printf("TIME TO RUN DIFFBRAGG -CPU- (%llu iterations): %3.10f ms \n",n_total_iter, time); @@ -2383,10 +2425,24 @@ void diffBragg::show_timing_stats(int MPI_RANK){ //}, boost_adaptbx::python::str printf("RANK%d TIMINGS: add_diffBragg_spots pre kernel wrapper: %10.3f\n", MPI_RANK, TIMERS.add_spots_pre ); printf("RANK%d TIMINGS: add_diffBragg_spots post kernel wrapper: %10.3f\n", MPI_RANK , TIMERS.add_spots_post); printf("RANK%d TIMINGS: add_diffBragg_spots kernel wrapper: %10.3f\n", MPI_RANK, TIMERS.add_spots_kernel_wrapper ); - printf("RANK%d TIMINGS: add_diffBragg_spots CUDA alloc: %10.3f\n", MPI_RANK, TIMERS.cuda_alloc ); - printf("RANK%d TIMINGS: add_diffBragg_spots CUDA copy host to dev: %10.3f\n", MPI_RANK, TIMERS.cuda_copy_to_dev ); - printf("RANK%d TIMINGS: add_diffBragg_spots CUDA copy dev to host: %10.3f\n", MPI_RANK, TIMERS.cuda_copy_from_dev ); - printf("RANK%d TIMINGS: add_diffBragg_spots CUDA kernel: %10.3f\n", 
MPI_RANK, TIMERS.cuda_kernel ); + printf("RANK%d TIMINGS: add_diffBragg_spots device alloc: %10.3f\n", MPI_RANK, TIMERS.cuda_alloc ); + printf("RANK%d TIMINGS: add_diffBragg_spots copy host-to-dev: %10.3f\n", MPI_RANK, TIMERS.cuda_copy_to_dev ); + + printf("RANK%d TIMINGS: host-to-dev Fhkl scales: %10.3f\n", MPI_RANK, TIMERS.copy_Fhkl_scale ); + printf("RANK%d TIMINGS: host-to-dev sources: %10.3f\n", MPI_RANK, TIMERS.copy_sources ); + printf("RANK%d TIMINGS: host-to-dev umats: %10.3f\n", MPI_RANK, TIMERS.copy_umats ); + printf("RANK%d TIMINGS: host-to-dev amats: %10.3f\n", MPI_RANK, TIMERS.copy_amats ); + printf("RANK%d TIMINGS: host-to-dev bmats: %10.3f\n", MPI_RANK, TIMERS.copy_bmats ); + printf("RANK%d TIMINGS: host-to-dev rotmats: %10.3f\n", MPI_RANK, TIMERS.copy_rotmats ); + printf("RANK%d TIMINGS: host-to-dev det: %10.3f\n", MPI_RANK, TIMERS.copy_det ); + printf("RANK%d TIMINGS: host-to-dev nomhkl: %10.3f\n", MPI_RANK, TIMERS.copy_nomhkl ); + printf("RANK%d TIMINGS: host-to-dev flags: %10.3f\n", MPI_RANK, TIMERS.copy_flags ); + printf("RANK%d TIMINGS: host-to-dev fhkl: %10.3f\n", MPI_RANK, TIMERS.copy_fhkl ); + printf("RANK%d TIMINGS: host-to-dev detderiv: %10.3f\n", MPI_RANK, TIMERS.copy_detderiv ); + printf("RANK%d TIMINGS: host-to-dev pfs: %10.3f\n", MPI_RANK, TIMERS.copy_pfs ); + + printf("RANK%d TIMINGS: add_diffBragg_spots copy dev-to-host: %10.3f\n", MPI_RANK, TIMERS.cuda_copy_from_dev ); + printf("RANK%d TIMINGS: add_diffBragg_spots device kernel: %10.3f\n", MPI_RANK, TIMERS.cuda_kernel ); printf("RANK%d TIMINGS: Total kernel calls=%d\n", MPI_RANK, TIMERS.timings ); } else printf("RANK%d No timing has occured since instantiation of diffBragg\n", MPI_RANK); diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h index 36ee6c0afb..aa4983792f 100644 --- a/simtbx/diffBragg/src/diffBragg.h +++ b/simtbx/diffBragg/src/diffBragg.h @@ -139,6 +139,8 @@ class diffBragg: public nanoBragg{ ~diffBragg(){}; + af::shared get_mosaic_blocks_prime(); + /// pixels double* floatimage_roi; af::flex_double raw_pixels_roi; @@ -164,12 +166,21 @@ class diffBragg: public nanoBragg{ #ifdef DIFFBRAGG_HAVE_KOKKOS // diffBragg_cudaPointers cuda_pointers; - void kokkos_free() { diffBragg_runner.reset(); } + inline void kokkos_free() { diffBragg_runner.reset(); } // allocate when needed to avoid problems with kokkos initialization when cuda/kokkos isn't used std::shared_ptr diffBragg_runner{}; // diffBraggKOKKOS diffBragg_runner; #endif + inline void gpu_free(){ +#ifdef DIFFBRAGG_HAVE_CUDA + cuda_free(); +#endif +#ifdef DIFFBRAGG_HAVE_KOKKOS + kokkos_free(); +#endif + } + // methods void update_xray_beams(scitbx::af::versa > const& value); void initialize_managers(); @@ -299,6 +310,7 @@ class diffBragg: public nanoBragg{ double Nd, Ne, Nf; bool refine_Ncells_def; bool no_Nabc_scale; // if true, then absorb the Nabc scale into an overall scale factor + double prev_shiftZ=0; // keep track of when detector Z was shifted (helps determine when to set the update_detector flag for GPU devices Eigen::Matrix3d NABC; bool use_lambda_coefficients; @@ -311,12 +323,11 @@ class diffBragg: public nanoBragg{ bool update_rotmats_on_device=false; bool update_umats_on_device=false; bool update_panels_fasts_slows_on_device=false; - bool update_sources_on_device=false; bool update_Fhkl_on_device=false; bool update_refine_flags_on_device=false; bool update_step_positions_on_device=false; bool update_panel_deriv_vecs_on_device=false; - bool use_cuda=false; + bool use_gpu=false; bool force_cpu=false; int 
Npix_to_allocate=-1; // got GPU allocation, -1 is auto mode diff --git a/simtbx/diffBragg/src/diffBraggCUDA.cu b/simtbx/diffBragg/src/diffBraggCUDA.cu index 6b0cddd2ff..b318a189e5 100644 --- a/simtbx/diffBragg/src/diffBraggCUDA.cu +++ b/simtbx/diffBragg/src/diffBraggCUDA.cu @@ -444,6 +444,12 @@ void diffBragg_sum_over_steps_cuda( if (db_cryst.fpfdp.size() == 0){ // note cannot use atom data if fpfdp is 0, make this cleaner num_atoms=0; } + + if (db_flags.use_diffuse) { + if (db_cryst.laue_group_num < 1 || db_cryst.laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + } //int sm_size = number_of_sources*5*sizeof(CUDAREAL); //gpu_sum_over_steps<<>>( bool aniso_eta = db_cryst.UMATS_RXYZ.size() != db_cryst.UMATS_RXYZ_prime.size(); diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp index f49508ed39..fb88a6f066 100644 --- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp +++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp @@ -4,6 +4,45 @@ #include "diffBraggKOKKOS.h" #include "diffBragg_kokkos_kernel.h" +#define PRINTOUT(flag, function, ...) \ + if (flag) { \ + function(__VA_ARGS__); \ + } else { \ + function(__VA_ARGS__); \ + } \ + +uint32_t combine_refinement_flags(flags& db_flags) { + uint32_t refine_flag = 0; + refine_flag |= db_flags.refine_diffuse * REFINE_DIFFUSE; + refine_flag |= db_flags.refine_fcell * REFINE_FCELL; + refine_flag |= db_flags.refine_eta * REFINE_ETA; + refine_flag |= db_flags.refine_Umat[0] * REFINE_UMAT1; + refine_flag |= db_flags.refine_Umat[1] * REFINE_UMAT2; + refine_flag |= db_flags.refine_Umat[2] * REFINE_UMAT3; + refine_flag |= db_flags.refine_Ncells_def * REFINE_NCELLS_DEF; + refine_flag |= db_flags.refine_Ncells[0] * REFINE_NCELLS1; + refine_flag |= db_flags.refine_Ncells[1] * REFINE_NCELLS2; + refine_flag |= db_flags.refine_Ncells[2] * REFINE_NCELLS3; + refine_flag |= db_flags.refine_panel_rot[0] * REFINE_PANEL_ROT1; + refine_flag |= db_flags.refine_panel_rot[1] * REFINE_PANEL_ROT2; + refine_flag |= db_flags.refine_panel_rot[2] * REFINE_PANEL_ROT3; + refine_flag |= db_flags.refine_panel_origin[0] * REFINE_PANEL_ORIGIN1; + refine_flag |= db_flags.refine_panel_origin[1] * REFINE_PANEL_ORIGIN2; + refine_flag |= db_flags.refine_panel_origin[2] * REFINE_PANEL_ORIGIN3; + refine_flag |= db_flags.refine_lambda[0] * REFINE_LAMBDA1; + refine_flag |= db_flags.refine_lambda[1] * REFINE_LAMBDA2; + refine_flag |= db_flags.refine_Bmat[0] * REFINE_BMAT1; + refine_flag |= db_flags.refine_Bmat[1] * REFINE_BMAT2; + refine_flag |= db_flags.refine_Bmat[2] * REFINE_BMAT3; + refine_flag |= db_flags.refine_Bmat[3] * REFINE_BMAT4; + refine_flag |= db_flags.refine_Bmat[4] * REFINE_BMAT5; + refine_flag |= db_flags.refine_Bmat[5] * REFINE_BMAT6; + refine_flag |= db_flags.refine_fp_fdp * REFINE_FP_FDP; + refine_flag |= db_flags.refine_Icell * REFINE_ICELL; + + return refine_flag; +} + void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( int Npix_to_model, std::vector& panels_fasts_slows, @@ -20,7 +59,7 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( timer_variables& TIMERS) { if (db_cryst.phi0 != 0 || db_cryst.phisteps > 1) { printf( - "PHI (goniometer position) not supported in GPU code: phi0=%f phisteps=%d, phistep=%d\n", + "PHI (goniometer position) not supported in GPU code: phi0=%f phisteps=%d, phistep=%f\n", db_cryst.phi0, db_cryst.phisteps, db_cryst.phistep); exit(-1); } @@ -33,7 +72,7 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( Kokkos::Tools::popRegion(); double time; - struct timeval t1, 
t2; //, t3 ,t4; + struct timeval t1, t; // t1 times larger blocks of code, and t is used to time shorter blocks of code gettimeofday(&t1, 0); // determine if we need to allocate pixels, and how many. @@ -201,18 +240,19 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( resize(m_panels_fasts_slows, db_cu_flags.Npix_to_allocate * 3); + m_refine_flag = combine_refinement_flags(db_flags); + if (m_refine_flag) { + resize(m_manager_dI, db_cu_flags.Npix_to_allocate); + resize(m_manager_dI2, db_cu_flags.Npix_to_allocate); + } + m_npix_allocated = db_cu_flags.Npix_to_allocate; Kokkos::Tools::popRegion(); } // END of allocation bool ALLOC = !m_device_is_allocated; // shortcut variable - gettimeofday(&t2, 0); - time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0; - if (TIMERS.recording) - TIMERS.cuda_alloc += time; - if (db_flags.verbose > 1) - printf("TIME SPENT ALLOCATING (TOTAL): %3.10f ms \n", time); + easy_time(TIMERS.cuda_alloc, t1, TIMERS.recording); //, db_flags.verbose > 1); // ALLOC = false; // BEGIN COPYING DATA @@ -221,33 +261,36 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( // END step position if (db_flags.Fhkl_gradient_mode){ - transfer(m_data_residual, d_image.residual, Npix_to_model); - transfer(m_data_variance, d_image.variance, Npix_to_model); - transfer(m_data_trusted, d_image.trusted, Npix_to_model); - transfer(m_data_freq, d_image.freq, Npix_to_model); + kokkostbx::transfer_vector2kokkos(m_data_residual, d_image.residual); + kokkostbx::transfer_vector2kokkos(m_data_variance, d_image.variance); + kokkostbx::transfer_vector2kokkos(m_data_trusted, d_image.trusted); + kokkostbx::transfer_vector2kokkos(m_data_freq, d_image.freq); } if (db_flags.Fhkl_have_scale_factors && ALLOC){ - transfer(m_FhklLinear_ASUid, db_cryst.FhklLinear_ASUid); + kokkostbx::transfer_vector2kokkos(m_FhklLinear_ASUid, db_cryst.FhklLinear_ASUid); } + Kokkos::Tools::pushRegion("BEGIN Fhkl have scale factors"); + gettimeofday(&t, 0); if (db_flags.Fhkl_have_scale_factors){ - //SCITBX_ASSERT(db_beam.number_of_sources == db_beam.Fhkl_channels.size()); - transfer(m_Fhkl_channels, db_beam.Fhkl_channels); - transfer(m_Fhkl_scale, d_image.Fhkl_scale); + if (db_cu_flags.update_Fhkl_scales || ALLOC){ + kokkostbx::transfer_vector2kokkos(m_Fhkl_scale, d_image.Fhkl_scale); + db_cu_flags.update_Fhkl_scales = false; + } + if (db_cu_flags.update_Fhkl_channels || ALLOC){ + kokkostbx::transfer_vector2kokkos(m_Fhkl_channels, db_beam.Fhkl_channels); + db_cu_flags.update_Fhkl_channels = false; + } ::Kokkos::deep_copy(m_Fhkl_scale_deriv, 0); - - // for (int i=0; i < d_image.Fhkl_scale.size(); i++){ - // cp.Fhkl_scale[i] = d_image.Fhkl_scale[i]; - // if (db_flags.Fhkl_gradient_mode){ - // cp.Fhkl_scale_deriv[i] = 0; - // } - // } } + Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_Fhkl_scale, t, TIMERS.recording); //, db_flags.verbose > 1); // BEGIN sources Kokkos::Tools::pushRegion("BEGIN sources"); - if (db_cu_flags.update_sources || ALLOC || FORCE_COPY) { + gettimeofday(&t, 0); + if (db_cu_flags.update_sources || ALLOC) { int source_count = local_beam.number_of_sources; kokkostbx::transfer_double2kokkos(m_source_X, local_beam.source_X, source_count); kokkostbx::transfer_double2kokkos(m_source_Y, local_beam.source_Y, source_count); @@ -268,12 +311,15 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( Kokkos::fence(); if (db_flags.verbose > 1) printf("H2D sources\n"); + db_cu_flags.update_sources = false; } + easy_time(TIMERS.copy_sources, t, TIMERS.recording); //, 
db_flags.verbose > 1); Kokkos::Tools::popRegion(); // END sources // UMATS + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("UMATS"); if (db_cu_flags.update_umats || ALLOC || FORCE_COPY) { transfer_KOKKOS_MAT3(m_UMATS, db_cryst.UMATS); @@ -285,18 +331,24 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( printf("H2D Done copying Umats\n"); } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_umats, t, TIMERS.recording); //, db_flags.verbose > 1); // END UMATS + gettimeofday(&t, 0); if (db_cu_flags.update_umats || ALLOC || FORCE_COPY) { auto Amat_init = db_cryst.eig_U*db_cryst.eig_B*1e10*(db_cryst.eig_O.transpose()); + auto host_AMATS = Kokkos::create_mirror_view(m_AMATS); for (int i=0; i 1) printf("H2D Done copying Amats\n"); } + easy_time(TIMERS.copy_amats, t, TIMERS.recording); // BMATS + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("BMATS"); if (db_cu_flags.update_dB_mats || ALLOC || FORCE_COPY) { transfer_KOKKOS_MAT3(m_dB_Mats, db_cryst.dB_Mats); @@ -305,9 +357,11 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( printf("H2D Done copying dB_Mats\n"); } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_bmats, t, TIMERS.recording); // END BMATS // ROT MATS + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("ROT MATS"); if (db_cu_flags.update_rotmats || ALLOC || FORCE_COPY) { transfer_KOKKOS_MAT3(m_RotMats, db_cryst.RotMats); @@ -317,11 +371,13 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( printf("H2D Done copying rotmats\n"); } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_rotmats, t, TIMERS.recording); // END ROT MATS // DETECTOR VECTORS + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("DETECTOR VECTORS"); - if (db_cu_flags.update_detector || ALLOC || FORCE_COPY) { + if (db_cu_flags.update_detector || ALLOC) { kokkostbx::transfer_vector2kokkos(m_fdet_vectors, local_det.fdet_vectors); kokkostbx::transfer_vector2kokkos(m_sdet_vectors, local_det.sdet_vectors); kokkostbx::transfer_vector2kokkos(m_odet_vectors, local_det.odet_vectors); @@ -329,25 +385,30 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( kokkostbx::transfer_vector2kokkos(m_close_distances, local_det.close_distances); if (db_flags.verbose > 1) printf("H2D Done copying detector vectors\n"); + db_cu_flags.update_detector = false; } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_det, t, TIMERS.recording); // END DETECTOR VECTORS + gettimeofday(&t, 0); if (ALLOC || FORCE_COPY) { - transfer(m_nominal_hkl, db_cryst.nominal_hkl); - transfer(m_atom_data, db_cryst.atom_data); + kokkostbx::transfer_vector2kokkos(m_nominal_hkl, db_cryst.nominal_hkl); + kokkostbx::transfer_vector2kokkos(m_atom_data, db_cryst.atom_data); if (db_flags.verbose > 1) printf("H2D Done copying atom data\n"); - transfer(m_fpfdp, db_cryst.fpfdp); - transfer(m_fpfdp_derivs, db_cryst.fpfdp_derivs); + kokkostbx::transfer_vector2kokkos(m_fpfdp, db_cryst.fpfdp); + kokkostbx::transfer_vector2kokkos(m_fpfdp_derivs, db_cryst.fpfdp_derivs); if (db_flags.verbose > 1) printf("H2D Done copying fprime and fdblprime\n"); } + easy_time(TIMERS.copy_nomhkl, t, TIMERS.recording); - // BEGIN REFINEMENT FLAGS - Kokkos::Tools::pushRegion("BEGIN REFINMENT FLAGS"); - if (db_cu_flags.update_refine_flags || ALLOC || FORCE_COPY) { + // BEGIN UPDATE REFINEMENT + gettimeofday(&t, 0); + Kokkos::Tools::pushRegion("BEGIN UPDATE REFINMENT"); + if (db_cu_flags.update_refine_flags || ALLOC) { kokkostbx::transfer_vector2kokkos(m_refine_Umat, db_flags.refine_Umat); kokkostbx::transfer_vector2kokkos(m_refine_Ncells, db_flags.refine_Ncells); 
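The easy_time(...) calls introduced throughout this file replace the repeated gettimeofday/subtract/accumulate blocks that the patch removes. The helper itself is not defined in this hunk; a minimal sketch consistent with the removed timing code (only the name and call signature are taken from the call sites, the body is an assumption) is:

    #include <sys/time.h>

    // Accumulate the elapsed milliseconds since `start` into `timer`
    // when timing is being recorded; mirrors the removed manual blocks.
    static void easy_time(double& timer, const struct timeval& start, bool recording) {
        struct timeval now;
        gettimeofday(&now, 0);
        double ms = (1000000.0 * (now.tv_sec - start.tv_sec) + now.tv_usec - start.tv_usec) / 1000.0;
        if (recording)
            timer += ms;
    }

Callers restart their own clock with gettimeofday(&t, 0) before each timed block, so the helper only needs to read the current time and accumulate.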
kokkostbx::transfer_vector2kokkos(m_refine_panel_origin, db_flags.refine_panel_origin); @@ -356,35 +417,44 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( kokkostbx::transfer_vector2kokkos(m_refine_Bmat, db_flags.refine_Bmat); if (db_flags.verbose > 1) printf("H2D Done copying refinement flags\n"); + db_cu_flags.update_refine_flags=false; } Kokkos::Tools::popRegion(); - // END REFINEMENT FLAGS + easy_time(TIMERS.copy_flags, t, TIMERS.recording); + // END UPDATE REFINEMENT // BEGIN Fhkl + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("Begin Fhkl"); - if (db_cu_flags.update_Fhkl || ALLOC || FORCE_COPY) { - transfer(m_Fhkl, db_cryst.FhklLinear); + if (db_cu_flags.update_Fhkl || ALLOC) { + kokkostbx::transfer_vector2kokkos(m_Fhkl, db_cryst.FhklLinear); if (db_flags.complex_miller) { - transfer(m_Fhkl2, db_cryst.Fhkl2Linear); + kokkostbx::transfer_vector2kokkos(m_Fhkl2, db_cryst.Fhkl2Linear); } if (db_flags.verbose > 1) printf("H2D Done copying step Fhkl\n"); + db_cu_flags.update_Fhkl = false; } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_fhkl, t, TIMERS.recording); // END Fhkl // BEGIN panel derivative vecs + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("BEGIN panel derivative vecs"); - if (db_cu_flags.update_panel_deriv_vecs || ALLOC || FORCE_COPY) { + if (db_cu_flags.update_panel_deriv_vecs || ALLOC) { kokkostbx::transfer_vector2kokkos(m_dF_vecs, local_det.dF_vecs); kokkostbx::transfer_vector2kokkos(m_dS_vecs, local_det.dS_vecs); if (db_flags.verbose > 1) printf("H2D Done copying step panel derivative vectors\n"); + db_cu_flags.update_panel_deriv_vecs=false; } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_detderiv, t, TIMERS.recording); // END panel derivative vecs // BEGIN panels fasts slows + gettimeofday(&t, 0); Kokkos::Tools::pushRegion("BEGIN panels fasts slows"); if (db_cu_flags.update_panels_fasts_slows || ALLOC || FORCE_COPY) { kokkostbx::transfer_vector2kokkos(m_panels_fasts_slows, panels_fasts_slows); @@ -392,14 +462,10 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( printf("H2D Done copying panels_fasts_slows\n"); } Kokkos::Tools::popRegion(); + easy_time(TIMERS.copy_pfs, t, TIMERS.recording); // END panels fasts slows - gettimeofday(&t2, 0); - time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0; - if (TIMERS.recording) - TIMERS.cuda_copy_to_dev += time; - if (db_flags.verbose > 1) - printf("TIME SPENT COPYING DATA HOST->DEV: %3.10f ms \n", time); + easy_time(TIMERS.cuda_copy_to_dev, t1, TIMERS.recording); m_device_is_allocated = true; ::Kokkos::fence("after copy to device"); @@ -412,61 +478,105 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( if (db_cryst.fpfdp.size() == 0) { num_atoms = 0; } - // int sm_size = number_of_sources*5*sizeof(CUDAREAL); - // gpu_sum_over_steps<<>>( + bool aniso_eta = db_cryst.UMATS_RXYZ.size() != db_cryst.UMATS_RXYZ_prime.size(); bool use_nominal_hkl = !db_cryst.nominal_hkl.empty(); - kokkos_sum_over_steps( - Npix_to_model, m_panels_fasts_slows, m_floatimage, m_wavelenimage, m_d_Umat_images, - m_d2_Umat_images, m_d_Bmat_images, m_d2_Bmat_images, m_d_Ncells_images, m_d2_Ncells_images, - m_d_fcell_images, m_d2_fcell_images, m_d_eta_images, m_d2_eta_images, m_d_lambda_images, - m_d2_lambda_images, m_d_panel_rot_images, m_d2_panel_rot_images, m_d_panel_orig_images, - m_d2_panel_orig_images, m_d_fp_fdp_images, db_steps.Nsteps, db_flags.printout_fpixel, - db_flags.printout_spixel, db_flags.printout, db_cryst.default_F, local_det.oversample, - db_flags.oversample_omega, 
local_det.subpixel_size, local_det.pixel_size, - local_det.detector_thickstep, local_det.detector_thick, m_close_distances, - local_det.detector_attnlen, local_det.detector_thicksteps, local_beam.number_of_sources, - db_cryst.phisteps, db_cryst.UMATS.size(), db_flags.use_lambda_coefficients, - local_beam.lambda0, local_beam.lambda1, to_mat3(db_cryst.eig_U), to_mat3(db_cryst.eig_O), - to_mat3(db_cryst.eig_B), to_mat3(db_cryst.RXYZ), m_dF_vecs, m_dS_vecs, m_UMATS_RXYZ, m_UMATS_RXYZ_prime, - m_UMATS_RXYZ_dbl_prime, m_RotMats, m_dRotMats, m_d2RotMats, m_UMATS, m_dB_Mats, m_dB2_Mats, - m_AMATS, m_source_X, m_source_Y, m_source_Z, m_source_lambda, m_source_I, - local_beam.kahn_factor, db_cryst.Na, db_cryst.Nb, db_cryst.Nc, db_cryst.Nd, - db_cryst.Ne, db_cryst.Nf, db_cryst.phi0, db_cryst.phistep, - to_vec3(db_cryst.spindle_vec), local_beam.polarization_axis, db_cryst.h_range, - db_cryst.k_range, db_cryst.l_range, db_cryst.h_max, db_cryst.h_min, - db_cryst.k_max, db_cryst.k_min, db_cryst.l_max, db_cryst.l_min, - db_cryst.dmin, db_cryst.fudge, db_flags.complex_miller, db_flags.verbose, - db_flags.only_save_omega_kahn, db_flags.isotropic_ncells, db_flags.compute_curvatures, - m_Fhkl, m_Fhkl2, m_refine_Bmat, m_refine_Ncells, db_flags.refine_Ncells_def, - m_refine_panel_origin, m_refine_panel_rot, db_flags.refine_fcell, m_refine_lambda, - db_flags.refine_eta, m_refine_Umat, m_fdet_vectors, m_sdet_vectors, m_odet_vectors, - m_pix0_vectors, db_flags.nopolar, db_flags.point_pixel, local_beam.fluence, - db_cryst.r_e_sqr, db_cryst.spot_scale, Npanels, aniso_eta, db_flags.no_Nabc_scale, - m_fpfdp, m_fpfdp_derivs, m_atom_data, num_atoms, db_flags.refine_fp_fdp, m_nominal_hkl, - use_nominal_hkl, to_mat3(db_cryst.anisoU), to_mat3(db_cryst.anisoG), to_mat3(db_cryst.rotate_principal_axes), - db_flags.use_diffuse, m_d_diffuse_gamma_images, m_d_diffuse_sigma_images, db_flags.refine_diffuse, - db_flags.gamma_miller_units, db_flags.refine_Icell, db_flags.wavelength_img, - db_cryst.laue_group_num, db_cryst.stencil_size, db_flags.Fhkl_gradient_mode, - db_flags.Fhkl_errors_mode, db_flags.using_trusted_mask, db_beam.Fhkl_channels.empty(), - db_flags.Fhkl_have_scale_factors, db_cryst.Num_ASU, - m_data_residual, m_data_variance, - m_data_freq, m_data_trusted, - m_FhklLinear_ASUid, - m_Fhkl_channels, - m_Fhkl_scale, m_Fhkl_scale_deriv - ); + if ((db_flags.printout==false) && + (db_flags.complex_miller==false) && + (db_flags.compute_curvatures==false) && + (m_refine_flag==REFINE_FCELL) && + (db_flags.use_diffuse==false) && + (db_flags.wavelength_img==false) && + (db_flags.Fhkl_gradient_mode==false) && + (db_flags.Fhkl_errors_mode==false) && + (db_flags.using_trusted_mask==false) && + (db_beam.Fhkl_channels.empty()==true) && + (db_flags.Fhkl_have_scale_factors)==false) { + kokkos_sum_over_steps( + Npix_to_model, m_panels_fasts_slows, m_floatimage, m_wavelenimage, m_d_Umat_images, + m_d2_Umat_images, m_d_Bmat_images, m_d2_Bmat_images, m_d_Ncells_images, m_d2_Ncells_images, + m_d_fcell_images, m_d2_fcell_images, m_d_eta_images, m_d2_eta_images, m_d_lambda_images, + m_d2_lambda_images, m_d_panel_rot_images, m_d2_panel_rot_images, m_d_panel_orig_images, + m_d2_panel_orig_images, m_d_fp_fdp_images, m_manager_dI, m_manager_dI2, db_steps.Nsteps, + db_flags.printout_fpixel, db_flags.printout_spixel, /*db_flags.printout,*/ db_cryst.default_F, + local_det.oversample, db_flags.oversample_omega, local_det.subpixel_size, local_det.pixel_size, + local_det.detector_thickstep, local_det.detector_thick, m_close_distances, + 
local_det.detector_attnlen, local_det.detector_thicksteps, local_beam.number_of_sources, + db_cryst.phisteps, (int) db_cryst.UMATS.size(), db_flags.use_lambda_coefficients, + local_beam.lambda0, local_beam.lambda1, to_mat3(db_cryst.eig_U), to_mat3(db_cryst.eig_O), + to_mat3(db_cryst.eig_B), to_mat3(db_cryst.RXYZ), m_dF_vecs, m_dS_vecs, m_UMATS_RXYZ, m_UMATS_RXYZ_prime, + m_UMATS_RXYZ_dbl_prime, m_RotMats, m_dRotMats, m_d2RotMats, m_UMATS, m_dB_Mats, m_dB2_Mats, + m_AMATS, m_source_X, m_source_Y, m_source_Z, m_source_lambda, m_source_I, + local_beam.kahn_factor, db_cryst.Na, db_cryst.Nb, db_cryst.Nc, db_cryst.Nd, + db_cryst.Ne, db_cryst.Nf, db_cryst.phi0, db_cryst.phistep, + to_vec3(db_cryst.spindle_vec), local_beam.polarization_axis, db_cryst.h_range, + db_cryst.k_range, db_cryst.l_range, db_cryst.h_max, db_cryst.h_min, + db_cryst.k_max, db_cryst.k_min, db_cryst.l_max, db_cryst.l_min, + db_cryst.dmin, db_cryst.fudge, /*db_flags.complex_miller,*/ db_flags.verbose, + db_flags.only_save_omega_kahn, db_flags.isotropic_ncells, /*db_flags.compute_curvatures,*/ + m_Fhkl, m_Fhkl2, /*m_refine_flag,*/ + m_fdet_vectors, m_sdet_vectors, m_odet_vectors, + m_pix0_vectors, db_flags.nopolar, db_flags.point_pixel, local_beam.fluence, + db_cryst.r_e_sqr, db_cryst.spot_scale, Npanels, aniso_eta, db_flags.no_Nabc_scale, + m_fpfdp, m_fpfdp_derivs, m_atom_data, num_atoms, m_nominal_hkl, + use_nominal_hkl, to_mat3(db_cryst.anisoU), to_mat3(db_cryst.anisoG), to_mat3(db_cryst.rotate_principal_axes), + /*db_flags.use_diffuse,*/ m_d_diffuse_gamma_images, m_d_diffuse_sigma_images, + db_flags.gamma_miller_units, /*db_flags.wavelength_img,*/ + db_cryst.laue_group_num, db_cryst.stencil_size, /*db_flags.Fhkl_gradient_mode,*/ + /*db_flags.Fhkl_errors_mode,*/ /*db_flags.using_trusted_mask,*/ /*db_beam.Fhkl_channels.empty(),*/ + /*db_flags.Fhkl_have_scale_factors,*/ db_cryst.Num_ASU, + m_data_residual, m_data_variance, + m_data_freq, m_data_trusted, + m_FhklLinear_ASUid, + m_Fhkl_channels, + m_Fhkl_scale, m_Fhkl_scale_deriv); + } else { + kokkos_sum_over_steps( + Npix_to_model, m_panels_fasts_slows, m_floatimage, m_wavelenimage, m_d_Umat_images, + m_d2_Umat_images, m_d_Bmat_images, m_d2_Bmat_images, m_d_Ncells_images, m_d2_Ncells_images, + m_d_fcell_images, m_d2_fcell_images, m_d_eta_images, m_d2_eta_images, m_d_lambda_images, + m_d2_lambda_images, m_d_panel_rot_images, m_d2_panel_rot_images, m_d_panel_orig_images, + m_d2_panel_orig_images, m_d_fp_fdp_images, m_manager_dI, m_manager_dI2, db_steps.Nsteps, + db_flags.printout_fpixel, db_flags.printout_spixel, db_flags.printout, db_cryst.default_F, + local_det.oversample, db_flags.oversample_omega, local_det.subpixel_size, local_det.pixel_size, + local_det.detector_thickstep, local_det.detector_thick, m_close_distances, + local_det.detector_attnlen, local_det.detector_thicksteps, local_beam.number_of_sources, + db_cryst.phisteps, db_cryst.UMATS.size(), db_flags.use_lambda_coefficients, + local_beam.lambda0, local_beam.lambda1, to_mat3(db_cryst.eig_U), to_mat3(db_cryst.eig_O), + to_mat3(db_cryst.eig_B), to_mat3(db_cryst.RXYZ), m_dF_vecs, m_dS_vecs, m_UMATS_RXYZ, m_UMATS_RXYZ_prime, + m_UMATS_RXYZ_dbl_prime, m_RotMats, m_dRotMats, m_d2RotMats, m_UMATS, m_dB_Mats, m_dB2_Mats, + m_AMATS, m_source_X, m_source_Y, m_source_Z, m_source_lambda, m_source_I, + local_beam.kahn_factor, db_cryst.Na, db_cryst.Nb, db_cryst.Nc, db_cryst.Nd, + db_cryst.Ne, db_cryst.Nf, db_cryst.phi0, db_cryst.phistep, + to_vec3(db_cryst.spindle_vec), local_beam.polarization_axis, db_cryst.h_range, + 
db_cryst.k_range, db_cryst.l_range, db_cryst.h_max, db_cryst.h_min, + db_cryst.k_max, db_cryst.k_min, db_cryst.l_max, db_cryst.l_min, + db_cryst.dmin, db_cryst.fudge, db_flags.complex_miller, db_flags.verbose, + db_flags.only_save_omega_kahn, db_flags.isotropic_ncells, db_flags.compute_curvatures, + m_Fhkl, m_Fhkl2, m_refine_flag, + m_fdet_vectors, m_sdet_vectors, m_odet_vectors, + m_pix0_vectors, db_flags.nopolar, db_flags.point_pixel, local_beam.fluence, + db_cryst.r_e_sqr, db_cryst.spot_scale, Npanels, aniso_eta, db_flags.no_Nabc_scale, + m_fpfdp, m_fpfdp_derivs, m_atom_data, num_atoms, m_nominal_hkl, + use_nominal_hkl, to_mat3(db_cryst.anisoU), to_mat3(db_cryst.anisoG), to_mat3(db_cryst.rotate_principal_axes), + db_flags.use_diffuse, m_d_diffuse_gamma_images, m_d_diffuse_sigma_images, + db_flags.gamma_miller_units, db_flags.wavelength_img, + db_cryst.laue_group_num, db_cryst.stencil_size, db_flags.Fhkl_gradient_mode, + db_flags.Fhkl_errors_mode, db_flags.using_trusted_mask, db_beam.Fhkl_channels.empty(), + db_flags.Fhkl_have_scale_factors, db_cryst.Num_ASU, + m_data_residual, m_data_variance, + m_data_freq, m_data_trusted, + m_FhklLinear_ASUid, + m_Fhkl_channels, + m_Fhkl_scale, m_Fhkl_scale_deriv + ); + } ::Kokkos::fence("after kernel call"); if (db_flags.verbose > 1) printf("KERNEL_COMPLETE gpu_sum_over_steps\n"); - gettimeofday(&t2, 0); - time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0; - if (TIMERS.recording) - TIMERS.cuda_kernel += time; - if (db_flags.verbose > 1) - printf("TIME SPENT(KERNEL): %3.10f ms \n", time); + easy_time(TIMERS.cuda_kernel, t1, TIMERS.recording); gettimeofday(&t1, 0); // COPY BACK FROM DEVICE @@ -482,12 +592,10 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( } if (db_flags.Fhkl_gradient_mode){ if (db_flags.Fhkl_errors_mode){ - for (int i=0; i < d_image.Fhkl_hessian.size(); i++) - d_image.Fhkl_hessian[i]= m_Fhkl_scale_deriv(i); + kokkostbx::transfer_kokkos2vector(d_image.Fhkl_hessian, m_Fhkl_scale_deriv); } else{ - for (int i=0; i < d_image.Fhkl_scale_deriv.size(); i++) - d_image.Fhkl_scale_deriv[i]= m_Fhkl_scale_deriv(i); + kokkostbx::transfer_kokkos2vector(d_image.Fhkl_scale_deriv, m_Fhkl_scale_deriv); } } if (std::count(db_flags.refine_Umat.begin(), db_flags.refine_Umat.end(), true) > 0) { @@ -526,12 +634,7 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( } Kokkos::Tools::popRegion(); - gettimeofday(&t2, 0); - time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0; - if (TIMERS.recording) - TIMERS.cuda_copy_from_dev += time; - if (db_flags.verbose > 1) - printf("TIME SPENT COPYING BACK : %3.10f ms \n", time); + easy_time(TIMERS.cuda_copy_from_dev, t1, TIMERS.recording); ::Kokkos::fence("After copy to host"); Kokkos::Tools::popRegion(); diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.h b/simtbx/diffBragg/src/diffBraggKOKKOS.h index cc074b938a..3aa07ae27b 100644 --- a/simtbx/diffBragg/src/diffBraggKOKKOS.h +++ b/simtbx/diffBragg/src/diffBraggKOKKOS.h @@ -9,9 +9,11 @@ #include "kokkostbx/kokkos_utils.h" #include "simtbx/diffBragg/src/util.h" #include "simtbx/diffBragg/src/util_kokkos.h" +#include "simtbx/diffBragg/src/diffBragg_refine_flag.h" using vector_vec3_t = view_1d_t; using vector_mat3_t = view_1d_t; +using vector_manager_t = view_1d_t; #define INTEGER_VIEW(varname) vector_int_t varname = vector_int_t(#varname, 0) #define CUDAREAL_VIEW(varname) vector_cudareal_t varname = vector_cudareal_t(#varname, 0) @@ -101,6 +103,7 @@ class diffBraggKOKKOS { MATRIX3_VIEW(m_sausages_U); 
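The single uint32_t refinement mask added to this class just below (m_refine_flag) caches the value built by combine_refinement_flags() defined earlier in diffBraggKOKKOS.cpp, replacing the separate refine_* boolean arguments the kernel used to take. Assuming the REFINE_* constants declared in diffBragg_refine_flag.h are distinct single-bit values (their definitions are not shown in this patch), the mask is built and queried roughly as in this sketch:

    #include <cstdint>

    // Hypothetical bit values; the real ones live in diffBragg_refine_flag.h.
    constexpr uint32_t REFINE_FCELL   = 1u << 0;
    constexpr uint32_t REFINE_DIFFUSE = 1u << 1;
    constexpr uint32_t REFINE_UMAT1   = 1u << 2;

    uint32_t build_mask(bool refine_fcell, bool refine_diffuse, bool refine_umat1) {
        uint32_t mask = 0;
        mask |= refine_fcell   * REFINE_FCELL;    // bool promotes to 0 or 1
        mask |= refine_diffuse * REFINE_DIFFUSE;
        mask |= refine_umat1   * REFINE_UMAT1;
        return mask;
    }

    // Inside the kernel a single bit test replaces a bool-array lookup, e.g.
    //   if (refine_flag & REFINE_DIFFUSE) { /* accumulate diffuse gradients */ }
    // and refine_flag == REFINE_FCELL identifies the common "F_cell only"
    // refinement case that is routed to the streamlined kernel call above.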
CUDAREAL_VIEW(m_sausages_scale); + uint32_t m_refine_flag = 0; vector_bool_t m_refine_Bmat = vector_bool_t("m_refine_Bmat", 6); vector_bool_t m_refine_Umat = vector_bool_t("m_refine_Umat", 3); vector_bool_t m_refine_Ncells = vector_bool_t("m_refine_Ncells", 3); @@ -108,6 +111,9 @@ class diffBraggKOKKOS { vector_bool_t m_refine_panel_rot = vector_bool_t("m_refine_panel_rot", 3); vector_bool_t m_refine_lambda = vector_bool_t("m_refine_lambda", 2); + vector_manager_t m_manager_dI = vector_manager_t("m_manager_dI", 0); + vector_manager_t m_manager_dI2 = vector_manager_t("m_manager_dI2", 0); + bool m_Fhkl_gradient_mode; bool m_using_trusted_mask; bool m_Fhkl_channels_empty; @@ -119,12 +125,13 @@ class diffBraggKOKKOS { INTEGER_VIEW(m_data_freq); // length is number of modeled pixels vector_bool_t m_data_trusted = vector_bool_t("m_data_trusted", 0); // length is number of modeled pixels INTEGER_VIEW(m_FhklLinear_ASUid); // length is number of ASU in FhklLinear - CUDAREAL_VIEW(m_Fhkl_channels); + INTEGER_VIEW(m_Fhkl_channels); // Fhkl_scale is dynamically copied each iteration // Fhkl_scale_deriv is set to 0 each iteration CUDAREAL_VIEW(m_Fhkl_scale); // length is (number of ASUin FhklLinear) *times* (number of Fhkl channels) CUDAREAL_VIEW(m_Fhkl_scale_deriv); // length is (number of ASUin FhklLinear) *times* (number of Fhkl channels) + public: void diffBragg_sum_over_steps_kokkos( int Npix_to_model, diff --git a/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp b/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp index decf9f9145..22dd70275a 100644 --- a/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp +++ b/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp @@ -311,6 +311,10 @@ void diffBragg_sum_over_steps( anisoG_local = db_cryst.anisoG; anisoU_local = db_cryst.anisoU; + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + num_laue_mats = gen_laue_mats(laue_group_num, laue_mats, db_cryst.rotate_principal_axes); for (int i_gam=0; i_gam<3; i_gam++){ dG_dgam[i_gam] << 0,0,0,0,0,0,0,0,0; @@ -913,6 +917,7 @@ void diffBragg_sum_over_steps( int nom_l=l0; //int f_cell_idx = 1; if (use_nominal_hkl){ + nom_h = db_cryst.nominal_hkl[i_pix*3]; nom_k = db_cryst.nominal_hkl[i_pix*3+1]; nom_l = db_cryst.nominal_hkl[i_pix*3+2]; diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp index 7d2ba180f1..997a6bea6d 100644 --- a/simtbx/diffBragg/src/diffBragg_ext.cpp +++ b/simtbx/diffBragg/src/diffBragg_ext.cpp @@ -23,6 +23,43 @@ namespace boost_python { namespace { return indices; } + static void set_beams(simtbx::nanoBragg::diffBragg& diffBragg, scitbx::af::versa > const& value) { + if(diffBragg.verbose>3) printf(" about to initialize sources\n"); + diffBragg.pythony_beams = value; + if(diffBragg.verbose>3) printf(" done\n"); + diffBragg.db_cu_flags.update_sources=true; + /* re-initialize source table from pythony array */ + diffBragg.init_sources(); + } + // TODO: point to the get_beams defined in simtbx/nanoBragg/nanoBragg_ext.cpp if possible.. 
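set_beams above flips db_cu_flags.update_sources before re-running init_sources(), and the Kokkos copy path earlier in this patch re-transfers the source table only when that flag (or the first allocation) is set, clearing it afterwards. The detector, Fhkl and refine-flag buffers follow the same lazy host-to-device pattern; a stripped-down sketch of the idea, with illustrative names rather than the real classes:

    #include <vector>

    // Lazy host-to-device sync: setters mark a buffer stale, the pre-kernel
    // sync copies it once and clears the flag until the next host-side change.
    struct DeviceCache {
        std::vector<double> host_source_X;  // host-side copy, edited from Python
        bool update_sources = false;        // set by setters such as set_beams
        bool allocated = false;             // first kernel launch copies everything

        void set_sources(const std::vector<double>& x) {
            host_source_X = x;
            update_sources = true;          // device copy is now stale
        }

        void sync_before_kernel() {
            if (update_sources || !allocated) {
                // e.g. kokkostbx::transfer_vector2kokkos(m_source_X, host_source_X);
                update_sources = false;     // copied; skip until next change
            }
            allocated = true;
        }
    };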
+ ///* table of sources, as dxtbx "beam"s */ + static scitbx::af::versa > get_beams(simtbx::nanoBragg::diffBragg& diffBragg) { + int i; + /* allocate new flex array */ +// scitbx::af::versa > diffBragg_pythony_beams; + diffBragg.pythony_beams = scitbx::af::versa >(); + /* make sure it is big enough to hold all sources */ + diffBragg.pythony_beams.resize(diffBragg.sources); + + /* polarization normal seems to be B vector */ + scitbx::vec3 Evector = scitbx::vec3(diffBragg.polar_vector[1],diffBragg.polar_vector[2],diffBragg.polar_vector[3]); + scitbx::vec3 Pvector = scitbx::vec3(diffBragg.beam_vector[1],diffBragg.beam_vector[2],diffBragg.beam_vector[3]); + scitbx::vec3 Bvector = Pvector.cross(Evector).normalize(); + + /* copy internal storage into the flex array */ + for(i=0;i(diffBragg.source_X[i],diffBragg.source_Y[i],diffBragg.source_Z[i])); + diffBragg.pythony_beams[i].set_wavelength(diffBragg.source_lambda[i]*1e10); + diffBragg.pythony_beams[i].set_flux(diffBragg.source_I[i]); + // how is this a fraction when it can be negative? (Kahn et al. 1982) + diffBragg.pythony_beams[i].set_polarization_fraction(diffBragg.polarization); + diffBragg.pythony_beams[i].set_polarization_normal(Bvector); + } + /* pass this back to python */ + return diffBragg.pythony_beams; + } + + void set_dspace_bins(simtbx::nanoBragg::diffBragg& diffBragg, boost::python::list bins){ diffBragg.db_cryst.dspace_bins.clear(); for (int i=0; i< boost::python::len(bins); i++ ){ @@ -155,7 +192,7 @@ namespace boost_python { namespace { values = boost::python::make_tuple(0,0); return values; } - //TODO override the set_sources function (or xray_beams) property in nanoBragg in order + //TODO override the set_sources function (or xray_beams) property in nanoBragg // to set the fpfdp accordingly (if Fhkl2 is set) static void set_atom_data(simtbx::nanoBragg::diffBragg & diffBragg, boost::python::tuple const& atom_XYZBO){ @@ -418,6 +455,7 @@ namespace boost_python { namespace { static void set_Fhkl_tuple(simtbx::nanoBragg::diffBragg& diffBragg, boost::python::tuple const& value) { //TODO nanoBragg set as well ? 
+ diffBragg.db_cu_flags.update_Fhkl=true; diffBragg.pythony_indices = extract(value[0]); diffBragg.pythony_amplitudes = extract >(value[1]); diffBragg.init_Fhkl(); @@ -432,7 +470,7 @@ namespace boost_python { namespace { diffBragg.linearize_Fhkl(true); } - static boost::python::tuple get_Fhkl_tuple(simtbx::nanoBragg::diffBragg diffBragg) { + static boost::python::tuple get_Fhkl_tuple(nanoBragg::diffBragg diffBragg) { int h,k,l; double temp; int hkls = diffBragg.h_range*diffBragg.k_range*diffBragg.l_range; @@ -471,9 +509,8 @@ namespace boost_python { namespace { } void initialize_kokkos(int dev){ - Kokkos::InitArguments kokkos_init; - kokkos_init.device_id = dev; - Kokkos::initialize(kokkos_init); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_device_id(dev)); } #endif @@ -628,12 +665,14 @@ namespace boost_python { namespace { .def("show_heavy_atom_data", &simtbx::nanoBragg::diffBragg::show_heavy_atom_data) + .def("gpu_free",&simtbx::nanoBragg::diffBragg::gpu_free) + #ifdef DIFFBRAGG_HAVE_CUDA .def("gpu_free",&simtbx::nanoBragg::diffBragg::cuda_free) #endif #ifdef DIFFBRAGG_HAVE_KOKKOS - .def("kokkos_gpu_free",&simtbx::nanoBragg::diffBragg::kokkos_free) + .def("kokkos_free",&simtbx::nanoBragg::diffBragg::kokkos_free) #endif .def("set_mosaic_blocks_prime", @@ -789,9 +828,9 @@ namespace boost_python { namespace { "coefficients for source_lambda refinement: `lambda = coef0 + coef1*source` where `source` is the source index") // CUDA PROPERTIES - .add_property("use_cuda", - make_getter(&simtbx::nanoBragg::diffBragg::use_cuda,rbv()), - make_setter(&simtbx::nanoBragg::diffBragg::use_cuda,dcp()), + .add_property("use_gpu", + make_getter(&simtbx::nanoBragg::diffBragg::use_gpu,rbv()), + make_setter(&simtbx::nanoBragg::diffBragg::use_gpu,dcp()), "use GPU acceleration") .add_property("record_time", @@ -886,6 +925,15 @@ namespace boost_python { namespace { &simtbx::nanoBragg::diffBragg::set_Friedel_mate_inds, "Two arguments; each lists of the same length, pointing to the positive and negative mates in a Friedel pair, respectively") + .def("get_mosaic_blocks_prime", + &simtbx::nanoBragg::diffBragg::get_mosaic_blocks_prime, + "return the deriv of the matrices U that define the mosaic block distribution w.r.t eta") + + .add_property("xray_beams", + make_function(&get_beams,rbv()), + make_function(&set_beams,dcp()), + "list of dxtbx::Beam objects corresponding to each zero-divergence and monochromatic x-ray point source in the numerical simulation ") + ; // end of diffBragg extention } // end of diffBragg_init_module diff --git a/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu b/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu index 9c382a5c60..6bf8b801e4 100644 --- a/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu +++ b/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu @@ -465,7 +465,7 @@ void gpu_sum_over_steps( CUDAREAL _F_cell = s_default_F; CUDAREAL _F_cell2 = 0; - int i_hklasu=0; + int i_hklasu=0; if ( (_h0<=s_h_max) && (_h0>=s_h_min) && (_k0<=s_k_max) && (_k0>=s_k_min) && (_l0<=s_l_max) && (_l0>=s_l_min) ) { int Fhkl_linear_index = (_h0-s_h_min) * s_k_range * s_l_range + (_k0-s_k_min) * s_l_range + (_l0-s_l_min); diff --git a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp index 69750ad9be..5b1acf243f 100644 --- a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp +++ b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp @@ -2,11 +2,6 @@ #include "diffBraggKOKKOS.h" #include -// using ::Kokkos::Experimental::exp; -// using ::Kokkos::Experimental::sin; 
-// using ::Kokkos::Experimental::cos; -// using ::Kokkos::Experimental::sqrt; - void kokkos_sum_over_steps( int Npix_to_model, vector_uint_t panels_fasts_slows, @@ -29,6 +24,8 @@ void kokkos_sum_over_steps( vector_cudareal_t d_panel_orig_images, vector_cudareal_t d2_panel_orig_images, vector_cudareal_t d_fp_fdp_images, + vector_manager_t manager_dI, + vector_manager_t manager_dI2, const int Nsteps, int printout_fpixel, int printout_spixel, @@ -99,15 +96,16 @@ void kokkos_sum_over_steps( bool compute_curvatures, const vector_cudareal_t FhklLinear, const vector_cudareal_t Fhkl2Linear, - vector_bool_t refine_Bmat, - vector_bool_t refine_Ncells, - bool refine_Ncells_def, - vector_bool_t refine_panel_origin, - vector_bool_t refine_panel_rot, - bool refine_fcell, - vector_bool_t refine_lambda, - bool refine_eta, - vector_bool_t refine_Umat, + const uint32_t refine_flag, + // vector_bool_t refine_Bmat, + // vector_bool_t refine_Ncells, + // bool refine_Ncells_def, + // vector_bool_t refine_panel_origin, + // vector_bool_t refine_panel_rot, + // bool refine_fcell, + // vector_bool_t refine_lambda, + // bool refine_eta, + // vector_bool_t refine_Umat, const vector_cudareal_t fdet_vectors, const vector_cudareal_t sdet_vectors, const vector_cudareal_t odet_vectors, @@ -124,7 +122,7 @@ void kokkos_sum_over_steps( const vector_cudareal_t fpfdp_derivs, const vector_cudareal_t atom_data, int num_atoms, - bool refine_fp_fdp, + // bool refine_fp_fdp, const vector_int_t nominal_hkl, bool use_nominal_hkl, KOKKOS_MAT3 anisoU, @@ -133,9 +131,9 @@ void kokkos_sum_over_steps( bool use_diffuse, vector_cudareal_t d_diffuse_gamma_images, vector_cudareal_t d_diffuse_sigma_images, - bool refine_diffuse, + // bool refine_diffuse, bool gamma_miller_units, - bool refine_Icell, + // bool refine_Icell, bool save_wavelenimage, int laue_group_num, int stencil_size, @@ -150,91 +148,10 @@ void kokkos_sum_over_steps( const vector_int_t data_freq, const vector_bool_t data_trusted, const vector_int_t FhklLinear_ASUid, - const vector_cudareal_t Fhkl_channels, + const vector_int_t Fhkl_channels, const vector_cudareal_t Fhkl_scale, vector_cudareal_t Fhkl_scale_deriv) { // BEGIN GPU kernel - // int tid = blockIdx.x * blockDim.x + threadIdx.x; - // int thread_stride = blockDim.x * gridDim.x; - // __shared__ bool s_Fhkl_channels_empty; - // __shared__ bool s_Fhkl_have_scale_factors; - // __shared__ bool s_Fhkl_gradient_mode; - // __shared__ bool s_Fhkl_errors_mode; - // __shared__ int s_Num_ASU; - // __shared__ bool s_refine_Icell; - // __shared__ bool s_use_diffuse; - // __shared__ bool s_use_nominal_hkl; - // __shared__ bool s_refine_fp_fdp; - // __shared__ bool s_complex_miller; - // __shared__ int s_num_atoms; - // __shared__ bool s_aniso_eta; - // __shared__ bool s_no_Nabc_scale; - // __shared__ bool s_compute_curvatures; - // __shared__ KOKKOS_MAT3 s_Ot; - // __shared__ bool s_refine_diffuse; - // __shared__ bool s_gamma_miller_units; - // __shared__ KOKKOS_MAT3 _NABC; - // __shared__ KOKKOS_MAT3 s_dN; - // __shared__ CUDAREAL C; - // __shared__ CUDAREAL two_C; - // __shared__ KOKKOS_MAT3 Bmat_realspace; - // __shared__ KOKKOS_MAT3 Amat_init; - //__shared__ CUDAREAL s_Na; - //__shared__ CUDAREAL s_Nb; - //__shared__ CUDAREAL s_Nc; - // __shared__ CUDAREAL s_NaNbNc_squared; // unused? 
- // __shared__ int s_h_max, s_k_max, s_l_max, s_h_min, s_k_min, s_l_min, s_k_range, s_l_range; - // __shared__ int s_sources, s_mosaic_domains; - // __shared__ CUDAREAL s_detector_attnlen, s_lambda0, s_lambda1; - // __shared__ bool s_printout; - // __shared__ KOKKOS_VEC3 s_polarization_axis; - - // __shared__ bool s_refine_Umat[3]; - // __shared__ bool s_refine_panel_origin[3]; - // __shared__ bool s_refine_panel_rot[3]; - // __shared__ bool s_refine_Ncells[3]; - // __shared__ bool s_refine_eta; - // __shared__ bool s_refine_Ncells_def; - // __shared__ bool s_refine_fcell; - // __shared__ bool s_refine_Bmat[6]; - // __shared__ bool s_refine_lambda[2]; - // __shared__ double s_NABC_det, s_NABC_det_sq; - // extern __shared__ CUDAREAL det_vecs[]; - //__shared__ int det_stride; - - // TODO can we get speed gains by dividing up the following definitions over more threads ? - // if (threadIdx.x == 0) { - // for (int i = 0; i < 3; i++) { - // s_refine_Ncells[i] = refine_Ncells[i]; - // s_refine_Umat[i] = refine_Umat[i]; - // s_refine_panel_origin[i] = refine_panel_origin[i]; - // s_refine_panel_rot[i] = refine_panel_rot[i]; - // } - // s_Fhkl_channels_empty = Fhkl_channels_empty; - // s_Fhkl_have_scale_factors = Fhkl_have_scale_factors; - // s_Fhkl_gradient_mode = Fhkl_gradient_mode; - // s_Fhkl_errors_mode = Fhkl_errors_mode; - // s_Num_ASU = Num_ASU; - // s_refine_Icell = refine_Icell; - // s_use_nominal_hkl = use_nominal_hkl; - // s_aniso_eta = aniso_eta; - // s_no_Nabc_scale = no_Nabc_scale; - // s_complex_miller = complex_miller; - // s_refine_lambda[0] = refine_lambda[0]; - // s_refine_lambda[1] = refine_lambda[1]; - // for (int i = 0; i < 6; i++) { - // s_refine_Bmat[i] = refine_Bmat[i]; - // } - // s_use_diffuse = use_diffuse; - // s_num_atoms = num_atoms; - // s_refine_fcell = refine_fcell; - // s_refine_eta = refine_eta; - // s_refine_Ncells_def = refine_Ncells_def; - // s_compute_curvatures = compute_curvatures; - // s_refine_fp_fdp = refine_fp_fdp; - // s_refine_diffuse = refine_diffuse; - // s_gamma_miller_units = gamma_miller_units; - const KOKKOS_MAT3 Bmat_realspace = eig_B * 1e10; const KOKKOS_MAT3 eig_Otranspose = eig_O.transpose(); const KOKKOS_MAT3 Amat_init = eig_U * Bmat_realspace * eig_Otranspose; @@ -245,1027 +162,2325 @@ void kokkos_sum_over_steps( const CUDAREAL C = 2 / 0.63 * fudge; const CUDAREAL two_C = 2 * C; KOKKOS_MAT3 anisoG_local; + CUDAREAL anisoG_determ = 0; KOKKOS_MAT3 anisoU_local; - KOKKOS_MAT3 laue_mats[24]; - KOKKOS_MAT3 dG_dgam[3]; + vector_mat3_t laue_mats = vector_mat3_t("laue_mats", 24); + vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam", 3); + vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace", 3); int num_laue_mats = 0; int dhh = 0, dkk = 0, dll = 0; - KOKKOS_VEC3 Hmin, Hmax, dHH, Hrange; - // s_Na = Na; - // s_Nb = Nb; - // s_Nc = Nc; - // s_NaNbNc_squared = (Na * Nb * Nc); - // s_NaNbNc_squared *= s_NaNbNc_squared; - // s_h_max = h_max; - // s_k_max = k_max; - // s_l_max = l_max; - // s_h_min = h_min; - // s_k_min = k_min; - // s_l_min = l_min; - // s_h_range = h_range; - // s_k_range = k_range; - // s_l_range = l_range; - - // s_oversample = oversample; - // s_detector_thicksteps = detector_thicksteps; - // s_sources = sources; - // s_mosaic_domains = mosaic_domains; - // s_detector_thickstep = detector_thickstep; - // s_detector_attnlen = detector_attnlen; - // s_subpixel_size = subpixel_size; - // s_pixel_size = pixel_size; - // s_detector_thick = _detector_thick; - // s_lambda0 = lambda0; - // s_lambda1 = lambda1; - // s_oversample_omega 
= _oversample_omega; - // s_printout = printout; - // s_printout_fpixel = printout_fpixel; - // s_printout_spixel = printout_spixel; - // s_default_F = default_F; - // s_verbose = verbose; - // s_polarization_axis = polarization_axis; - // s_kahn_factor = kahn_factor; - // s_nopolar = nopolar; - // sX0 = source_X[0]; - // sY0 = source_Y[0]; - // sZ0 = source_Z[0]; - // s_Nsteps = Nsteps; - - // } + + Kokkos::View UMATS_prime("UMATS_prime", mosaic_domains); + Kokkos::View UMATS_dbl_prime("UMATS_dbl_prime", mosaic_domains); + Kokkos::View BMATS_prime("BMATS_prime", mosaic_domains); + Kokkos::View BMATS_dbl_prime("BMATS_dbl_prime", mosaic_domains); + + Kokkos::parallel_for("prepare_UMATS", mosaic_domains, KOKKOS_LAMBDA(const int& _mos_tic) { + const KOKKOS_MAT3 UBOt = Amat_init; + UMATS_prime(_mos_tic, 0) = _NABC * (UMATS(_mos_tic) * dRotMats(0) * RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_prime(_mos_tic, 1) = _NABC * (UMATS(_mos_tic) * RotMats(0) * dRotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_prime(_mos_tic, 2) = _NABC * (UMATS(_mos_tic) * RotMats(0) * RotMats(1) * dRotMats(2) * UBOt).transpose(); + + UMATS_dbl_prime(_mos_tic, 0) = _NABC * (UMATS(_mos_tic) * d2RotMats(0) * RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_dbl_prime(_mos_tic, 1) = _NABC * (UMATS(_mos_tic) * RotMats(0) * d2RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_dbl_prime(_mos_tic, 2) = _NABC * (UMATS(_mos_tic) * RotMats(0) * RotMats(1) * d2RotMats(2) * UBOt).transpose(); + + for (int i_uc=0; i_uc<6; i_uc++) { + BMATS_prime(_mos_tic, i_uc) = _NABC * (UMATS_RXYZ(_mos_tic) * eig_U * dB_mats(i_uc) * eig_O.transpose()).transpose(); + BMATS_dbl_prime(_mos_tic, i_uc) = _NABC * (UMATS_RXYZ(_mos_tic) * eig_U * dB2_mats(i_uc) * eig_O.transpose()).transpose(); + } + }); if (use_diffuse){ anisoG_local = anisoG; anisoU_local = anisoU; - num_laue_mats = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); - - for (int i_gam=0; i_gam<3; i_gam++){ - dG_dgam[i_gam] << 0,0,0,0,0,0,0,0,0; - dG_dgam[i_gam](i_gam, i_gam) = 1; + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); } + if (gamma_miller_units){ anisoG_local = anisoG_local * Bmat_realspace; + } + Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){ + num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); + // KOKKOS_MAT3 rotate_principal_axes; + // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.; + + for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){ + laue_mats(iL) = Ainv * laue_mats(iL); + } + // printf("Bmat ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", Bmat_realspace[i]); + // } + // printf("\n"); + const KOKKOS_MAT3 Ginv = anisoG_local.inverse(); + // printf("Ginv ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", Ginv[i]); + // } + // printf("\n"); + const KOKKOS_MAT3 dG = Bmat_realspace * Ginv; + // printf("dG ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", dG[i]); + // } + // printf("\n"); for (int i_gam=0; i_gam<3; i_gam++){ - dG_dgam[i_gam] = dG_dgam[i_gam] * Bmat_realspace; + if (gamma_miller_units) { + dG_dgam(i_gam) = KOKKOS_VEC3(Bmat_realspace(i_gam, 0), Bmat_realspace(i_gam, 1), Bmat_realspace(i_gam, 2)); + } else { + dG_dgam(i_gam)[i_gam] = 1; + } + KOKKOS_MAT3 temp_dgam; + temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0]; + temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1]; + temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2]; + dG_trace(i_gam) = 
(Ginv*temp_dgam).trace(); + // printf("TRACE %g\n", dG_trace(i_gam)); + // printf("dgam ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", temp_dgam[i]); + // } + // printf("\n"); + + // dG(i_gam, i_gam); } - } + }, num_laue_mats); + anisoG_determ = anisoG_local.determinant(); dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc } - Hmin << h_min, k_min, l_min; - Hmax << h_max, k_max, l_max; - dHH << dhh, dkk, dll; - Hrange << h_range, k_range, l_range; + const KOKKOS_VEC3 dHH (dhh, dkk, dll); const CUDAREAL overall_scale = r_e_sqr * spot_scale * fluence / Nsteps; + const CUDAREAL detector_attnlen_r = (detector_attnlen>0) ? 1 / detector_attnlen : 0; + Kokkos::parallel_for( "sum_over_steps", Npix_to_model, KOKKOS_LAMBDA(const int& pixIdx) { - // __syncthreads(); - // for (int pixIdx=tid; pixIdx < Npix_to_model; pixIdx+= thread_stride){ - if (using_trusted_mask) { - if (!data_trusted(pixIdx)) - return; - } - int _pid = panels_fasts_slows(pixIdx * 3); - int _fpixel = panels_fasts_slows(pixIdx * 3 + 1); - int _spixel = panels_fasts_slows(pixIdx * 3 + 2); - - CUDAREAL Fhkl_deriv_coef=0; - CUDAREAL Fhkl_hessian_coef=0; - if (Fhkl_gradient_mode) { - CUDAREAL u = data_residual(pixIdx); - CUDAREAL one_by_v = 1/data_variance(pixIdx); - CUDAREAL Gterm = 1 - 2*u - u*u*one_by_v; - Fhkl_deriv_coef = 0.5 * Gterm*one_by_v / data_freq(pixIdx); - if (Fhkl_errors_mode) { - Fhkl_hessian_coef = -0.5*one_by_v*(one_by_v*Gterm - 2 - 2*u*one_by_v -u*u*one_by_v*one_by_v)/data_freq(pixIdx); - } + if (using_trusted_mask) { + if (!data_trusted(pixIdx)) + return; + } + const int _pid = panels_fasts_slows(pixIdx * 3); + const int _fpixel = panels_fasts_slows(pixIdx * 3 + 1); + const int _spixel = panels_fasts_slows(pixIdx * 3 + 2); + + CUDAREAL Fhkl_deriv_coef=0; + CUDAREAL Fhkl_hessian_coef=0; + if (Fhkl_gradient_mode) { + CUDAREAL u = data_residual(pixIdx); + CUDAREAL one_by_v = 1/data_variance(pixIdx); + CUDAREAL Gterm = 1 - 2*u - u*u*one_by_v; + Fhkl_deriv_coef = 0.5 * Gterm*one_by_v / data_freq(pixIdx); + if (Fhkl_errors_mode) { + Fhkl_hessian_coef = -0.5*one_by_v*(one_by_v*Gterm - 2 - 2*u*one_by_v -u*u*one_by_v*one_by_v)/data_freq(pixIdx); } + } - // int fcell_idx=1; - int nom_h = 0, nom_k = 0, nom_l = 0; - if (use_nominal_hkl) { - nom_h = nominal_hkl(pixIdx * 3); - nom_k = nominal_hkl(pixIdx * 3 + 1); - nom_l = nominal_hkl(pixIdx * 3 + 2); - } - CUDAREAL close_distance = close_distances(_pid); - - // reset photon count for this pixel - double _I = 0; - double Ilambda = 0; - - // reset derivative photon counts for the various parameters - double rot_manager_dI[3] = {0, 0, 0}; - double rot_manager_dI2[3] = {0, 0, 0}; - double ucell_manager_dI[6] = {0, 0, 0, 0, 0, 0}; - double ucell_manager_dI2[6] = {0, 0, 0, 0, 0, 0}; - double Ncells_manager_dI[6] = {0, 0, 0, 0, 0, 0}; - double Ncells_manager_dI2[6] = {0, 0, 0, 0, 0, 0}; - double pan_orig_manager_dI[3] = {0, 0, 0}; - double pan_orig_manager_dI2[3] = {0, 0, 0}; - double pan_rot_manager_dI[3] = {0, 0, 0}; - double pan_rot_manager_dI2[3] = {0, 0, 0}; - double fcell_manager_dI = 0; - double fcell_manager_dI2 = 0; - double eta_manager_dI[3] = {0, 0, 0}; - double eta_manager_dI2[3] = {0, 0, 0}; - double lambda_manager_dI[2] = {0, 0}; - double lambda_manager_dI2[2] = {0, 0}; - double fp_fdp_manager_dI[2] = {0, 0}; - double dI_diffuse[6] = {0, 0, 0, 0, 0, 0}; - - for (int _subS = 0; _subS < oversample; ++_subS) { - for (int _subF = 0; _subF < oversample; ++_subF) { - // absolute mm position on detector (relative to its origin) - CUDAREAL _Fdet = - 
subpixel_size * (_fpixel * oversample + _subF) + subpixel_size / 2.0; - CUDAREAL _Sdet = - subpixel_size * (_spixel * oversample + _subS) + subpixel_size / 2.0; - - // assume "distance" is to the front of the detector sensor layer - int pid_x = _pid * 3; - int pid_y = _pid * 3 + 1; - int pid_z = _pid * 3 + 2; - - CUDAREAL fx = fdet_vectors(pid_x); - CUDAREAL fy = fdet_vectors(pid_y); - CUDAREAL fz = fdet_vectors(pid_z); - CUDAREAL sx = sdet_vectors(pid_x); - CUDAREAL sy = sdet_vectors(pid_y); - CUDAREAL sz = sdet_vectors(pid_z); - CUDAREAL ox = odet_vectors(pid_x); - CUDAREAL oy = odet_vectors(pid_y); - CUDAREAL oz = odet_vectors(pid_z); - CUDAREAL px = pix0_vectors(pid_x); - CUDAREAL py = pix0_vectors(pid_y); - CUDAREAL pz = pix0_vectors(pid_z); - - KOKKOS_VEC3 _o_vec(ox, oy, oz); - - for (int _thick_tic = 0; _thick_tic < detector_thicksteps; ++_thick_tic) { - CUDAREAL _Odet = _thick_tic * detector_thickstep; - - CUDAREAL pixposX = _Fdet * fx + _Sdet * sx + _Odet * ox + px; - CUDAREAL pixposY = _Fdet * fy + _Sdet * sy + _Odet * oy + py; - CUDAREAL pixposZ = _Fdet * fz + _Sdet * sz + _Odet * oz + pz; - KOKKOS_VEC3 _pixel_pos(pixposX, pixposY, pixposZ); - - CUDAREAL _airpath = _pixel_pos.length(); - KOKKOS_VEC3 _diffracted = _pixel_pos.get_unit_vector(); - - // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) - CUDAREAL _omega_pixel = pixel_size * pixel_size / _airpath / _airpath * - close_distance / _airpath; - - // option to turn off obliquity effect, inverse-square-law only - if (point_pixel) - _omega_pixel = 1.0 / _airpath / _airpath; - - // now calculate detector thickness effects - CUDAREAL _capture_fraction = 1; - - if (detector_thick > 0.0 && detector_attnlen > 0.0) { - // inverse of effective thickness increase - CUDAREAL _parallax = _diffracted.dot(_o_vec); - _capture_fraction = ::Kokkos::Experimental::exp( - -_thick_tic * detector_thickstep / - detector_attnlen / _parallax) - - ::Kokkos::Experimental::exp( - -(_thick_tic + 1) * detector_thickstep / - detector_attnlen / _parallax); + // int fcell_idx=1; + int nom_h = 0, nom_k = 0, nom_l = 0; + if (use_nominal_hkl) { + nom_h = nominal_hkl(pixIdx * 3); + nom_k = nominal_hkl(pixIdx * 3 + 1); + nom_l = nominal_hkl(pixIdx * 3 + 2); + } + CUDAREAL close_distance = close_distances(_pid); + + // reset photon count for this pixel + double _I = 0; + double Ilambda = 0; + + kokkos_manager dI, dI2; + dI.reset(); + dI2.reset(); + + for (int _subS = 0; _subS < oversample; ++_subS) { + for (int _subF = 0; _subF < oversample; ++_subF) { + // absolute mm position on detector (relative to its origin) + CUDAREAL _Fdet = + subpixel_size * (_fpixel * oversample + _subF) + subpixel_size / 2.0; + CUDAREAL _Sdet = + subpixel_size * (_spixel * oversample + _subS) + subpixel_size / 2.0; + + // assume "distance" is to the front of the detector sensor layer + int pid_x = _pid * 3; + int pid_y = _pid * 3 + 1; + int pid_z = _pid * 3 + 2; + + + CUDAREAL fx = fdet_vectors(pid_x); + CUDAREAL fy = fdet_vectors(pid_y); + CUDAREAL fz = fdet_vectors(pid_z); + CUDAREAL sx = sdet_vectors(pid_x); + CUDAREAL sy = sdet_vectors(pid_y); + CUDAREAL sz = sdet_vectors(pid_z); + CUDAREAL ox = odet_vectors(pid_x); + CUDAREAL oy = odet_vectors(pid_y); + CUDAREAL oz = odet_vectors(pid_z); + CUDAREAL px = pix0_vectors(pid_x); + CUDAREAL py = pix0_vectors(pid_y); + CUDAREAL pz = pix0_vectors(pid_z); + KOKKOS_VEC3 _o_vec(ox, oy, oz); + + for (int _thick_tic = 0; _thick_tic < detector_thicksteps; ++_thick_tic) { + + CUDAREAL _Odet = _thick_tic * detector_thickstep; + + 
CUDAREAL pixposX = _Fdet * fx + _Sdet * sx + _Odet * ox + px; + CUDAREAL pixposY = _Fdet * fy + _Sdet * sy + _Odet * oy + py; + CUDAREAL pixposZ = _Fdet * fz + _Sdet * sz + _Odet * oz + pz; + KOKKOS_VEC3 _pixel_pos(pixposX, pixposY, pixposZ); + + CUDAREAL _airpath_r = 1 / _pixel_pos.length(); + KOKKOS_VEC3 _diffracted = _pixel_pos.get_unit_vector(); + + const CUDAREAL close_distance = close_distances(_pid); + + // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) + CUDAREAL _omega_pixel = pixel_size * pixel_size * _airpath_r * _airpath_r * + close_distance * _airpath_r; + + // option to turn off obliquity effect, inverse-square-law only + if (point_pixel) + _omega_pixel = _airpath_r * _airpath_r; + + // now calculate detector thickness effects + CUDAREAL _capture_fraction = 1; + + CUDAREAL previous_layer = 1.0; + if (detector_thick > 0.0 && detector_attnlen_r > 0.0) { + // inverse of effective thickness increase + KOKKOS_VEC3 _o_vec(ox, oy, oz); + CUDAREAL _parallax = _diffracted.dot(_o_vec); + CUDAREAL current_layer = ::Kokkos::exp( + -(_thick_tic + 1) * detector_thickstep * + detector_attnlen_r / _parallax); + _capture_fraction = previous_layer - current_layer; + previous_layer = current_layer; + } + + for (int _source = 0; _source < sources; ++_source) { + + KOKKOS_VEC3 _incident( + -source_X(_source), -source_Y(_source), -source_Z(_source)); + CUDAREAL _lambda = source_lambda(_source); + CUDAREAL sI = source_I(_source); + CUDAREAL lambda_ang = _lambda * 1e10; + if (use_lambda_coefficients) { + lambda_ang = lambda0 + lambda1 * lambda_ang; + _lambda = lambda_ang * 1e-10; } - CUDAREAL cap_frac_times_omega = _capture_fraction * _omega_pixel; - - for (int _source = 0; _source < sources; ++_source) { - KOKKOS_VEC3 _incident( - -source_X(_source), -source_Y(_source), -source_Z(_source)); - CUDAREAL _lambda = source_lambda(_source); - CUDAREAL sI = source_I(_source); - CUDAREAL lambda_ang = _lambda * 1e10; - if (use_lambda_coefficients) { - lambda_ang = lambda0 + lambda1 * lambda_ang; - _lambda = lambda_ang * 1e-10; - } - // polarization - CUDAREAL polar_for_Fhkl_grad=1; - if (!nopolar && Fhkl_gradient_mode){ - //polar_for_Fhkl_grad = diffBragg_gpu_kernel_polarization(_incident, _diffracted, - // s_polarization_axis, s_kahn_factor); - // component of diffracted unit vector along incident beam unit vector - CUDAREAL cos2theta = _incident.dot(_diffracted); - CUDAREAL cos2theta_sqr = cos2theta*cos2theta; - CUDAREAL sin2theta_sqr = 1-cos2theta_sqr; - - CUDAREAL _psi=0; - if(kahn_factor != 0.0){ - // cross product to get "vertical" axis that is orthogonal to the cannonical "polarization" - KOKKOS_VEC3 B_in = polarization_axis.cross(_incident); - // cross product with incident beam to get E-vector direction - KOKKOS_VEC3 E_in = _incident.cross(B_in); - // get components of diffracted ray projected onto the E-B plane - CUDAREAL _kEi = _diffracted.dot(E_in); - CUDAREAL _kBi = _diffracted.dot(B_in); - // compute the angle of the diffracted ray projected onto the incident E-B plane - _psi = -atan2(_kBi,_kEi); + // polarization + CUDAREAL polar_for_Fhkl_grad=1; + if (!nopolar && Fhkl_gradient_mode){ + + // component of diffracted unit vector along incident beam unit vector + CUDAREAL cos2theta = _incident.dot(_diffracted); + CUDAREAL cos2theta_sqr = cos2theta*cos2theta; + CUDAREAL sin2theta_sqr = 1-cos2theta_sqr; + + CUDAREAL cos2psi=0; + if(kahn_factor != 0.0){ + // cross product to get "vertical" axis that is orthogonal to the cannonical "polarization" + KOKKOS_VEC3 B_in = 
polarization_axis.cross(_incident); + // cross product with incident beam to get E-vector direction + KOKKOS_VEC3 E_in = _incident.cross(B_in); + // get components of diffracted ray projected onto the E-B plane + CUDAREAL _kEi = _diffracted.dot(E_in); + CUDAREAL _kBi = _diffracted.dot(B_in); + // compute the angle of the diffracted ray projected onto the incident E-B plane + // calculate cos(2 * atan2(_kBi, _kEi)) + if (_kEi!=0) { + CUDAREAL ratio = _kBi / _kEi; + cos2psi = (1 - ratio*ratio) / (1 + ratio*ratio); + } else { + cos2psi = -1; } - // correction for polarized incident beam - polar_for_Fhkl_grad = 0.5*(1.0 + cos2theta_sqr - kahn_factor*cos(2*_psi)*sin2theta_sqr); + } + // correction for polarized incident beam + polar_for_Fhkl_grad = 0.5*(1.0 + cos2theta_sqr - kahn_factor*cos2psi*sin2theta_sqr); + } + KOKKOS_VEC3 _scattering = (_diffracted - _incident) / _lambda; + + KOKKOS_VEC3 q_vec = _scattering * 1e-10; + + // TODO rename + CUDAREAL texture_scale = _capture_fraction * _omega_pixel * sI; + + for (int _mos_tic = 0; _mos_tic < mosaic_domains; ++_mos_tic) { + const KOKKOS_MAT3 UBO = Amatrices(_mos_tic); + + KOKKOS_VEC3 H_vec = UBO * q_vec; + CUDAREAL _h = H_vec[0]; + CUDAREAL _k = H_vec[1]; + CUDAREAL _l = H_vec[2]; + + int _h0 = ceil(_h - 0.5); + int _k0 = ceil(_k - 0.5); + int _l0 = ceil(_l - 0.5); + + KOKKOS_VEC3 H0(_h0, _k0, _l0); + + KOKKOS_VEC3 delta_H = H_vec - H0; + KOKKOS_VEC3 V = _NABC * delta_H; + CUDAREAL _hrad_sqr = V.length_sqr(); + CUDAREAL exparg = _hrad_sqr * C / 2; + CUDAREAL I0 = 0; + + if (exparg < 35) + if (no_Nabc_scale) + I0 = ::Kokkos::exp(-2 * exparg); + else + I0 = (NABC_det_sq) * + ::Kokkos::exp(-2 * exparg); + + // are we doing diffuse scattering + CUDAREAL step_diffuse_param[6] = {0, 0, 0, 0, 0, 0}; + if (use_diffuse) { + calc_diffuse_at_hkl(H_vec,H0,dHH,h_min,k_min,l_min,h_max,k_max,l_max,h_range,k_range,l_range,Ainv,FhklLinear,num_laue_mats,laue_mats,anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,(refine_flag & REFINE_DIFFUSE)>0,&I0,step_diffuse_param); + } // end s_use_diffuse outer + + CUDAREAL _F_cell = default_F; + CUDAREAL _F_cell2 = 0; + int i_hklasu=0; + + if ((_h0 <= h_max) && (_h0 >= h_min) && + (_k0 <= k_max) && (_k0 >= k_min) && + (_l0 <= l_max) && (_l0 >= l_min)) { + int Fhkl_linear_index = (_h0 - h_min) * k_range * l_range + + (_k0 - k_min) * l_range + (_l0 - l_min); + //_F_cell = __ldg(&FhklLinear[Fhkl_linear_index]); + _F_cell = FhklLinear(Fhkl_linear_index); + // if (complex_miller) _F_cell2 = + // __ldg(&Fhkl2Linear[Fhkl_linear_index]); + if (complex_miller) + _F_cell2 = Fhkl2Linear(Fhkl_linear_index); + if (Fhkl_have_scale_factors) + i_hklasu = FhklLinear_ASUid(Fhkl_linear_index); } - KOKKOS_VEC3 _scattering = (_diffracted - _incident) / _lambda; - - KOKKOS_VEC3 q_vec(_scattering[0], _scattering[1], _scattering[2]); - q_vec *= 1e-10; - - // TODO rename - CUDAREAL texture_scale = 1; - texture_scale *= cap_frac_times_omega; - texture_scale *= sI; + CUDAREAL c_deriv_Fcell = 0; + CUDAREAL d_deriv_Fcell = 0; + if (complex_miller) { + CUDAREAL c_deriv_Fcell_real = 0; + CUDAREAL c_deriv_Fcell_imag = 0; + CUDAREAL d_deriv_Fcell_real = 0; + CUDAREAL d_deriv_Fcell_imag = 0; + if (num_atoms > 0) { + CUDAREAL S_2 = (q_vec[0] * q_vec[0] + + q_vec[1] * q_vec[1] + + q_vec[2] * q_vec[2]); + + // fp is always followed by the fdp value + CUDAREAL val_fp = fpfdp(2 * _source); + CUDAREAL val_fdp = fpfdp(2 * _source + 1); + + CUDAREAL c_deriv_prime = 0; + CUDAREAL c_deriv_dblprime = 0; + CUDAREAL d_deriv_prime = 0; + CUDAREAL d_deriv_dblprime 
= 0; + if (refine_flag & REFINE_FP_FDP) { + // currently only supports two parameter model + int d_idx = 2 * _source; + c_deriv_prime = fpfdp_derivs(d_idx); + c_deriv_dblprime = fpfdp_derivs(d_idx + 1); + d_deriv_prime = fpfdp_derivs(d_idx + 2 * sources); + d_deriv_dblprime = + fpfdp_derivs(d_idx + 1 + 2 * sources); + } - for (int _mos_tic = 0; _mos_tic < mosaic_domains; ++_mos_tic) { - int amat_idx = _mos_tic; - KOKKOS_MAT3 UBO = Amatrices(amat_idx); + for (int i_atom = 0; i_atom < num_atoms; i_atom++) { + // fractional atomic coordinates + CUDAREAL atom_x = atom_data(i_atom * 5); + CUDAREAL atom_y = atom_data(i_atom * 5 + 1); + CUDAREAL atom_z = atom_data(i_atom * 5 + 2); + CUDAREAL B = atom_data(i_atom * 5 + 3); // B factor + B = ::Kokkos::exp( + -B * S_2 / 4.0); // TODO: speed me up? + CUDAREAL occ = atom_data(i_atom * 5 + 4); // occupancy + CUDAREAL r_dot_h = + _h0 * atom_x + _k0 * atom_y + _l0 * atom_z; + CUDAREAL phase = 2 * M_PI * r_dot_h; + CUDAREAL s_rdoth = ::Kokkos::sin(phase); + CUDAREAL c_rdoth = ::Kokkos::cos(phase); + CUDAREAL Bocc = B * occ; + CUDAREAL BC = B * c_rdoth; + CUDAREAL BS = B * s_rdoth; + CUDAREAL real_part = BC * val_fp - BS * val_fdp; + CUDAREAL imag_part = BS * val_fp + BC * val_fdp; + _F_cell += real_part; + _F_cell2 += imag_part; + if (refine_flag & REFINE_FP_FDP) { + c_deriv_Fcell_real += + BC * c_deriv_prime - BS * c_deriv_dblprime; + c_deriv_Fcell_imag += + BS * c_deriv_prime + BC * c_deriv_dblprime; + + d_deriv_Fcell_real += + BC * d_deriv_prime - BS * d_deriv_dblprime; + d_deriv_Fcell_imag += + BS * d_deriv_prime + BC * d_deriv_dblprime; + } + } + } + CUDAREAL Freal = _F_cell; + CUDAREAL Fimag = _F_cell2; + _F_cell = + ::Kokkos::sqrt(Freal * Freal + Fimag * Fimag); + if (refine_flag & REFINE_FP_FDP) { + c_deriv_Fcell = + Freal * c_deriv_Fcell_real + Fimag * c_deriv_Fcell_imag; + d_deriv_Fcell = + Freal * d_deriv_Fcell_real + Fimag * d_deriv_Fcell_imag; + } + } + if (!oversample_omega && ! Fhkl_gradient_mode) + _omega_pixel = 1; + + CUDAREAL _I_cell = _F_cell; + if (!(refine_flag & REFINE_ICELL)) + _I_cell *= _F_cell; + CUDAREAL hkl=1; + int Fhkl_channel=0; + if (! Fhkl_channels_empty) + Fhkl_channel = Fhkl_channels(_source); + if (Fhkl_have_scale_factors) + hkl = Fhkl_scale(i_hklasu + Fhkl_channel*Num_ASU); + if (Fhkl_gradient_mode){ + CUDAREAL Fhkl_deriv_scale = overall_scale*polar_for_Fhkl_grad; + CUDAREAL I_noFcell=texture_scale*I0; + CUDAREAL dfhkl = I_noFcell*_I_cell * Fhkl_deriv_scale; + CUDAREAL grad_incr = dfhkl*Fhkl_deriv_coef; + int fhkl_grad_idx=i_hklasu + Fhkl_channel*Num_ASU; + + if (Fhkl_errors_mode){ + // here we hi-kack the Fhkl_scale_deriv array, if computing errors, in order to store the hessian terms + // if we are getting the hessian terms, we no longer need the gradients (e.g. 
by this point we are done refininig) + CUDAREAL hessian_incr = Fhkl_hessian_coef*dfhkl*dfhkl; + ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), hessian_incr); + } + else{ + ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), grad_incr); + } + continue; + } - KOKKOS_VEC3 H_vec = UBO * q_vec; - CUDAREAL _h = H_vec[0]; - CUDAREAL _k = H_vec[1]; - CUDAREAL _l = H_vec[2]; + CUDAREAL _I_total = hkl*_I_cell *I0; + CUDAREAL Iincrement = _I_total * texture_scale; + _I += Iincrement; + if (save_wavelenimage) + Ilambda += Iincrement * lambda_ang; + + if (refine_flag & REFINE_DIFFUSE) { + CUDAREAL step_scale = texture_scale * _F_cell * _F_cell; + for (int i_diff = 0; i_diff < 6; i_diff++) { + dI.diffuse[i_diff] += + step_scale * step_diffuse_param[i_diff]; + } + } - int _h0 = ceil(_h - 0.5); - int _k0 = ceil(_k - 0.5); - int _l0 = ceil(_l - 0.5); + //************************************************* + // START REFINEMENT - KOKKOS_VEC3 H0(_h0, _k0, _l0); + if (refine_flag & REFINE_FP_FDP) { + CUDAREAL I_noFcell = texture_scale * I0; + dI.fp_fdp[0] += 2 * I_noFcell * (c_deriv_Fcell); + dI.fp_fdp[1] += 2 * I_noFcell * (d_deriv_Fcell); + } - KOKKOS_VEC3 delta_H = H_vec - H0; - KOKKOS_VEC3 V = _NABC * delta_H; - CUDAREAL _hrad_sqr = V.dot(V); - CUDAREAL exparg = _hrad_sqr * C / 2; - CUDAREAL I0 = 0; + if (verbose > 3) + printf( + "hkl= %f %f %f hkl1= %d %d %d Fcell=%f\n", _h, _k, _l, + _h0, _k0, _l0, _F_cell); - if (exparg < 35) - if (no_Nabc_scale) - I0 = ::Kokkos::Experimental::exp(-2 * exparg); - else - I0 = (NABC_det_sq) * - ::Kokkos::Experimental::exp(-2 * exparg); - - // are we doing diffuse scattering - CUDAREAL step_diffuse_param[6] = {0, 0, 0, 0, 0, 0}; - if (use_diffuse) { - calc_diffuse_at_hkl(H_vec,H0,dHH,Hmin,Hmax,Hrange,Ainv,FhklLinear,num_laue_mats,laue_mats,anisoG_local,anisoU_local,dG_dgam,refine_diffuse,&I0,step_diffuse_param); - } // end s_use_diffuse outer - - CUDAREAL _F_cell = default_F; - CUDAREAL _F_cell2 = 0; - int i_hklasu=0; - - if ((_h0 <= h_max) && (_h0 >= h_min) && (_k0 <= k_max) && - (_k0 >= k_min) && (_l0 <= l_max) && (_l0 >= l_min)) { - int Fhkl_linear_index = (_h0 - h_min) * k_range * l_range + - (_k0 - k_min) * l_range + (_l0 - l_min); - //_F_cell = __ldg(&FhklLinear[Fhkl_linear_index]); - _F_cell = FhklLinear(Fhkl_linear_index); - // if (complex_miller) _F_cell2 = - // __ldg(&Fhkl2Linear[Fhkl_linear_index]); - if (complex_miller) - _F_cell2 = Fhkl2Linear(Fhkl_linear_index); - if (Fhkl_have_scale_factors) - i_hklasu = FhklLinear_ASUid(Fhkl_linear_index); + KOKKOS_MAT3 UBOt; + if (refine_flag & (REFINE_UMAT | REFINE_ETA)) { + UBOt = Amat_init; + } + if (refine_flag & REFINE_UMAT1) { + const KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 0) * q_vec; + const CUDAREAL V_dot_dV = V.dot(dV); + const CUDAREAL value = -two_C * V_dot_dV * Iincrement; + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + const CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 0)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; } - - CUDAREAL c_deriv_Fcell = 0; - CUDAREAL d_deriv_Fcell = 0; - if (complex_miller) { - CUDAREAL c_deriv_Fcell_real = 0; - CUDAREAL c_deriv_Fcell_imag = 0; - CUDAREAL d_deriv_Fcell_real = 0; - CUDAREAL d_deriv_Fcell_imag = 0; - if (num_atoms > 0) { - CUDAREAL S_2 = 1.e-20 * (_scattering[0] * _scattering[0] + - _scattering[1] * _scattering[1] + - _scattering[2] * _scattering[2]); - - // fp is always followed by the fdp value - CUDAREAL val_fp = fpfdp(2 * _source); - CUDAREAL 
val_fdp = fpfdp(2 * _source + 1); - - CUDAREAL c_deriv_prime = 0; - CUDAREAL c_deriv_dblprime = 0; - CUDAREAL d_deriv_prime = 0; - CUDAREAL d_deriv_dblprime = 0; - if (refine_fp_fdp) { - // currently only supports two parameter model - int d_idx = 2 * _source; - c_deriv_prime = fpfdp_derivs(d_idx); - c_deriv_dblprime = fpfdp_derivs(d_idx + 1); - d_deriv_prime = fpfdp_derivs(d_idx + 2 * sources); - d_deriv_dblprime = - fpfdp_derivs(d_idx + 1 + 2 * sources); - } - - for (int i_atom = 0; i_atom < num_atoms; i_atom++) { - // fractional atomic coordinates - CUDAREAL atom_x = atom_data(i_atom * 5); - CUDAREAL atom_y = atom_data(i_atom * 5 + 1); - CUDAREAL atom_z = atom_data(i_atom * 5 + 2); - CUDAREAL B = atom_data(i_atom * 5 + 3); // B factor - B = ::Kokkos::Experimental::exp( - -B * S_2 / 4.0); // TODO: speed me up? - CUDAREAL occ = atom_data(i_atom * 5 + 4); // occupancy - CUDAREAL r_dot_h = - _h0 * atom_x + _k0 * atom_y + _l0 * atom_z; - CUDAREAL phase = 2 * M_PI * r_dot_h; - CUDAREAL s_rdoth = ::Kokkos::Experimental::sin(phase); - CUDAREAL c_rdoth = ::Kokkos::Experimental::cos(phase); - CUDAREAL Bocc = B * occ; - CUDAREAL BC = B * c_rdoth; - CUDAREAL BS = B * s_rdoth; - CUDAREAL real_part = BC * val_fp - BS * val_fdp; - CUDAREAL imag_part = BS * val_fp + BC * val_fdp; - _F_cell += real_part; - _F_cell2 += imag_part; - if (refine_fp_fdp) { - c_deriv_Fcell_real += - BC * c_deriv_prime - BS * c_deriv_dblprime; - c_deriv_Fcell_imag += - BS * c_deriv_prime + BC * c_deriv_dblprime; - - d_deriv_Fcell_real += - BC * d_deriv_prime - BS * d_deriv_dblprime; - d_deriv_Fcell_imag += - BS * d_deriv_prime + BC * d_deriv_dblprime; - } - } - } - CUDAREAL Freal = _F_cell; - CUDAREAL Fimag = _F_cell2; - _F_cell = - ::Kokkos::Experimental::sqrt(Freal * Freal + Fimag * Fimag); - if (refine_fp_fdp) { - c_deriv_Fcell = - Freal * c_deriv_Fcell_real + Fimag * c_deriv_Fcell_imag; - d_deriv_Fcell = - Freal * d_deriv_Fcell_real + Fimag * d_deriv_Fcell_imag; + dI.rot[0] += value; + dI2.rot[0] += value2; + } + if (refine_flag & REFINE_UMAT2) { + KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 1) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 1)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; + } + dI.rot[1] += value; + dI2.rot[1] += value2; + } + if (refine_flag & REFINE_UMAT3) { + KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 2) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 2)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; + } + dI.rot[2] += value; + dI2.rot[2] += value2; + } + // Checkpoint for unit cell derivatives + for (int i_uc = 0; i_uc < 6; i_uc++) { + if (refine_flag & (REFINE_BMAT1 << i_uc)) { + KOKKOS_VEC3 dV = BMATS_prime(_mos_tic, i_uc) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(BMATS_dbl_prime(_mos_tic, i_uc)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; } + dI.ucell[i_uc] += value; + dI2.ucell[i_uc] += value2; } - 
if (!oversample_omega && ! Fhkl_gradient_mode) - _omega_pixel = 1; - - CUDAREAL _I_cell = _F_cell; - if (!refine_Icell) - _I_cell *= _F_cell; - CUDAREAL hkl=1; - int Fhkl_channel=0; - if (! Fhkl_channels_empty) - Fhkl_channel = Fhkl_channels(_source); - if (Fhkl_have_scale_factors) - hkl = Fhkl_scale(i_hklasu + Fhkl_channel*Num_ASU); - if (Fhkl_gradient_mode){ - CUDAREAL Fhkl_deriv_scale = overall_scale*polar_for_Fhkl_grad; - CUDAREAL I_noFcell=texture_scale*I0; - CUDAREAL dfhkl = I_noFcell*_I_cell * Fhkl_deriv_scale; - CUDAREAL grad_incr = dfhkl*Fhkl_deriv_coef; - int fhkl_grad_idx=i_hklasu + Fhkl_channel*Num_ASU; - - if (Fhkl_errors_mode){ - // here we hi-kack the Fhkl_scale_deriv array, if computing errors, in order to store the hessian terms - // if we are getting the hessian terms, we no longer need the gradients (e.g. by this point we are done refininig) - CUDAREAL hessian_incr = Fhkl_hessian_coef*dfhkl*dfhkl; - ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), hessian_incr); + } // end ucell deriv + + // Checkpoint for Ncells manager + if (refine_flag & REFINE_NCELLS1) { + int num_ncell_deriv = 1; + if (!isotropic_ncells) + num_ncell_deriv = 3; + for (int i_nc = 0; i_nc < num_ncell_deriv; i_nc++) { + KOKKOS_MAT3 dN; + dN(i_nc, i_nc) = 1; + if (num_ncell_deriv == 1) { + dN(0, 0) = 1; + dN(1, 1) = 1; + dN(2, 2) = 1; } - else{ - ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), grad_incr); + CUDAREAL N_i = _NABC(i_nc, i_nc); + KOKKOS_VEC3 dV_dN = dN.dot(delta_H); + // TODO speedops: precompute these, store shared var + // _NABC.inverse + CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); + CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); + CUDAREAL value = 2 * Iincrement * deriv_coef; + CUDAREAL value2 = 0; + if (compute_curvatures) { + value2 = (-1 / N_i / N_i - C * (dV_dN.dot(dV_dN))) * 2 * + Iincrement; + value2 += deriv_coef * 2 * value; } - continue; + dI.Ncells[i_nc] += value; + dI2.Ncells[i_nc] += value2; } - - CUDAREAL _I_total = hkl*_I_cell *I0; - CUDAREAL Iincrement = _I_total * texture_scale; - _I += Iincrement; - if (save_wavelenimage) - Ilambda += Iincrement * lambda_ang; - - if (refine_diffuse) { - CUDAREAL step_scale = texture_scale * _F_cell * _F_cell; - for (int i_diff = 0; i_diff < 6; i_diff++) { - dI_diffuse[i_diff] += - step_scale * step_diffuse_param[i_diff]; + } // end Ncells manager deriv + + if (refine_flag & REFINE_NCELLS_DEF) { + for (int i_nc = 3; i_nc < 6; i_nc++) { + KOKKOS_MAT3 dN; + if (i_nc == 3) + dN = KOKKOS_MAT3{0, 1, 0, 1, 0, 0, 0, 0, 0}; + else if (i_nc == 4) + dN = KOKKOS_MAT3{0, 0, 0, 0, 0, 1, 0, 1, 0}; + else + dN = KOKKOS_MAT3{0, 0, 1, 0, 0, 0, 1, 0, 0}; + KOKKOS_VEC3 dV_dN = dN.dot(delta_H); + // TODO speedops: precompute these + CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); + CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); + CUDAREAL value = 2 * Iincrement * deriv_coef; + dI.Ncells[i_nc] += value; + CUDAREAL value2 = 0; + if (compute_curvatures) { + value2 = deriv_coef * value; + value2 += -2 * C * Iincrement * (dV_dN.dot(dV_dN)); + dI2.Ncells[i_nc] += value2; } } + } - if (refine_fp_fdp) { - CUDAREAL I_noFcell = texture_scale * I0; - fp_fdp_manager_dI[0] += 2 * I_noFcell * (c_deriv_Fcell); - fp_fdp_manager_dI[1] += 2 * I_noFcell * (d_deriv_Fcell); - } + // Checkpoint for Origin manager + for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { + if (refine_flag & (REFINE_PANEL_ORIGIN1 << i_pan_orig)) { + CUDAREAL per_k = _airpath_r; + CUDAREAL per_k3 = pow(per_k, 3.); + CUDAREAL per_k5 = 
pow(per_k, 5.); + + KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; + KOKKOS_VEC3 dk; + if (i_pan_orig == 0) + dk = KOKKOS_VEC3{0, 0, 1}; + else if (i_pan_orig == 1) + dk = KOKKOS_VEC3{1, 0, 0}; + else + dk = KOKKOS_VEC3{0, 1, 0}; + + CUDAREAL G = dk.dot(_pixel_pos); + CUDAREAL pix2 = subpixel_size * subpixel_size; + KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; + CUDAREAL coef = (M.dot(dk_hat)).dot(V); + CUDAREAL coef2 = + -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); + coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); + CUDAREAL value = + coef * Iincrement + coef2 * Iincrement / _omega_pixel; + + dI.pan_orig[i_pan_orig] += value; + dI2.pan_orig[i_pan_orig] += 0; + + } // end origin manager deriv + } - if (verbose > 3) - printf( - "hkl= %f %f %f hkl1= %d %d %d Fcell=%f\n", _h, _k, _l, - _h0, _k0, _l0, _F_cell); + for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { + if (refine_flag & (REFINE_PANEL_ROT1 << i_pan_rot)) { + CUDAREAL per_k = _airpath_r; + CUDAREAL per_k3 = pow(per_k, 3.); + CUDAREAL per_k5 = pow(per_k, 5.); + KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; + KOKKOS_VEC3 dk = _Fdet * (dF_vecs(_pid * 3 + i_pan_rot)) + + _Sdet * (dS_vecs(_pid * 3 + i_pan_rot)); + CUDAREAL G = dk.dot(_pixel_pos); + CUDAREAL pix2 = subpixel_size * subpixel_size; + KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; + CUDAREAL coef = (M.dot(dk_hat)).dot(V); + CUDAREAL coef2 = + -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); + coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); + CUDAREAL value = + coef * Iincrement + coef2 * Iincrement / _omega_pixel; + + dI.pan_rot[i_pan_rot] += value; + dI2.pan_rot[i_pan_rot] += 0; + } + } - KOKKOS_MAT3 UBOt; - if (refine_Umat(0) || refine_Umat(1) || refine_Umat(2) || - refine_eta) { - UBOt = Amat_init; + // checkpoint for Fcell manager + if (refine_flag & REFINE_FCELL) { + CUDAREAL value; + if (refine_flag & REFINE_ICELL) + value = I0 * texture_scale; + else + value = 2 * I0 * _F_cell * + texture_scale; // Iincrement/_F_cell ; + CUDAREAL value2 = 0; + if (compute_curvatures) { + // NOTE if _Fcell >0 + value2 = 2 * I0 * texture_scale; } - if (refine_Umat(0)) { - KOKKOS_MAT3 RyRzUBOt = RotMats(1) * RotMats(2) * UBOt; - KOKKOS_VEC3 delta_H_prime = - (UMATS(_mos_tic) * dRotMats(0) * RyRzUBOt) - .transpose() - .dot(q_vec); - CUDAREAL V_dot_dV = V.dot(_NABC.dot(delta_H_prime)); - CUDAREAL value = -two_C * V_dot_dV * Iincrement; - CUDAREAL value2 = 0; - if (compute_curvatures) { - KOKKOS_VEC3 delta_H_dbl_prime = - (UMATS(_mos_tic).dot(d2RotMats(0).dot(RyRzUBOt))) - .transpose() - .dot(q_vec); - CUDAREAL dV_dot_dV = (_NABC.dot(delta_H_prime)) - .dot(_NABC.dot(delta_H_prime)); - CUDAREAL dV2_dot_V = - (_NABC.dot(delta_H)).dot(_NABC.dot(delta_H_dbl_prime)); - value2 = - two_C * - (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * - Iincrement; + // if (fcell_idx >=0 && fcell_idx <=2){ + if (use_nominal_hkl) { + if (_h0 == nom_h && _k0 == nom_k && _l0 == nom_l) { + dI.fcell += value; + dI2.fcell += value2; } - rot_manager_dI[0] += value; - rot_manager_dI2[0] += value2; + } else { + dI.fcell += value; + dI2.fcell += value2; } - if (refine_Umat(1)) { - KOKKOS_MAT3 UmosRx = UMATS(_mos_tic).dot(RotMats(0)); - KOKKOS_MAT3 RzUBOt = RotMats(2).dot(UBOt); - KOKKOS_VEC3 delta_H_prime = (UmosRx.dot(dRotMats(1).dot(RzUBOt))) - .transpose() - .dot(q_vec); - CUDAREAL V_dot_dV = V.dot(_NABC.dot(delta_H_prime)); - CUDAREAL value = -two_C * V_dot_dV * Iincrement; - - CUDAREAL value2 = 0; + } // end of fcell man deriv + + // checkpoint for eta manager 
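+                        // eta (mosaic spread) derivatives: a single isotropic term unless aniso_eta is set, in which case one term per rotation axis via UMATS_RXYZ_prime / UMATS_RXYZ_dbl_prime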
+ if (refine_flag & REFINE_ETA) { + for (int i_eta = 0; i_eta < 3; i_eta++) { + if (i_eta > 0 && !aniso_eta) + continue; + int mtic2 = _mos_tic + i_eta * mosaic_domains; + KOKKOS_VEC3 DeltaH_deriv = (UMATS_RXYZ_prime(mtic2).dot(UBOt)) + .transpose() + .dot(q_vec); + // vector V is _Nabc*Delta_H + KOKKOS_VEC3 dV = _NABC.dot(DeltaH_deriv); + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL Iprime = -two_C * (V_dot_dV)*Iincrement; + dI.eta[i_eta] += Iprime; + CUDAREAL Idbl_prime = 0; if (compute_curvatures) { - KOKKOS_VEC3 delta_H_dbl_prime = - (UmosRx.dot(d2RotMats(1).dot(RzUBOt))) + KOKKOS_VEC3 DeltaH_second_deriv = + (UMATS_RXYZ_dbl_prime(mtic2).dot(UBOt)) .transpose() .dot(q_vec); - CUDAREAL dV_dot_dV = (_NABC.dot(delta_H_prime)) - .dot(_NABC.dot(delta_H_prime)); - CUDAREAL dV2_dot_V = - (_NABC.dot(delta_H)).dot(_NABC.dot(delta_H_dbl_prime)); - value2 = - two_C * - (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * - Iincrement; + KOKKOS_VEC3 dV2 = _NABC.dot(DeltaH_second_deriv); + Idbl_prime = + -two_C * (dV.dot(dV) + V.dot(dV2)) * Iincrement; + Idbl_prime += -two_C * (V_dot_dV)*Iprime; } - rot_manager_dI[1] += value; - rot_manager_dI2[1] += value2; + dI2.eta[i_eta] += Idbl_prime; } - if (refine_Umat(2)) { - KOKKOS_MAT3 UmosRxRy = UMATS(_mos_tic).dot(RotMats(0).dot(RotMats(1))); - KOKKOS_VEC3 delta_H_prime = (UmosRxRy.dot(dRotMats(2).dot(UBOt))) - .transpose() - .dot(q_vec); - CUDAREAL V_dot_dV = V.dot(_NABC.dot(delta_H_prime)); - CUDAREAL value = -two_C * V_dot_dV * Iincrement; - + } // end of eta man deriv + + // checkpoint for lambda manager + for (int i_lam = 0; i_lam < 2; i_lam++) { + if (refine_flag & (REFINE_LAMBDA << i_lam)) { + CUDAREAL NH_dot_V = (_NABC.dot(H_vec)).dot(V); + CUDAREAL dg_dlambda; + if (i_lam == 0) + dg_dlambda = 1; + else // i_lam==1 + dg_dlambda = lambda_ang; + CUDAREAL coef = + NH_dot_V * two_C * (dg_dlambda) / lambda_ang; + CUDAREAL value = coef * Iincrement; CUDAREAL value2 = 0; - if (compute_curvatures) { - KOKKOS_VEC3 delta_H_dbl_prime = - (UmosRxRy.dot(d2RotMats(2).dot(UBOt))) - .transpose() - .dot(q_vec); - CUDAREAL dV_dot_dV = (_NABC.dot(delta_H_prime)) - .dot(_NABC.dot(delta_H_prime)); - CUDAREAL dV2_dot_V = - (_NABC.dot(delta_H)).dot(_NABC.dot(delta_H_dbl_prime)); - value2 = - two_C * - (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * - Iincrement; - } - rot_manager_dI[2] += value; - rot_manager_dI2[2] += value2; + dI.lambda[i_lam] += value; + dI2.lambda[i_lam] += value2; } - // Checkpoint for unit cell derivatives - // KOKKOS_MAT3 Ot = eig_O.transpose(); - KOKKOS_MAT3 UmosRxRyRzU; - KOKKOS_VEC3 delta_H_prime; - for (int i_uc = 0; i_uc < 6; i_uc++) { - if (refine_Bmat(i_uc)) { - UmosRxRyRzU = UMATS_RXYZ(_mos_tic).dot(eig_U); - delta_H_prime = - (UmosRxRyRzU.dot(dB_mats(i_uc).dot(eig_Otranspose))) - .transpose() - .dot(q_vec); - CUDAREAL V_dot_dV = V.dot(_NABC.dot(delta_H_prime)); - CUDAREAL value = -two_C * V_dot_dV * Iincrement; - CUDAREAL value2 = 0; - if (compute_curvatures) { - KOKKOS_VEC3 delta_H_dbl_prime = - (UmosRxRyRzU.dot( - dB2_mats(i_uc).dot(eig_Otranspose))) - .transpose() - .dot(q_vec); - CUDAREAL dV_dot_dV = (_NABC.dot(delta_H_prime)) - .dot(_NABC.dot(delta_H_prime)); - CUDAREAL dV2_dot_V = - (_NABC.dot(delta_H)) - .dot(_NABC.dot(delta_H_dbl_prime)); - value2 = two_C * - (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - - dV_dot_dV) * - Iincrement; - } - ucell_manager_dI[i_uc] += value; - ucell_manager_dI2[i_uc] += value2; + } + // end of lambda deriv + if (printout) { + if (_subS == 0 && _subF == 0 && _thick_tic == 0 && + _source == 0 && 
_mos_tic == 0) { + if ((_fpixel == printout_fpixel && + _spixel == printout_spixel) || + printout_fpixel < 0) { + printf("%4d %4d : lambda = %g\n", _fpixel, _spixel, _lambda); + printf( + "at %g %g %g\n", _pixel_pos[0], _pixel_pos[1], + _pixel_pos[2]); + printf("Fdet= %g; Sdet= %g ; Odet= %g\n", _Fdet, _Sdet, _Odet); + printf( + "PIX0: %f %f %f\n", pix0_vectors(pid_x), + pix0_vectors(pid_y), pix0_vectors(pid_z)); + printf( + "F: %f %f %f\n", fdet_vectors(pid_x), + fdet_vectors(pid_y), fdet_vectors(pid_z)); + printf( + "S: %f %f %f\n", sdet_vectors(pid_x), + sdet_vectors(pid_y), sdet_vectors(pid_z)); + printf( + "O: %f %f %f\n", odet_vectors(pid_x), + odet_vectors(pid_y), odet_vectors(pid_z)); + printf("pid_x=%d, pid_y=%d; pid_z=%d\n", pid_x, pid_y, pid_z); + printf( + "QVECTOR: %f %f %f\n", q_vec[0], q_vec[1], q_vec[2]); + printf("omega %15.10g\n", _omega_pixel); + printf( + "Incident: %g %g %g\n", + _incident[0], _incident[1], _incident[2]); + + KOKKOS_MAT3 UU = UMATS_RXYZ(_mos_tic); + printf( + "UMAT_RXYZ :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + UU = Bmat_realspace; + printf( + "Bmat_realspace :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + UU = UBO; + printf( + "UBO :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + + UU = UBOt; + printf( + "UBOt :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + + // UU = UmosRxRyRzU; + // printf( + // "UmosRxRyRzU :\n%f %f %f\n%f %f %f\n%f %f %f\n", + // UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + // UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + // KOKKOS_VEC3 AA = delta_H_prime; + // printf( + // "delta_H_prime :\n%f %f %f\n", AA[0], AA[1], + // AA[2]); + printf("Iincrement: %f\n", Iincrement); + printf( + "hkl= %f %f %f hkl0= %d %d %d\n", _h, _k, _l, _h0, + _k0, _l0); + printf( + " F_cell=%g F_cell2=%g I_latt=%g I = %g\n", + _F_cell, _F_cell2, I0, _I); + printf("I/steps %15.10g\n", _I / Nsteps); + // printf("Ilatt diffuse %15.10g\n", I_latt_diffuse); + printf("default_F= %f\n", default_F); + if (complex_miller) + printf("COMPLEX MILLER!\n"); + if (no_Nabc_scale) + printf("No Nabc scale!\n"); } - } // end ucell deriv - - // Checkpoint for Ncells manager - if (refine_Ncells(0)) { - int num_ncell_deriv = 1; - if (!isotropic_ncells) - num_ncell_deriv = 3; - for (int i_nc = 0; i_nc < num_ncell_deriv; i_nc++) { - KOKKOS_MAT3 dN; - dN(i_nc, i_nc) = 1; - if (num_ncell_deriv == 1) { - dN(0, 0) = 1; - dN(1, 1) = 1; - dN(2, 2) = 1; - } - CUDAREAL N_i = _NABC(i_nc, i_nc); - KOKKOS_VEC3 dV_dN = dN.dot(delta_H); - // TODO speedops: precompute these, store shared var - // _NABC.inverse - CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); - CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); - CUDAREAL value = 2 * Iincrement * deriv_coef; - CUDAREAL value2 = 0; - if (compute_curvatures) { - dN(i_nc, i_nc) = 0; // TODO check maths - value2 = (-1 / N_i / N_i - C * (dV_dN.dot(dV_dN))) * 2 * - Iincrement; - value2 += deriv_coef * 2 * value; - } - Ncells_manager_dI[i_nc] += value; - Ncells_manager_dI2[i_nc] += value2; + } + } // end of printout if + + } // end of mos_tic loop + } // end of source loop + } // end of thick step loop + } // end of fpos loop + } // end of spos loop + floatimage(pixIdx) = _I; + if 
(save_wavelenimage) + wavelenimage(pixIdx) = Ilambda / _I; + + if (refine_flag) { + manager_dI(pixIdx) = dI; + manager_dI2(pixIdx) = dI2; + } + }); // end pixIdx loop + + if (Fhkl_gradient_mode) + return; + + Kokkos::parallel_for( + "deriv_image_increment", Npix_to_model, KOKKOS_LAMBDA(const int& pixIdx) { + + int _pid = panels_fasts_slows(pixIdx * 3); + int _fpixel = panels_fasts_slows(pixIdx * 3 + 1); + int _spixel = panels_fasts_slows(pixIdx * 3 + 2); + + CUDAREAL _Fdet_ave = pixel_size * _fpixel + pixel_size / 2.0; + CUDAREAL _Sdet_ave = pixel_size * _spixel + pixel_size / 2.0; + CUDAREAL _Odet_ave = 0; // Odet; + // TODO maybe make this more general for thick detectors? + + KOKKOS_VEC3 _pixel_pos_ave(0, 0, 0); + int pid_x = _pid * 3; + int pid_y = _pid * 3 + 1; + int pid_z = _pid * 3 + 2; + + CUDAREAL fx = fdet_vectors(pid_x); + CUDAREAL fy = fdet_vectors(pid_y); + CUDAREAL fz = fdet_vectors(pid_z); + + CUDAREAL sx = sdet_vectors(pid_x); + CUDAREAL sy = sdet_vectors(pid_y); + CUDAREAL sz = sdet_vectors(pid_z); + + CUDAREAL ox = odet_vectors(pid_x); + CUDAREAL oy = odet_vectors(pid_y); + CUDAREAL oz = odet_vectors(pid_z); + + CUDAREAL px = pix0_vectors(pid_x); + CUDAREAL py = pix0_vectors(pid_y); + CUDAREAL pz = pix0_vectors(pid_z); + + _pixel_pos_ave[0] = _Fdet_ave * fx + _Sdet_ave * sx + _Odet_ave * ox + px; + _pixel_pos_ave[1] = _Fdet_ave * fy + _Sdet_ave * sy + _Odet_ave * oy + py; + _pixel_pos_ave[2] = _Fdet_ave * fz + _Sdet_ave * sz + _Odet_ave * oz + pz; + + CUDAREAL close_distance = close_distances(_pid); + + CUDAREAL _airpath_ave_r = 1 / _pixel_pos_ave.length(); + KOKKOS_VEC3 _diffracted_ave = _pixel_pos_ave.get_unit_vector(); + CUDAREAL _omega_pixel_ave = pixel_size * pixel_size * _airpath_ave_r * _airpath_ave_r * + close_distance * _airpath_ave_r; + + CUDAREAL _polar = 1; + if (!nopolar) { + KOKKOS_VEC3 _incident(-source_X(0), -source_Y(0), -source_Z(0)); + _incident.normalize(); + // component of diffracted unit vector along _incident beam unit vector + CUDAREAL cos2theta = _incident.dot(_diffracted_ave); + CUDAREAL cos2theta_sqr = cos2theta * cos2theta; + CUDAREAL sin2theta_sqr = 1 - cos2theta_sqr; + + CUDAREAL cos2psi = 0; + if (kahn_factor != 0.0) { + // cross product to get "vertical" axis that is orthogonal to the cannonical + // "polarization" + KOKKOS_VEC3 B_in = polarization_axis.cross(_incident); + // cross product with _incident beam to get E-vector direction + KOKKOS_VEC3 E_in = _incident.cross(B_in); + // get components of diffracted ray projected onto the E-B plane + CUDAREAL _kEi = _diffracted_ave.dot(E_in); + CUDAREAL _kBi = _diffracted_ave.dot(B_in); + // compute the angle of the diffracted ray projected onto the incident E-B plane + // calculate cos(2 * atan2(_kBi, _kEi)) + if (_kEi!=0) { + CUDAREAL ratio = _kBi / _kEi; + cos2psi = (1 - ratio*ratio) / (1 + ratio*ratio); + } else { + cos2psi = -1; + } + } + // correction for polarized _incident beam + _polar = 0.5 * (1.0 + cos2theta_sqr - kahn_factor * cos2psi * sin2theta_sqr); + } + + CUDAREAL _om = 1; + if (!oversample_omega) + _om = _omega_pixel_ave; + // final scale term to being everything to photon number units + CUDAREAL _scale_term = _polar * _om * overall_scale; + floatimage(pixIdx) *= _scale_term; + + auto& dI = manager_dI(pixIdx); + auto& dI2 = manager_dI2(pixIdx); + + // udpate the rotation derivative images* + for (int i_rot = 0; i_rot < 3; i_rot++) { + if (refine_flag & (REFINE_UMAT1 << i_rot)) { + CUDAREAL value = _scale_term * dI.rot[i_rot]; + CUDAREAL value2 = _scale_term * dI2.rot[i_rot]; + 
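+            // derivative images are laid out parameter-major: the entry for parameter i and pixel pixIdx sits at index i * Npix_to_model + pixIdx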
int idx = i_rot * Npix_to_model + pixIdx; + d_Umat_images(idx) = value; + d2_Umat_images(idx) = value2; + } + } // end rot deriv image increment + + // update the ucell derivative images + for (int i_uc = 0; i_uc < 6; i_uc++) { + if (refine_flag & (REFINE_BMAT1 << i_uc)) { + CUDAREAL value = _scale_term * dI.ucell[i_uc]; + CUDAREAL value2 = _scale_term * dI2.ucell[i_uc]; + int idx = i_uc * Npix_to_model + pixIdx; + d_Bmat_images(idx) = value; + d2_Bmat_images(idx) = value2; + } + } // end ucell deriv image increment + + // update the Ncells derivative image + if (refine_flag & REFINE_NCELLS1) { + CUDAREAL value = _scale_term * dI.Ncells[0]; + CUDAREAL value2 = _scale_term * dI2.Ncells[0]; + int idx = pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + + if (!isotropic_ncells) { + value = _scale_term * dI.Ncells[1]; + value2 = _scale_term * dI2.Ncells[1]; + idx = Npix_to_model + pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + + value = _scale_term * dI.Ncells[2]; + value2 = _scale_term * dI2.Ncells[2]; + idx = Npix_to_model * 2 + pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + } + } // end Ncells deriv image increment + if (refine_flag & REFINE_NCELLS_DEF) { + for (int i_nc = 3; i_nc < 6; i_nc++) { + CUDAREAL value = _scale_term * dI.Ncells[i_nc]; + CUDAREAL value2 = _scale_term * dI2.Ncells[i_nc]; + int idx = i_nc * Npix_to_model + pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + } + } + + // update Fcell derivative image + if (refine_flag & REFINE_FCELL) { + CUDAREAL value = _scale_term * dI.fcell; + CUDAREAL value2 = _scale_term * dI2.fcell; + d_fcell_images(pixIdx) = value; + d2_fcell_images(pixIdx) = value2; + } // end Fcell deriv image increment + + if (refine_flag & REFINE_FP_FDP) { + // c derivative + CUDAREAL value = _scale_term * dI.fp_fdp[0]; + d_fp_fdp_images(pixIdx) = value; + // d derivative + value = _scale_term * dI.fp_fdp[1]; + d_fp_fdp_images(Npix_to_model + pixIdx) = value; + } + if (refine_flag & REFINE_DIFFUSE) { + for (int i_gam = 0; i_gam < 3; i_gam++) { + CUDAREAL val = dI.diffuse[i_gam] * _scale_term; + int img_idx = Npix_to_model * i_gam + pixIdx; + d_diffuse_gamma_images(img_idx) = val; + } + for (int i_sig = 0; i_sig < 3; i_sig++) { + CUDAREAL val = dI.diffuse[i_sig + 3] * _scale_term; + int img_idx = Npix_to_model * i_sig + pixIdx; + d_diffuse_sigma_images(img_idx) = val; + } + } + + // update eta derivative image + if (refine_flag & REFINE_ETA) { + for (int i_eta = 0; i_eta < 3; i_eta++) { + if (i_eta > 0 && !aniso_eta) + continue; + int idx = pixIdx + Npix_to_model * i_eta; + CUDAREAL value = _scale_term * dI.eta[i_eta]; + CUDAREAL value2 = _scale_term * dI2.eta[i_eta]; + d_eta_images(idx) = value; + d2_eta_images(idx) = value2; + } + } // end eta deriv image increment + + // update the lambda derivative images + for (int i_lam = 0; i_lam < 2; i_lam++) { + if (refine_flag & (REFINE_LAMBDA1 << i_lam)) { + CUDAREAL value = _scale_term * dI.lambda[i_lam]; + CUDAREAL value2 = _scale_term * dI2.lambda[i_lam]; + int idx = i_lam * Npix_to_model + pixIdx; + d_lambda_images(idx) = value; + // d2_lambda_images(idx) = value2; + } + } // end lambda deriv image increment + + for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { + if (refine_flag & (REFINE_PANEL_ROT1 << i_pan_rot)) { + CUDAREAL value = _scale_term * dI.pan_rot[i_pan_rot]; + CUDAREAL value2 = _scale_term * dI2.pan_rot[i_pan_rot]; + int idx = i_pan_rot * Npix_to_model + pixIdx; + d_panel_rot_images(idx) 
= value; + // d2_panel_rot_images(idx) = value2; + } + } // end panel rot deriv image increment + + for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { + if (refine_flag & (REFINE_PANEL_ORIGIN1 << i_pan_orig)) { + CUDAREAL value = _scale_term * dI.pan_orig[i_pan_orig]; + CUDAREAL value2 = _scale_term * dI2.pan_orig[i_pan_orig]; + int idx = i_pan_orig * Npix_to_model + pixIdx; + d_panel_orig_images(idx) = value; + // d2_panel_orig_images(idx) = value2; + } + } // end panel orig deriv image increment + }); // end pixIdx loop + +} // END of GPU kernel + +////////////////////////////////////////////////////////////////////////////////// + +template < + bool printout, + bool complex_miller, + bool compute_curvatures, + uint32_t refine_flag, + bool use_diffuse, + bool save_wavelenimage, + bool Fhkl_gradient_mode, + bool Fhkl_errors_mode, + bool using_trusted_mask, + bool Fhkl_channels_empty, + bool Fhkl_have_scale_factors> +void kokkos_sum_over_steps( + int Npix_to_model, + vector_uint_t panels_fasts_slows, + vector_cudareal_t floatimage, + vector_cudareal_t wavelenimage, + vector_cudareal_t d_Umat_images, + vector_cudareal_t d2_Umat_images, + vector_cudareal_t d_Bmat_images, + vector_cudareal_t d2_Bmat_images, + vector_cudareal_t d_Ncells_images, + vector_cudareal_t d2_Ncells_images, + vector_cudareal_t d_fcell_images, + vector_cudareal_t d2_fcell_images, + vector_cudareal_t d_eta_images, + vector_cudareal_t d2_eta_images, + vector_cudareal_t d_lambda_images, + vector_cudareal_t d2_lambda_images, + vector_cudareal_t d_panel_rot_images, + vector_cudareal_t d2_panel_rot_images, + vector_cudareal_t d_panel_orig_images, + vector_cudareal_t d2_panel_orig_images, + vector_cudareal_t d_fp_fdp_images, + vector_manager_t manager_dI, + vector_manager_t manager_dI2, + const int Nsteps, + int printout_fpixel, + int printout_spixel, + /*bool printout,*/ + CUDAREAL default_F, + int oversample, + bool oversample_omega, + CUDAREAL subpixel_size, + CUDAREAL pixel_size, + CUDAREAL detector_thickstep, + CUDAREAL detector_thick, + const vector_cudareal_t close_distances, + CUDAREAL detector_attnlen, + int detector_thicksteps, + int sources, + int phisteps, + int mosaic_domains, + bool use_lambda_coefficients, + CUDAREAL lambda0, + CUDAREAL lambda1, + KOKKOS_MAT3 eig_U, + KOKKOS_MAT3 eig_O, + KOKKOS_MAT3 eig_B, + KOKKOS_MAT3 RXYZ, + vector_vec3_t dF_vecs, + vector_vec3_t dS_vecs, + const vector_mat3_t UMATS_RXYZ, + vector_mat3_t UMATS_RXYZ_prime, + vector_mat3_t UMATS_RXYZ_dbl_prime, + vector_mat3_t RotMats, + vector_mat3_t dRotMats, + vector_mat3_t d2RotMats, + vector_mat3_t UMATS, + vector_mat3_t dB_mats, + vector_mat3_t dB2_mats, + vector_mat3_t Amatrices, + const vector_cudareal_t source_X, + const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_lambda, + const vector_cudareal_t source_I, + CUDAREAL kahn_factor, + CUDAREAL Na, + CUDAREAL Nb, + CUDAREAL Nc, + CUDAREAL Nd, + CUDAREAL Ne, + CUDAREAL Nf, + CUDAREAL phi0, + CUDAREAL phistep, + KOKKOS_VEC3 spindle_vec, + KOKKOS_VEC3 polarization_axis, + int h_range, + int k_range, + int l_range, + int h_max, + int h_min, + int k_max, + int k_min, + int l_max, + int l_min, + CUDAREAL dmin, + CUDAREAL fudge, + /*bool complex_miller,*/ + int verbose, + bool only_save_omega_kahn, + bool isotropic_ncells, + /*bool compute_curvatures,*/ + const vector_cudareal_t FhklLinear, + const vector_cudareal_t Fhkl2Linear, + /*const uint32_t refine_flag,*/ + // vector_bool_t refine_Bmat, + // vector_bool_t refine_Ncells, + // bool 
refine_Ncells_def, + // vector_bool_t refine_panel_origin, + // vector_bool_t refine_panel_rot, + // bool refine_fcell, + // vector_bool_t refine_lambda, + // bool refine_eta, + // vector_bool_t refine_Umat, + const vector_cudareal_t fdet_vectors, + const vector_cudareal_t sdet_vectors, + const vector_cudareal_t odet_vectors, + const vector_cudareal_t pix0_vectors, + bool nopolar, + bool point_pixel, + CUDAREAL fluence, + CUDAREAL r_e_sqr, + CUDAREAL spot_scale, + int Npanels, + bool aniso_eta, + bool no_Nabc_scale, + const vector_cudareal_t fpfdp, + const vector_cudareal_t fpfdp_derivs, + const vector_cudareal_t atom_data, + int num_atoms, + // bool refine_fp_fdp, + const vector_int_t nominal_hkl, + bool use_nominal_hkl, + KOKKOS_MAT3 anisoU, + KOKKOS_MAT3 anisoG, + KOKKOS_MAT3 rotate_principal_axes, + /*bool use_diffuse,*/ + vector_cudareal_t d_diffuse_gamma_images, + vector_cudareal_t d_diffuse_sigma_images, + // bool refine_diffuse, + bool gamma_miller_units, + // bool refine_Icell, + /*bool save_wavelenimage,*/ + int laue_group_num, + int stencil_size, + /*bool Fhkl_gradient_mode,*/ + /*bool Fhkl_errors_mode,*/ + /*bool using_trusted_mask,*/ + /*bool Fhkl_channels_empty,*/ + /*bool Fhkl_have_scale_factors,*/ + int Num_ASU, + const vector_cudareal_t data_residual, + const vector_cudareal_t data_variance, + const vector_int_t data_freq, + const vector_bool_t data_trusted, + const vector_int_t FhklLinear_ASUid, + const vector_int_t Fhkl_channels, + const vector_cudareal_t Fhkl_scale, + vector_cudareal_t Fhkl_scale_deriv) { // BEGIN GPU kernel + + const KOKKOS_MAT3 Bmat_realspace = eig_B * 1e10; + const KOKKOS_MAT3 eig_Otranspose = eig_O.transpose(); + const KOKKOS_MAT3 Amat_init = eig_U * Bmat_realspace * eig_Otranspose; + const KOKKOS_MAT3 Ainv = eig_U*(Bmat_realspace.transpose().inverse())* (eig_O.inverse()); + const KOKKOS_MAT3 _NABC {Na, Nd, Nf, Nd, Nb, Ne, Nf, Ne, Nc}; + const double NABC_det = _NABC.determinant(); // TODO is this slow ? 
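+    // NABC_det_sq scales the Gaussian lattice factor I0 = NABC_det_sq * exp(-C * |NABC*deltaH|^2) used below (prefactor dropped when no_Nabc_scale); C = 2/0.63 * fudge sets the profile width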
+ const double NABC_det_sq = NABC_det * NABC_det; + const CUDAREAL C = 2 / 0.63 * fudge; + const CUDAREAL two_C = 2 * C; + KOKKOS_MAT3 anisoG_local; + CUDAREAL anisoG_determ = 0; + KOKKOS_MAT3 anisoU_local; + vector_mat3_t laue_mats = vector_mat3_t("laue_mats", 24); + vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam", 3); + vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace", 3); + int num_laue_mats = 0; + int dhh = 0, dkk = 0, dll = 0; + + Kokkos::View UMATS_prime("UMATS_prime", mosaic_domains); + Kokkos::View UMATS_dbl_prime("UMATS_dbl_prime", mosaic_domains); + Kokkos::View BMATS_prime("BMATS_prime", mosaic_domains); + Kokkos::View BMATS_dbl_prime("BMATS_dbl_prime", mosaic_domains); + + Kokkos::parallel_for("prepare_UMATS", mosaic_domains, KOKKOS_LAMBDA(const int& _mos_tic) { + const KOKKOS_MAT3 UBOt = Amat_init; + UMATS_prime(_mos_tic, 0) = _NABC * (UMATS(_mos_tic) * dRotMats(0) * RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_prime(_mos_tic, 1) = _NABC * (UMATS(_mos_tic) * RotMats(0) * dRotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_prime(_mos_tic, 2) = _NABC * (UMATS(_mos_tic) * RotMats(0) * RotMats(1) * dRotMats(2) * UBOt).transpose(); + + UMATS_dbl_prime(_mos_tic, 0) = _NABC * (UMATS(_mos_tic) * d2RotMats(0) * RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_dbl_prime(_mos_tic, 1) = _NABC * (UMATS(_mos_tic) * RotMats(0) * d2RotMats(1) * RotMats(2) * UBOt).transpose(); + UMATS_dbl_prime(_mos_tic, 2) = _NABC * (UMATS(_mos_tic) * RotMats(0) * RotMats(1) * d2RotMats(2) * UBOt).transpose(); + + for (int i_uc=0; i_uc<6; i_uc++) { + BMATS_prime(_mos_tic, i_uc) = _NABC * (UMATS_RXYZ(_mos_tic) * eig_U * dB_mats(i_uc) * eig_O.transpose()).transpose(); + BMATS_dbl_prime(_mos_tic, i_uc) = _NABC * (UMATS_RXYZ(_mos_tic) * eig_U * dB2_mats(i_uc) * eig_O.transpose()).transpose(); + } + }); + + if (use_diffuse){ + anisoG_local = anisoG; + anisoU_local = anisoU; + + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + + if (gamma_miller_units){ + anisoG_local = anisoG_local * Bmat_realspace; + } + Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){ + num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); + // KOKKOS_MAT3 rotate_principal_axes; + // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.; + + for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){ + laue_mats(iL) = Ainv * laue_mats(iL) * rotate_principal_axes; + } + // printf("Bmat ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", Bmat_realspace[i]); + // } + // printf("\n"); + const KOKKOS_MAT3 Ginv = anisoG_local.inverse(); + // printf("Ginv ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", Ginv[i]); + // } + // printf("\n"); + const KOKKOS_MAT3 dG = Bmat_realspace * Ginv; + // printf("dG ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", dG[i]); + // } + // printf("\n"); + for (int i_gam=0; i_gam<3; i_gam++){ + if (gamma_miller_units) { + dG_dgam(i_gam) = KOKKOS_VEC3(Bmat_realspace(i_gam, 0), Bmat_realspace(i_gam, 1), Bmat_realspace(i_gam, 2)); + } else { + dG_dgam(i_gam)[i_gam] = 1; + } + KOKKOS_MAT3 temp_dgam; + temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0]; + temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1]; + temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2]; + dG_trace(i_gam) = (Ginv*temp_dgam).trace(); + // printf("TRACE %g\n", dG_trace(i_gam)); + // printf("dgam ="); + // for (int i=0; i<9; ++i) { + // printf(" %g", temp_dgam[i]); + // } + 
// printf("\n"); + + // dG(i_gam, i_gam); + } + }, num_laue_mats); + anisoG_determ = anisoG_local.determinant(); + dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc + } + const KOKKOS_VEC3 dHH (dhh, dkk, dll); + + const CUDAREAL overall_scale = r_e_sqr * spot_scale * fluence / Nsteps; + + const CUDAREAL detector_attnlen_r = (detector_attnlen>0) ? 1 / detector_attnlen : 0; + + Kokkos::parallel_for( + "sum_over_steps", Npix_to_model, KOKKOS_LAMBDA(const int& pixIdx) { + + if (using_trusted_mask) { + if (!data_trusted(pixIdx)) + return; + } + const int _pid = panels_fasts_slows(pixIdx * 3); + const int _fpixel = panels_fasts_slows(pixIdx * 3 + 1); + const int _spixel = panels_fasts_slows(pixIdx * 3 + 2); + + CUDAREAL Fhkl_deriv_coef=0; + CUDAREAL Fhkl_hessian_coef=0; + if (Fhkl_gradient_mode) { + CUDAREAL u = data_residual(pixIdx); + CUDAREAL one_by_v = 1/data_variance(pixIdx); + CUDAREAL Gterm = 1 - 2*u - u*u*one_by_v; + Fhkl_deriv_coef = 0.5 * Gterm*one_by_v / data_freq(pixIdx); + if (Fhkl_errors_mode) { + Fhkl_hessian_coef = -0.5*one_by_v*(one_by_v*Gterm - 2 - 2*u*one_by_v -u*u*one_by_v*one_by_v)/data_freq(pixIdx); + } + } + + // int fcell_idx=1; + int nom_h = 0, nom_k = 0, nom_l = 0; + if (use_nominal_hkl) { + nom_h = nominal_hkl(pixIdx * 3); + nom_k = nominal_hkl(pixIdx * 3 + 1); + nom_l = nominal_hkl(pixIdx * 3 + 2); + } + CUDAREAL close_distance = close_distances(_pid); + + // reset photon count for this pixel + double _I = 0; + double Ilambda = 0; + + kokkos_manager dI, dI2; + dI.reset(); + dI2.reset(); + + for (int _subS = 0; _subS < oversample; ++_subS) { + for (int _subF = 0; _subF < oversample; ++_subF) { + // absolute mm position on detector (relative to its origin) + CUDAREAL _Fdet = + subpixel_size * (_fpixel * oversample + _subF) + subpixel_size / 2.0; + CUDAREAL _Sdet = + subpixel_size * (_spixel * oversample + _subS) + subpixel_size / 2.0; + + // assume "distance" is to the front of the detector sensor layer + int pid_x = _pid * 3; + int pid_y = _pid * 3 + 1; + int pid_z = _pid * 3 + 2; + + + CUDAREAL fx = fdet_vectors(pid_x); + CUDAREAL fy = fdet_vectors(pid_y); + CUDAREAL fz = fdet_vectors(pid_z); + CUDAREAL sx = sdet_vectors(pid_x); + CUDAREAL sy = sdet_vectors(pid_y); + CUDAREAL sz = sdet_vectors(pid_z); + CUDAREAL ox = odet_vectors(pid_x); + CUDAREAL oy = odet_vectors(pid_y); + CUDAREAL oz = odet_vectors(pid_z); + CUDAREAL px = pix0_vectors(pid_x); + CUDAREAL py = pix0_vectors(pid_y); + CUDAREAL pz = pix0_vectors(pid_z); + KOKKOS_VEC3 _o_vec(ox, oy, oz); + + for (int _thick_tic = 0; _thick_tic < detector_thicksteps; ++_thick_tic) { + + CUDAREAL _Odet = _thick_tic * detector_thickstep; + + CUDAREAL pixposX = _Fdet * fx + _Sdet * sx + _Odet * ox + px; + CUDAREAL pixposY = _Fdet * fy + _Sdet * sy + _Odet * oy + py; + CUDAREAL pixposZ = _Fdet * fz + _Sdet * sz + _Odet * oz + pz; + KOKKOS_VEC3 _pixel_pos(pixposX, pixposY, pixposZ); + + CUDAREAL _airpath_r = 1 / _pixel_pos.length(); + KOKKOS_VEC3 _diffracted = _pixel_pos.get_unit_vector(); + + const CUDAREAL close_distance = close_distances(_pid); + + // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) + CUDAREAL _omega_pixel = pixel_size * pixel_size * _airpath_r * _airpath_r * + close_distance * _airpath_r; + + // option to turn off obliquity effect, inverse-square-law only + if (point_pixel) + _omega_pixel = _airpath_r * _airpath_r; + + // now calculate detector thickness effects + CUDAREAL _capture_fraction = 1; + + CUDAREAL previous_layer = 1.0; + if (detector_thick > 0.0 && 
detector_attnlen_r > 0.0) { + // inverse of effective thickness increase + KOKKOS_VEC3 _o_vec(ox, oy, oz); + CUDAREAL _parallax = _diffracted.dot(_o_vec); + CUDAREAL current_layer = ::Kokkos::exp( + -(_thick_tic + 1) * detector_thickstep * + detector_attnlen_r / _parallax); + _capture_fraction = previous_layer - current_layer; + previous_layer = current_layer; + } + + for (int _source = 0; _source < sources; ++_source) { + + KOKKOS_VEC3 _incident( + -source_X(_source), -source_Y(_source), -source_Z(_source)); + CUDAREAL _lambda = source_lambda(_source); + CUDAREAL sI = source_I(_source); + CUDAREAL lambda_ang = _lambda * 1e10; + if (use_lambda_coefficients) { + lambda_ang = lambda0 + lambda1 * lambda_ang; + _lambda = lambda_ang * 1e-10; + } + + // polarization + CUDAREAL polar_for_Fhkl_grad=1; + if (!nopolar && Fhkl_gradient_mode){ + + // component of diffracted unit vector along incident beam unit vector + CUDAREAL cos2theta = _incident.dot(_diffracted); + CUDAREAL cos2theta_sqr = cos2theta*cos2theta; + CUDAREAL sin2theta_sqr = 1-cos2theta_sqr; + + CUDAREAL cos2psi=1; + if(kahn_factor != 0.0){ + // cross product to get "vertical" axis that is orthogonal to the cannonical "polarization" + KOKKOS_VEC3 B_in = polarization_axis.cross(_incident); + // cross product with incident beam to get E-vector direction + KOKKOS_VEC3 E_in = _incident.cross(B_in); + // get components of diffracted ray projected onto the E-B plane + CUDAREAL _kEi = _diffracted.dot(E_in); + CUDAREAL _kBi = _diffracted.dot(B_in); + // compute the angle of the diffracted ray projected onto the incident E-B plane + // calculate cos(2 * atan2(_kBi, _kEi)) + if (_kEi!=0) { + CUDAREAL ratio = _kBi / _kEi; + cos2psi = (1 - ratio*ratio) / (1 + ratio*ratio); + } else { + cos2psi = -1; + } + } + // correction for polarized incident beam + polar_for_Fhkl_grad = 0.5*(1.0 + cos2theta_sqr - kahn_factor*cos2psi*sin2theta_sqr); + } + KOKKOS_VEC3 _scattering = (_diffracted - _incident) / _lambda; + + KOKKOS_VEC3 q_vec = _scattering * 1e-10; + + // TODO rename + CUDAREAL texture_scale = _capture_fraction * _omega_pixel * sI; + + for (int _mos_tic = 0; _mos_tic < mosaic_domains; ++_mos_tic) { + const KOKKOS_MAT3 UBO = Amatrices(_mos_tic); + + KOKKOS_VEC3 H_vec = UBO * q_vec; + CUDAREAL _h = H_vec[0]; + CUDAREAL _k = H_vec[1]; + CUDAREAL _l = H_vec[2]; + + int _h0 = ceil(_h - 0.5); + int _k0 = ceil(_k - 0.5); + int _l0 = ceil(_l - 0.5); + + KOKKOS_VEC3 H0(_h0, _k0, _l0); + + KOKKOS_VEC3 delta_H = H_vec - H0; + KOKKOS_VEC3 V = _NABC * delta_H; + CUDAREAL _hrad_sqr = V.length_sqr(); + CUDAREAL exparg = _hrad_sqr * C / 2; + CUDAREAL I0 = 0; + + if (exparg < 35) + if (no_Nabc_scale) + I0 = ::Kokkos::exp(-2 * exparg); + else + I0 = (NABC_det_sq) * + ::Kokkos::exp(-2 * exparg); + + // are we doing diffuse scattering + CUDAREAL step_diffuse_param[6] = {0, 0, 0, 0, 0, 0}; + if (use_diffuse) { + calc_diffuse_at_hkl(H_vec,H0,dHH,h_min,k_min,l_min,h_max,k_max,l_max,h_range,k_range,l_range,Ainv,FhklLinear,num_laue_mats,laue_mats,anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,(refine_flag & REFINE_DIFFUSE)>0,&I0,step_diffuse_param); + } // end s_use_diffuse outer + + CUDAREAL _F_cell = default_F; + CUDAREAL _F_cell2 = 0; + int i_hklasu=0; + + if ((_h0 <= h_max) && (_h0 >= h_min) && + (_k0 <= k_max) && (_k0 >= k_min) && + (_l0 <= l_max) && (_l0 >= l_min)) { + int Fhkl_linear_index = (_h0 - h_min) * k_range * l_range + + (_k0 - k_min) * l_range + (_l0 - l_min); + //_F_cell = __ldg(&FhklLinear[Fhkl_linear_index]); + _F_cell = 
FhklLinear(Fhkl_linear_index); + // if (complex_miller) _F_cell2 = + // __ldg(&Fhkl2Linear[Fhkl_linear_index]); + if (complex_miller) + _F_cell2 = Fhkl2Linear(Fhkl_linear_index); + if (Fhkl_have_scale_factors) + i_hklasu = FhklLinear_ASUid(Fhkl_linear_index); + } + + CUDAREAL c_deriv_Fcell = 0; + CUDAREAL d_deriv_Fcell = 0; + if (complex_miller) { + CUDAREAL c_deriv_Fcell_real = 0; + CUDAREAL c_deriv_Fcell_imag = 0; + CUDAREAL d_deriv_Fcell_real = 0; + CUDAREAL d_deriv_Fcell_imag = 0; + if (num_atoms > 0) { + CUDAREAL S_2 = (q_vec[0] * q_vec[0] + + q_vec[1] * q_vec[1] + + q_vec[2] * q_vec[2]); + + // fp is always followed by the fdp value + CUDAREAL val_fp = fpfdp(2 * _source); + CUDAREAL val_fdp = fpfdp(2 * _source + 1); + + CUDAREAL c_deriv_prime = 0; + CUDAREAL c_deriv_dblprime = 0; + CUDAREAL d_deriv_prime = 0; + CUDAREAL d_deriv_dblprime = 0; + if (refine_flag & REFINE_FP_FDP) { + // currently only supports two parameter model + int d_idx = 2 * _source; + c_deriv_prime = fpfdp_derivs(d_idx); + c_deriv_dblprime = fpfdp_derivs(d_idx + 1); + d_deriv_prime = fpfdp_derivs(d_idx + 2 * sources); + d_deriv_dblprime = + fpfdp_derivs(d_idx + 1 + 2 * sources); } - } // end Ncells manager deriv - - if (refine_Ncells_def) { - for (int i_nc = 3; i_nc < 6; i_nc++) { - KOKKOS_MAT3 dN; - if (i_nc == 3) - dN = KOKKOS_MAT3{0, 1, 0, 1, 0, 0, 0, 0, 0}; - else if (i_nc == 4) - dN = KOKKOS_MAT3{0, 0, 0, 0, 0, 1, 0, 1, 0}; - else - dN = KOKKOS_MAT3{0, 0, 1, 0, 0, 0, 1, 0, 0}; - KOKKOS_VEC3 dV_dN = dN.dot(delta_H); - // TODO speedops: precompute these - CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); - CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); - CUDAREAL value = 2 * Iincrement * deriv_coef; - Ncells_manager_dI[i_nc] += value; - CUDAREAL value2 = 0; - if (compute_curvatures) { - value2 = deriv_coef * value; - value2 += -2 * C * Iincrement * (dV_dN.dot(dV_dN)); - Ncells_manager_dI2[i_nc] += value2; + + for (int i_atom = 0; i_atom < num_atoms; i_atom++) { + // fractional atomic coordinates + CUDAREAL atom_x = atom_data(i_atom * 5); + CUDAREAL atom_y = atom_data(i_atom * 5 + 1); + CUDAREAL atom_z = atom_data(i_atom * 5 + 2); + CUDAREAL B = atom_data(i_atom * 5 + 3); // B factor + B = ::Kokkos::exp( + -B * S_2 / 4.0); // TODO: speed me up? + CUDAREAL occ = atom_data(i_atom * 5 + 4); // occupancy + CUDAREAL r_dot_h = + _h0 * atom_x + _k0 * atom_y + _l0 * atom_z; + CUDAREAL phase = 2 * M_PI * r_dot_h; + CUDAREAL s_rdoth = ::Kokkos::sin(phase); + CUDAREAL c_rdoth = ::Kokkos::cos(phase); + CUDAREAL Bocc = B * occ; + CUDAREAL BC = B * c_rdoth; + CUDAREAL BS = B * s_rdoth; + CUDAREAL real_part = BC * val_fp - BS * val_fdp; + CUDAREAL imag_part = BS * val_fp + BC * val_fdp; + _F_cell += real_part; + _F_cell2 += imag_part; + if (refine_flag & REFINE_FP_FDP) { + c_deriv_Fcell_real += + BC * c_deriv_prime - BS * c_deriv_dblprime; + c_deriv_Fcell_imag += + BS * c_deriv_prime + BC * c_deriv_dblprime; + + d_deriv_Fcell_real += + BC * d_deriv_prime - BS * d_deriv_dblprime; + d_deriv_Fcell_imag += + BS * d_deriv_prime + BC * d_deriv_dblprime; } } } + CUDAREAL Freal = _F_cell; + CUDAREAL Fimag = _F_cell2; + _F_cell = + ::Kokkos::sqrt(Freal * Freal + Fimag * Fimag); + if (refine_flag & REFINE_FP_FDP) { + c_deriv_Fcell = + Freal * c_deriv_Fcell_real + Fimag * c_deriv_Fcell_imag; + d_deriv_Fcell = + Freal * d_deriv_Fcell_real + Fimag * d_deriv_Fcell_imag; + } + } + if (!oversample_omega && ! 
Fhkl_gradient_mode) + _omega_pixel = 1; + + CUDAREAL _I_cell = _F_cell; + if (!(refine_flag & REFINE_ICELL)) + _I_cell *= _F_cell; + CUDAREAL hkl=1; + int Fhkl_channel=0; + if (! Fhkl_channels_empty) + Fhkl_channel = Fhkl_channels(_source); + if (Fhkl_have_scale_factors) + hkl = Fhkl_scale(i_hklasu + Fhkl_channel*Num_ASU); + if (Fhkl_gradient_mode){ + CUDAREAL Fhkl_deriv_scale = overall_scale*polar_for_Fhkl_grad; + CUDAREAL I_noFcell=texture_scale*I0; + CUDAREAL dfhkl = I_noFcell*_I_cell * Fhkl_deriv_scale; + CUDAREAL grad_incr = dfhkl*Fhkl_deriv_coef; + int fhkl_grad_idx=i_hklasu + Fhkl_channel*Num_ASU; + + if (Fhkl_errors_mode){ + // here we hi-kack the Fhkl_scale_deriv array, if computing errors, in order to store the hessian terms + // if we are getting the hessian terms, we no longer need the gradients (e.g. by this point we are done refininig) + CUDAREAL hessian_incr = Fhkl_hessian_coef*dfhkl*dfhkl; + ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), hessian_incr); + } + else{ + ::Kokkos::atomic_add(&Fhkl_scale_deriv(fhkl_grad_idx), grad_incr); + } + continue; + } - // Checkpoint for Origin manager - for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { - if (refine_panel_origin(i_pan_orig)) { - CUDAREAL per_k = 1 / _airpath; - CUDAREAL per_k3 = pow(per_k, 3.); - CUDAREAL per_k5 = pow(per_k, 5.); - CUDAREAL lambda_ang = _lambda * 1e10; - - KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; - KOKKOS_VEC3 dk; - if (i_pan_orig == 0) - dk = KOKKOS_VEC3{0, 0, 1}; - else if (i_pan_orig == 1) - dk = KOKKOS_VEC3{1, 0, 0}; - else - dk = KOKKOS_VEC3{0, 1, 0}; - - CUDAREAL G = dk.dot(_pixel_pos); - CUDAREAL pix2 = subpixel_size * subpixel_size; - KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; - CUDAREAL coef = (M.dot(dk_hat)).dot(V); - CUDAREAL coef2 = - -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); - coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); - CUDAREAL value = - coef * Iincrement + coef2 * Iincrement / _omega_pixel; - - pan_orig_manager_dI[i_pan_orig] += value; - pan_orig_manager_dI2[i_pan_orig] += 0; - - } // end origin manager deriv + CUDAREAL _I_total = hkl*_I_cell *I0; + CUDAREAL Iincrement = _I_total * texture_scale; + _I += Iincrement; + if (save_wavelenimage) + Ilambda += Iincrement * lambda_ang; + + if (refine_flag & REFINE_DIFFUSE) { + CUDAREAL step_scale = texture_scale * _F_cell * _F_cell; + for (int i_diff = 0; i_diff < 6; i_diff++) { + dI.diffuse[i_diff] += + step_scale * step_diffuse_param[i_diff]; } + } + + //************************************************* + // START REFINEMENT - for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { - if (refine_panel_rot(i_pan_rot)) { - CUDAREAL per_k = 1 / _airpath; - CUDAREAL per_k3 = pow(per_k, 3.); - CUDAREAL per_k5 = pow(per_k, 5.); - CUDAREAL lambda_ang = _lambda * 1e10; - KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; - KOKKOS_VEC3 dk = _Fdet * (dF_vecs(_pid * 3 + i_pan_rot)) + - _Sdet * (dS_vecs(_pid * 3 + i_pan_rot)); - CUDAREAL G = dk.dot(_pixel_pos); - CUDAREAL pix2 = subpixel_size * subpixel_size; - KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; - CUDAREAL coef = (M.dot(dk_hat)).dot(V); - CUDAREAL coef2 = - -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); - coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); - CUDAREAL value = - coef * Iincrement + coef2 * Iincrement / _omega_pixel; - - pan_rot_manager_dI[i_pan_rot] += value; - pan_rot_manager_dI2[i_pan_rot] += 0; + if (refine_flag & REFINE_FP_FDP) { + CUDAREAL I_noFcell = texture_scale * I0; + dI.fp_fdp[0] += 2 * I_noFcell * 
(c_deriv_Fcell); + dI.fp_fdp[1] += 2 * I_noFcell * (d_deriv_Fcell); + } + + if (verbose > 3) + printf( + "hkl= %f %f %f hkl1= %d %d %d Fcell=%f\n", _h, _k, _l, + _h0, _k0, _l0, _F_cell); + + KOKKOS_MAT3 UBOt; + if (refine_flag & (REFINE_UMAT | REFINE_ETA)) { + UBOt = Amat_init; + } + if (refine_flag & REFINE_UMAT1) { + const KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 0) * q_vec; + const CUDAREAL V_dot_dV = V.dot(dV); + const CUDAREAL value = -two_C * V_dot_dV * Iincrement; + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + const CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 0)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; + } + dI.rot[0] += value; + dI2.rot[0] += value2; + } + if (refine_flag & REFINE_UMAT2) { + KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 1) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 1)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; + } + dI.rot[1] += value; + dI2.rot[1] += value2; + } + if (refine_flag & REFINE_UMAT3) { + KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 2) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(UMATS_dbl_prime(_mos_tic, 2)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; + } + dI.rot[2] += value; + dI2.rot[2] += value2; + } + // Checkpoint for unit cell derivatives + for (int i_uc = 0; i_uc < 6; i_uc++) { + if (refine_flag & (REFINE_BMAT1 << i_uc)) { + KOKKOS_VEC3 dV = BMATS_prime(_mos_tic, i_uc) * q_vec; + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL value = -two_C * V_dot_dV * Iincrement; + CUDAREAL value2 = 0; + if (compute_curvatures) { + const CUDAREAL dV_dot_dV = dV.length_sqr(); + CUDAREAL dV2_dot_V = V.dot(BMATS_dbl_prime(_mos_tic, i_uc)*q_vec); + value2 = two_C * (two_C * V_dot_dV * V_dot_dV - dV2_dot_V - dV_dot_dV) * Iincrement; } + dI.ucell[i_uc] += value; + dI2.ucell[i_uc] += value2; } - - // checkpoint for Fcell manager - if (refine_fcell) { - CUDAREAL value; - if (refine_Icell) - value = I0 * texture_scale; - else - value = 2 * I0 * _F_cell * - texture_scale; // Iincrement/_F_cell ; + } // end ucell deriv + + // Checkpoint for Ncells manager + if (refine_flag & REFINE_NCELLS1) { + int num_ncell_deriv = 1; + if (!isotropic_ncells) + num_ncell_deriv = 3; + for (int i_nc = 0; i_nc < num_ncell_deriv; i_nc++) { + KOKKOS_MAT3 dN; + dN(i_nc, i_nc) = 1; + if (num_ncell_deriv == 1) { + dN(0, 0) = 1; + dN(1, 1) = 1; + dN(2, 2) = 1; + } + CUDAREAL N_i = _NABC(i_nc, i_nc); + KOKKOS_VEC3 dV_dN = dN.dot(delta_H); + // TODO speedops: precompute these, store shared var + // _NABC.inverse + CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); + CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); + CUDAREAL value = 2 * Iincrement * deriv_coef; CUDAREAL value2 = 0; if (compute_curvatures) { - // NOTE if _Fcell >0 - value2 = 2 * I0 * texture_scale; + value2 = (-1 / N_i / N_i - C * (dV_dN.dot(dV_dN))) * 2 * + Iincrement; + value2 += deriv_coef * 2 * value; } - // if (fcell_idx >=0 && fcell_idx <=2){ - if (use_nominal_hkl) { - if (_h0 == nom_h && _k0 == nom_k && _l0 == nom_l) { - fcell_manager_dI += 
value; - fcell_manager_dI2 += value2; - } - } else { - fcell_manager_dI += value; - fcell_manager_dI2 += value2; + dI.Ncells[i_nc] += value; + dI2.Ncells[i_nc] += value2; + } + } // end Ncells manager deriv + + if (refine_flag & REFINE_NCELLS_DEF) { + for (int i_nc = 3; i_nc < 6; i_nc++) { + KOKKOS_MAT3 dN; + if (i_nc == 3) + dN = KOKKOS_MAT3{0, 1, 0, 1, 0, 0, 0, 0, 0}; + else if (i_nc == 4) + dN = KOKKOS_MAT3{0, 0, 0, 0, 0, 1, 0, 1, 0}; + else + dN = KOKKOS_MAT3{0, 0, 1, 0, 0, 0, 1, 0, 0}; + KOKKOS_VEC3 dV_dN = dN.dot(delta_H); + // TODO speedops: precompute these + CUDAREAL determ_deriv = (_NABC.inverse().dot(dN)).trace(); + CUDAREAL deriv_coef = determ_deriv - C * (dV_dN.dot(V)); + CUDAREAL value = 2 * Iincrement * deriv_coef; + dI.Ncells[i_nc] += value; + CUDAREAL value2 = 0; + if (compute_curvatures) { + value2 = deriv_coef * value; + value2 += -2 * C * Iincrement * (dV_dN.dot(dV_dN)); + dI2.Ncells[i_nc] += value2; } - } // end of fcell man deriv - - // checkpoint for eta manager - if (refine_eta) { - for (int i_eta = 0; i_eta < 3; i_eta++) { - if (i_eta > 0 && !aniso_eta) - continue; - int mtic2 = _mos_tic + i_eta * mosaic_domains; - KOKKOS_VEC3 DeltaH_deriv = (UMATS_RXYZ_prime(mtic2).dot(UBOt)) - .transpose() - .dot(q_vec); - // vector V is _Nabc*Delta_H - KOKKOS_VEC3 dV = _NABC.dot(DeltaH_deriv); - CUDAREAL V_dot_dV = V.dot(dV); - CUDAREAL Iprime = -two_C * (V_dot_dV)*Iincrement; - eta_manager_dI[i_eta] += Iprime; - CUDAREAL Idbl_prime = 0; - if (compute_curvatures) { - KOKKOS_VEC3 DeltaH_second_deriv = - (UMATS_RXYZ_dbl_prime(mtic2).dot(UBOt)) - .transpose() - .dot(q_vec); - KOKKOS_VEC3 dV2 = _NABC.dot(DeltaH_second_deriv); - Idbl_prime = - -two_C * (dV.dot(dV) + V.dot(dV2)) * Iincrement; - Idbl_prime += -two_C * (V_dot_dV)*Iprime; - } - eta_manager_dI2[i_eta] += Idbl_prime; + } + } + + // Checkpoint for Origin manager + for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { + if (refine_flag & (REFINE_PANEL_ORIGIN1 << i_pan_orig)) { + CUDAREAL per_k = _airpath_r; + CUDAREAL per_k3 = pow(per_k, 3.); + CUDAREAL per_k5 = pow(per_k, 5.); + + KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; + KOKKOS_VEC3 dk; + if (i_pan_orig == 0) + dk = KOKKOS_VEC3{0, 0, 1}; + else if (i_pan_orig == 1) + dk = KOKKOS_VEC3{1, 0, 0}; + else + dk = KOKKOS_VEC3{0, 1, 0}; + + CUDAREAL G = dk.dot(_pixel_pos); + CUDAREAL pix2 = subpixel_size * subpixel_size; + KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; + CUDAREAL coef = (M.dot(dk_hat)).dot(V); + CUDAREAL coef2 = + -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); + coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); + CUDAREAL value = + coef * Iincrement + coef2 * Iincrement / _omega_pixel; + + dI.pan_orig[i_pan_orig] += value; + dI2.pan_orig[i_pan_orig] += 0; + + } // end origin manager deriv + } + + for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { + if (refine_flag & (REFINE_PANEL_ROT1 << i_pan_rot)) { + CUDAREAL per_k = _airpath_r; + CUDAREAL per_k3 = pow(per_k, 3.); + CUDAREAL per_k5 = pow(per_k, 5.); + KOKKOS_MAT3 M = -two_C * (_NABC.dot(UBO)) / lambda_ang; + KOKKOS_VEC3 dk = _Fdet * (dF_vecs(_pid * 3 + i_pan_rot)) + + _Sdet * (dS_vecs(_pid * 3 + i_pan_rot)); + CUDAREAL G = dk.dot(_pixel_pos); + CUDAREAL pix2 = subpixel_size * subpixel_size; + KOKKOS_VEC3 dk_hat = -per_k3 * G * _pixel_pos + per_k * dk; + CUDAREAL coef = (M.dot(dk_hat)).dot(V); + CUDAREAL coef2 = + -3 * pix2 * per_k5 * G * (_o_vec.dot(_pixel_pos)); + coef2 += pix2 * per_k3 * (_o_vec.dot(dk)); + CUDAREAL value = + coef * Iincrement + coef2 * Iincrement / 
_omega_pixel; + + dI.pan_rot[i_pan_rot] += value; + dI2.pan_rot[i_pan_rot] += 0; + } + } + + // checkpoint for Fcell manager + if (refine_flag & REFINE_FCELL) { + CUDAREAL value; + if (refine_flag & REFINE_ICELL) + value = I0 * texture_scale; + else + value = 2 * I0 * _F_cell * + texture_scale; // Iincrement/_F_cell ; + CUDAREAL value2 = 0; + if (compute_curvatures) { + // NOTE if _Fcell >0 + value2 = 2 * I0 * texture_scale; + } + // if (fcell_idx >=0 && fcell_idx <=2){ + if (use_nominal_hkl) { + if (_h0 == nom_h && _k0 == nom_k && _l0 == nom_l) { + dI.fcell += value; + dI2.fcell += value2; } - } // end of eta man deriv - - // checkpoint for lambda manager - for (int i_lam = 0; i_lam < 2; i_lam++) { - if (refine_lambda(i_lam)) { - CUDAREAL lambda_ang = _lambda * 1e10; - CUDAREAL NH_dot_V = (_NABC.dot(H_vec)).dot(V); - CUDAREAL dg_dlambda; - if (i_lam == 0) - dg_dlambda = 1; - else // i_lam==1 - dg_dlambda = lambda_ang; - CUDAREAL coef = - NH_dot_V * two_C * (dg_dlambda) / lambda_ang; - CUDAREAL value = coef * Iincrement; - CUDAREAL value2 = 0; - lambda_manager_dI[i_lam] += value; - lambda_manager_dI2[i_lam] += value2; + } else { + dI.fcell += value; + dI2.fcell += value2; + } + } // end of fcell man deriv + + // checkpoint for eta manager + if (refine_flag & REFINE_ETA) { + for (int i_eta = 0; i_eta < 3; i_eta++) { + if (i_eta > 0 && !aniso_eta) + continue; + int mtic2 = _mos_tic + i_eta * mosaic_domains; + KOKKOS_VEC3 DeltaH_deriv = (UMATS_RXYZ_prime(mtic2).dot(UBOt)) + .transpose() + .dot(q_vec); + // vector V is _Nabc*Delta_H + KOKKOS_VEC3 dV = _NABC.dot(DeltaH_deriv); + CUDAREAL V_dot_dV = V.dot(dV); + CUDAREAL Iprime = -two_C * (V_dot_dV)*Iincrement; + dI.eta[i_eta] += Iprime; + CUDAREAL Idbl_prime = 0; + if (compute_curvatures) { + KOKKOS_VEC3 DeltaH_second_deriv = + (UMATS_RXYZ_dbl_prime(mtic2).dot(UBOt)) + .transpose() + .dot(q_vec); + KOKKOS_VEC3 dV2 = _NABC.dot(DeltaH_second_deriv); + Idbl_prime = + -two_C * (dV.dot(dV) + V.dot(dV2)) * Iincrement; + Idbl_prime += -two_C * (V_dot_dV)*Iprime; } + dI2.eta[i_eta] += Idbl_prime; } - // end of lambda deriv - if (printout) { - if (_subS == 0 && _subF == 0 && _thick_tic == 0 && - _source == 0 && _mos_tic == 0) { - if ((_fpixel == printout_fpixel && - _spixel == printout_spixel) || - printout_fpixel < 0) { - printf( - "%4d %4d : lambda = %g\n", _fpixel, _spixel, - _lambda); - printf( - "at %g %g %g\n", _pixel_pos[0], _pixel_pos[1], - _pixel_pos[2]); - printf( - "Fdet= %g; Sdet= %g ; Odet= %g\n", _Fdet, _Sdet, - _Odet); - printf( - "PIX0: %f %f %f\n", pix0_vectors(pid_x), - pix0_vectors(pid_y), pix0_vectors(pid_z)); - printf( - "F: %f %f %f\n", fdet_vectors(pid_x), - fdet_vectors(pid_y), fdet_vectors(pid_z)); - printf( - "S: %f %f %f\n", sdet_vectors(pid_x), - sdet_vectors(pid_y), sdet_vectors(pid_z)); - printf( - "O: %f %f %f\n", odet_vectors(pid_x), - odet_vectors(pid_y), odet_vectors(pid_z)); - printf( - "pid_x=%d, pid_y=%d; pid_z=%d\n", pid_x, pid_y, - pid_z); - - printf( - "QVECTOR: %f %f %f\n", q_vec[0], q_vec[1], - q_vec[2]); - KOKKOS_MAT3 UU = UMATS_RXYZ(_mos_tic); - printf( - "UMAT_RXYZ :\n%f %f %f\n%f %f %f\n%f %f %f\n", - UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), - UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); - UU = Bmat_realspace; - printf( - "Bmat_realspace :\n%f %f %f\n%f %f %f\n%f %f %f\n", - UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), - UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); - UU = UBO; - printf( - "UBO :\n%f %f %f\n%f %f %f\n%f %f %f\n", - UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), - UU(1, 2), UU(2, 0), 
UU(2, 1), UU(2, 2)); - - UU = UBOt; - printf( - "UBOt :\n%f %f %f\n%f %f %f\n%f %f %f\n", - UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), - UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); - - UU = UmosRxRyRzU; - printf( - "UmosRxRyRzU :\n%f %f %f\n%f %f %f\n%f %f %f\n", - UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), - UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); - KOKKOS_VEC3 AA = delta_H_prime; - printf( - "delta_H_prime :\n%f %f %f\n", AA[0], AA[1], - AA[2]); - printf("Iincrement: %f\n", Iincrement); - printf( - "hkl= %f %f %f hkl0= %d %d %d\n", _h, _k, _l, _h0, - _k0, _l0); - printf( - " F_cell=%g F_cell2=%g I_latt=%g I = %g\n", - _F_cell, _F_cell2, I0, _I); - printf("I/steps %15.10g\n", _I / Nsteps); - // printf("Ilatt diffuse %15.10g\n", I_latt_diffuse); - printf("omega %15.10g\n", _omega_pixel); - printf("default_F= %f\n", default_F); - printf( - "Incident[0]=%g, Incident[1]=%g, Incident[2]=%g\n", - _incident[0], _incident[1], _incident[2]); - if (complex_miller) - printf("COMPLEX MILLER!\n"); - if (no_Nabc_scale) - printf("No Nabc scale!\n"); - } + } // end of eta man deriv + + // checkpoint for lambda manager + for (int i_lam = 0; i_lam < 2; i_lam++) { + if (refine_flag & (REFINE_LAMBDA << i_lam)) { + CUDAREAL NH_dot_V = (_NABC.dot(H_vec)).dot(V); + CUDAREAL dg_dlambda; + if (i_lam == 0) + dg_dlambda = 1; + else // i_lam==1 + dg_dlambda = lambda_ang; + CUDAREAL coef = + NH_dot_V * two_C * (dg_dlambda) / lambda_ang; + CUDAREAL value = coef * Iincrement; + CUDAREAL value2 = 0; + dI.lambda[i_lam] += value; + dI2.lambda[i_lam] += value2; + } + } + // end of lambda deriv + if (printout) { + if (_subS == 0 && _subF == 0 && _thick_tic == 0 && + _source == 0 && _mos_tic == 0) { + if ((_fpixel == printout_fpixel && + _spixel == printout_spixel) || + printout_fpixel < 0) { + printf("%4d %4d : lambda = %g\n", _fpixel, _spixel, _lambda); + printf( + "at %g %g %g\n", _pixel_pos[0], _pixel_pos[1], + _pixel_pos[2]); + printf("Fdet= %g; Sdet= %g ; Odet= %g\n", _Fdet, _Sdet, _Odet); + printf( + "PIX0: %f %f %f\n", pix0_vectors(pid_x), + pix0_vectors(pid_y), pix0_vectors(pid_z)); + printf( + "F: %f %f %f\n", fdet_vectors(pid_x), + fdet_vectors(pid_y), fdet_vectors(pid_z)); + printf( + "S: %f %f %f\n", sdet_vectors(pid_x), + sdet_vectors(pid_y), sdet_vectors(pid_z)); + printf( + "O: %f %f %f\n", odet_vectors(pid_x), + odet_vectors(pid_y), odet_vectors(pid_z)); + printf("pid_x=%d, pid_y=%d; pid_z=%d\n", pid_x, pid_y, pid_z); + printf( + "QVECTOR: %f %f %f\n", q_vec[0], q_vec[1], q_vec[2]); + printf("omega %15.10g\n", _omega_pixel); + printf( + "Incident: %g %g %g\n", + _incident[0], _incident[1], _incident[2]); + + KOKKOS_MAT3 UU = UMATS_RXYZ(_mos_tic); + printf( + "UMAT_RXYZ :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + UU = Bmat_realspace; + printf( + "Bmat_realspace :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + UU = UBO; + printf( + "UBO :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + + UU = UBOt; + printf( + "UBOt :\n%f %f %f\n%f %f %f\n%f %f %f\n", + UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + + // UU = UmosRxRyRzU; + // printf( + // "UmosRxRyRzU :\n%f %f %f\n%f %f %f\n%f %f %f\n", + // UU(0, 0), UU(0, 1), UU(0, 2), UU(1, 0), UU(1, 1), + // UU(1, 2), UU(2, 0), UU(2, 1), UU(2, 2)); + // 
KOKKOS_VEC3 AA = delta_H_prime; + // printf( + // "delta_H_prime :\n%f %f %f\n", AA[0], AA[1], + // AA[2]); + printf("Iincrement: %f\n", Iincrement); + printf( + "hkl= %f %f %f hkl0= %d %d %d\n", _h, _k, _l, _h0, + _k0, _l0); + printf( + " F_cell=%g F_cell2=%g I_latt=%g I = %g\n", + _F_cell, _F_cell2, I0, _I); + printf("I/steps %15.10g\n", _I / Nsteps); + // printf("Ilatt diffuse %15.10g\n", I_latt_diffuse); + printf("default_F= %f\n", default_F); + if (complex_miller) + printf("COMPLEX MILLER!\n"); + if (no_Nabc_scale) + printf("No Nabc scale!\n"); } } + } // end of printout if + + } // end of mos_tic loop + } // end of source loop + } // end of thick step loop + } // end of fpos loop + } // end of spos loop + floatimage(pixIdx) = _I; + if (save_wavelenimage) + wavelenimage(pixIdx) = Ilambda / _I; + + if (refine_flag) { + manager_dI(pixIdx) = dI; + manager_dI2(pixIdx) = dI2; + } + }); // end pixIdx loop - } // end of mos_tic loop - } // end of source loop - } // end of thick step loop - } // end of fpos loop - } // end of spos loop - if (Fhkl_gradient_mode) - return; + if (Fhkl_gradient_mode) + return; - CUDAREAL _Fdet_ave = pixel_size * _fpixel + pixel_size / 2.0; - CUDAREAL _Sdet_ave = pixel_size * _spixel + pixel_size / 2.0; - CUDAREAL _Odet_ave = 0; // Odet; - // TODO maybe make this more general for thick detectors? - - KOKKOS_VEC3 _pixel_pos_ave(0, 0, 0); - int pid_x = _pid * 3; - int pid_y = _pid * 3 + 1; - int pid_z = _pid * 3 + 2; - - CUDAREAL fx = fdet_vectors(pid_x); - CUDAREAL fy = fdet_vectors(pid_y); - CUDAREAL fz = fdet_vectors(pid_z); - - CUDAREAL sx = sdet_vectors(pid_x); - CUDAREAL sy = sdet_vectors(pid_y); - CUDAREAL sz = sdet_vectors(pid_z); - - CUDAREAL ox = odet_vectors(pid_x); - CUDAREAL oy = odet_vectors(pid_y); - CUDAREAL oz = odet_vectors(pid_z); - - CUDAREAL px = pix0_vectors(pid_x); - CUDAREAL py = pix0_vectors(pid_y); - CUDAREAL pz = pix0_vectors(pid_z); - - _pixel_pos_ave[0] = _Fdet_ave * fx + _Sdet_ave * sx + _Odet_ave * ox + px; - _pixel_pos_ave[1] = _Fdet_ave * fy + _Sdet_ave * sy + _Odet_ave * oy + py; - _pixel_pos_ave[2] = _Fdet_ave * fz + _Sdet_ave * sz + _Odet_ave * oz + pz; - - CUDAREAL _airpath_ave = _pixel_pos_ave.length(); - KOKKOS_VEC3 _diffracted_ave = _pixel_pos_ave.get_unit_vector(); - CUDAREAL _omega_pixel_ave = pixel_size * pixel_size / _airpath_ave / _airpath_ave * - close_distance / _airpath_ave; - - CUDAREAL _polar = 1; - if (!nopolar) { - KOKKOS_VEC3 _incident(-source_X(0), -source_Y(0), -source_Z(0)); - _incident.normalize(); - // component of diffracted unit vector along _incident beam unit vector - CUDAREAL cos2theta = _incident.dot(_diffracted_ave); - CUDAREAL cos2theta_sqr = cos2theta * cos2theta; - CUDAREAL sin2theta_sqr = 1 - cos2theta_sqr; - - CUDAREAL _psi = 0; - if (kahn_factor != 0.0) { - // cross product to get "vertical" axis that is orthogonal to the cannonical - // "polarization" - KOKKOS_VEC3 B_in = polarization_axis.cross(_incident); - // cross product with _incident beam to get E-vector direction - KOKKOS_VEC3 E_in = _incident.cross(B_in); - // get components of diffracted ray projected onto the E-B plane - CUDAREAL _kEi = _diffracted_ave.dot(E_in); - CUDAREAL _kBi = _diffracted_ave.dot(B_in); - // compute the angle of the diffracted ray projected onto the incident E-B plane - _psi = -atan2(_kBi, _kEi); + Kokkos::parallel_for( + "deriv_image_increment", Npix_to_model, KOKKOS_LAMBDA(const int& pixIdx) { + + int _pid = panels_fasts_slows(pixIdx * 3); + int _fpixel = panels_fasts_slows(pixIdx * 3 + 1); + int _spixel = 
panels_fasts_slows(pixIdx * 3 + 2); + + CUDAREAL _Fdet_ave = pixel_size * _fpixel + pixel_size / 2.0; + CUDAREAL _Sdet_ave = pixel_size * _spixel + pixel_size / 2.0; + CUDAREAL _Odet_ave = 0; // Odet; + // TODO maybe make this more general for thick detectors? + + KOKKOS_VEC3 _pixel_pos_ave(0, 0, 0); + int pid_x = _pid * 3; + int pid_y = _pid * 3 + 1; + int pid_z = _pid * 3 + 2; + + CUDAREAL fx = fdet_vectors(pid_x); + CUDAREAL fy = fdet_vectors(pid_y); + CUDAREAL fz = fdet_vectors(pid_z); + + CUDAREAL sx = sdet_vectors(pid_x); + CUDAREAL sy = sdet_vectors(pid_y); + CUDAREAL sz = sdet_vectors(pid_z); + + CUDAREAL ox = odet_vectors(pid_x); + CUDAREAL oy = odet_vectors(pid_y); + CUDAREAL oz = odet_vectors(pid_z); + + CUDAREAL px = pix0_vectors(pid_x); + CUDAREAL py = pix0_vectors(pid_y); + CUDAREAL pz = pix0_vectors(pid_z); + + _pixel_pos_ave[0] = _Fdet_ave * fx + _Sdet_ave * sx + _Odet_ave * ox + px; + _pixel_pos_ave[1] = _Fdet_ave * fy + _Sdet_ave * sy + _Odet_ave * oy + py; + _pixel_pos_ave[2] = _Fdet_ave * fz + _Sdet_ave * sz + _Odet_ave * oz + pz; + + CUDAREAL close_distance = close_distances(_pid); + + CUDAREAL _airpath_ave_r = 1 / _pixel_pos_ave.length(); + KOKKOS_VEC3 _diffracted_ave = _pixel_pos_ave.get_unit_vector(); + CUDAREAL _omega_pixel_ave = pixel_size * pixel_size * _airpath_ave_r * _airpath_ave_r * + close_distance * _airpath_ave_r; + + CUDAREAL _polar = 1; + if (!nopolar) { + KOKKOS_VEC3 _incident(-source_X(0), -source_Y(0), -source_Z(0)); + _incident.normalize(); + // component of diffracted unit vector along _incident beam unit vector + CUDAREAL cos2theta = _incident.dot(_diffracted_ave); + CUDAREAL cos2theta_sqr = cos2theta * cos2theta; + CUDAREAL sin2theta_sqr = 1 - cos2theta_sqr; + + CUDAREAL cos2psi = 0; + if (kahn_factor != 0.0) { + // cross product to get "vertical" axis that is orthogonal to the cannonical + // "polarization" + KOKKOS_VEC3 B_in = polarization_axis.cross(_incident); + // cross product with _incident beam to get E-vector direction + KOKKOS_VEC3 E_in = _incident.cross(B_in); + // get components of diffracted ray projected onto the E-B plane + CUDAREAL _kEi = _diffracted_ave.dot(E_in); + CUDAREAL _kBi = _diffracted_ave.dot(B_in); + // compute the angle of the diffracted ray projected onto the incident E-B plane + // calculate cos(2 * atan2(_kBi, _kEi)) + if (_kEi!=0) { + CUDAREAL ratio = _kBi / _kEi; + cos2psi = (1 - ratio*ratio) / (1 + ratio*ratio); + } else { + cos2psi = -1; } - // correction for polarized _incident beam - _polar = - 0.5 * (1.0 + cos2theta_sqr - - kahn_factor * ::Kokkos::Experimental::cos(2 * _psi) * sin2theta_sqr); } + // correction for polarized _incident beam + _polar = 0.5 * (1.0 + cos2theta_sqr - kahn_factor * cos2psi * sin2theta_sqr); + } - CUDAREAL _om = 1; - if (!oversample_omega) - _om = _omega_pixel_ave; - // final scale term to being everything to photon number units - CUDAREAL _scale_term = _polar * _om * overall_scale; - floatimage(pixIdx) = _scale_term * _I; - if (save_wavelenimage) - wavelenimage(pixIdx) = Ilambda / _I; - - // udpate the rotation derivative images* - for (int i_rot = 0; i_rot < 3; i_rot++) { - if (refine_Umat(i_rot)) { - CUDAREAL value = _scale_term * rot_manager_dI[i_rot]; - CUDAREAL value2 = _scale_term * rot_manager_dI2[i_rot]; - int idx = i_rot * Npix_to_model + pixIdx; - d_Umat_images(idx) = value; - d2_Umat_images(idx) = value2; - } - } // end rot deriv image increment - - // update the ucell derivative images - for (int i_uc = 0; i_uc < 6; i_uc++) { - if (refine_Bmat(i_uc)) { - CUDAREAL value = 
_scale_term * ucell_manager_dI[i_uc]; - CUDAREAL value2 = _scale_term * ucell_manager_dI2[i_uc]; - int idx = i_uc * Npix_to_model + pixIdx; - d_Bmat_images(idx) = value; - d2_Bmat_images(idx) = value2; - } - } // end ucell deriv image increment - - // update the Ncells derivative image - if (refine_Ncells(0)) { - CUDAREAL value = _scale_term * Ncells_manager_dI[0]; - CUDAREAL value2 = _scale_term * Ncells_manager_dI2[0]; - int idx = pixIdx; + CUDAREAL _om = 1; + if (!oversample_omega) + _om = _omega_pixel_ave; + // final scale term to being everything to photon number units + CUDAREAL _scale_term = _polar * _om * overall_scale; + floatimage(pixIdx) *= _scale_term; + + auto& dI = manager_dI(pixIdx); + auto& dI2 = manager_dI2(pixIdx); + + // udpate the rotation derivative images* + for (int i_rot = 0; i_rot < 3; i_rot++) { + if (refine_flag & (REFINE_UMAT1 << i_rot)) { + CUDAREAL value = _scale_term * dI.rot[i_rot]; + CUDAREAL value2 = _scale_term * dI2.rot[i_rot]; + int idx = i_rot * Npix_to_model + pixIdx; + d_Umat_images(idx) = value; + d2_Umat_images(idx) = value2; + } + } // end rot deriv image increment + + // update the ucell derivative images + for (int i_uc = 0; i_uc < 6; i_uc++) { + if (refine_flag & (REFINE_BMAT1 << i_uc)) { + CUDAREAL value = _scale_term * dI.ucell[i_uc]; + CUDAREAL value2 = _scale_term * dI2.ucell[i_uc]; + int idx = i_uc * Npix_to_model + pixIdx; + d_Bmat_images(idx) = value; + d2_Bmat_images(idx) = value2; + } + } // end ucell deriv image increment + + // update the Ncells derivative image + if (refine_flag & REFINE_NCELLS1) { + CUDAREAL value = _scale_term * dI.Ncells[0]; + CUDAREAL value2 = _scale_term * dI2.Ncells[0]; + int idx = pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + + if (!isotropic_ncells) { + value = _scale_term * dI.Ncells[1]; + value2 = _scale_term * dI2.Ncells[1]; + idx = Npix_to_model + pixIdx; d_Ncells_images(idx) = value; d2_Ncells_images(idx) = value2; - if (!isotropic_ncells) { - value = _scale_term * Ncells_manager_dI[1]; - value2 = _scale_term * Ncells_manager_dI2[1]; - idx = Npix_to_model + pixIdx; - d_Ncells_images(idx) = value; - d2_Ncells_images(idx) = value2; - - value = _scale_term * Ncells_manager_dI[2]; - value2 = _scale_term * Ncells_manager_dI2[2]; - idx = Npix_to_model * 2 + pixIdx; - d_Ncells_images(idx) = value; - d2_Ncells_images(idx) = value2; - } - } // end Ncells deriv image increment - if (refine_Ncells_def) { - for (int i_nc = 3; i_nc < 6; i_nc++) { - CUDAREAL value = _scale_term * Ncells_manager_dI[i_nc]; - CUDAREAL value2 = _scale_term * Ncells_manager_dI2[i_nc]; - int idx = i_nc * Npix_to_model + pixIdx; - d_Ncells_images(idx) = value; - d2_Ncells_images(idx) = value2; - } + value = _scale_term * dI.Ncells[2]; + value2 = _scale_term * dI2.Ncells[2]; + idx = Npix_to_model * 2 + pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; + } + } // end Ncells deriv image increment + if (refine_flag & REFINE_NCELLS_DEF) { + for (int i_nc = 3; i_nc < 6; i_nc++) { + CUDAREAL value = _scale_term * dI.Ncells[i_nc]; + CUDAREAL value2 = _scale_term * dI2.Ncells[i_nc]; + int idx = i_nc * Npix_to_model + pixIdx; + d_Ncells_images(idx) = value; + d2_Ncells_images(idx) = value2; } + } - // update Fcell derivative image - if (refine_fcell) { - CUDAREAL value = _scale_term * fcell_manager_dI; - CUDAREAL value2 = _scale_term * fcell_manager_dI2; - d_fcell_images(pixIdx) = value; - d2_fcell_images(pixIdx) = value2; - } // end Fcell deriv image increment - - if (refine_fp_fdp) { - // c 
derivative - CUDAREAL value = _scale_term * fp_fdp_manager_dI[0]; - d_fp_fdp_images(pixIdx) = value; - // d derivative - value = _scale_term * fp_fdp_manager_dI[1]; - d_fp_fdp_images(Npix_to_model + pixIdx) = value; + // update Fcell derivative image + if (refine_flag & REFINE_FCELL) { + CUDAREAL value = _scale_term * dI.fcell; + CUDAREAL value2 = _scale_term * dI2.fcell; + d_fcell_images(pixIdx) = value; + d2_fcell_images(pixIdx) = value2; + } // end Fcell deriv image increment + + if (refine_flag & REFINE_FP_FDP) { + // c derivative + CUDAREAL value = _scale_term * dI.fp_fdp[0]; + d_fp_fdp_images(pixIdx) = value; + // d derivative + value = _scale_term * dI.fp_fdp[1]; + d_fp_fdp_images(Npix_to_model + pixIdx) = value; + } + if (refine_flag & REFINE_DIFFUSE) { + for (int i_gam = 0; i_gam < 3; i_gam++) { + CUDAREAL val = dI.diffuse[i_gam] * _scale_term; + int img_idx = Npix_to_model * i_gam + pixIdx; + d_diffuse_gamma_images(img_idx) = val; } - if (refine_diffuse) { - for (int i_gam = 0; i_gam < 3; i_gam++) { - CUDAREAL val = dI_diffuse[i_gam] * _scale_term; - int img_idx = Npix_to_model * i_gam + pixIdx; - d_diffuse_gamma_images(img_idx) = val; - } - for (int i_sig = 0; i_sig < 3; i_sig++) { - CUDAREAL val = dI_diffuse[i_sig + 3] * _scale_term; - int img_idx = Npix_to_model * i_sig + pixIdx; - d_diffuse_sigma_images(img_idx) = val; - } + for (int i_sig = 0; i_sig < 3; i_sig++) { + CUDAREAL val = dI.diffuse[i_sig + 3] * _scale_term; + int img_idx = Npix_to_model * i_sig + pixIdx; + d_diffuse_sigma_images(img_idx) = val; } + } - // update eta derivative image - if (refine_eta) { - for (int i_eta = 0; i_eta < 3; i_eta++) { - if (i_eta > 0 && !aniso_eta) - continue; - int idx = pixIdx + Npix_to_model * i_eta; - CUDAREAL value = _scale_term * eta_manager_dI[i_eta]; - CUDAREAL value2 = _scale_term * eta_manager_dI2[i_eta]; - d_eta_images(idx) = value; - d2_eta_images(idx) = value2; - } - } // end eta deriv image increment - - // update the lambda derivative images - for (int i_lam = 0; i_lam < 2; i_lam++) { - if (refine_lambda(i_lam)) { - CUDAREAL value = _scale_term * lambda_manager_dI[i_lam]; - CUDAREAL value2 = _scale_term * lambda_manager_dI2[i_lam]; - int idx = i_lam * Npix_to_model + pixIdx; - d_lambda_images(idx) = value; - // d2_lambda_images(idx) = value2; - } - } // end lambda deriv image increment - - for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { - if (refine_panel_rot(i_pan_rot)) { - CUDAREAL value = _scale_term * pan_rot_manager_dI[i_pan_rot]; - CUDAREAL value2 = _scale_term * pan_rot_manager_dI2[i_pan_rot]; - int idx = i_pan_rot * Npix_to_model + pixIdx; - d_panel_rot_images(idx) = value; - // d2_panel_rot_images(idx) = value2; - } - } // end panel rot deriv image increment - - for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { - if (refine_panel_origin(i_pan_orig)) { - CUDAREAL value = _scale_term * pan_orig_manager_dI[i_pan_orig]; - CUDAREAL value2 = _scale_term * pan_orig_manager_dI2[i_pan_orig]; - int idx = i_pan_orig * Npix_to_model + pixIdx; - d_panel_orig_images(idx) = value; - // d2_panel_orig_images(idx) = value2; - } - } // end panel orig deriv image increment - }); // end pixIdx loop + // update eta derivative image + if (refine_flag & REFINE_ETA) { + for (int i_eta = 0; i_eta < 3; i_eta++) { + if (i_eta > 0 && !aniso_eta) + continue; + int idx = pixIdx + Npix_to_model * i_eta; + CUDAREAL value = _scale_term * dI.eta[i_eta]; + CUDAREAL value2 = _scale_term * dI2.eta[i_eta]; + d_eta_images(idx) = value; + d2_eta_images(idx) = value2; + } + } // end eta 
deriv image increment + + // update the lambda derivative images + for (int i_lam = 0; i_lam < 2; i_lam++) { + if (refine_flag & (REFINE_LAMBDA1 << i_lam)) { + CUDAREAL value = _scale_term * dI.lambda[i_lam]; + CUDAREAL value2 = _scale_term * dI2.lambda[i_lam]; + int idx = i_lam * Npix_to_model + pixIdx; + d_lambda_images(idx) = value; + // d2_lambda_images(idx) = value2; + } + } // end lambda deriv image increment + + for (int i_pan_rot = 0; i_pan_rot < 3; i_pan_rot++) { + if (refine_flag & (REFINE_PANEL_ROT1 << i_pan_rot)) { + CUDAREAL value = _scale_term * dI.pan_rot[i_pan_rot]; + CUDAREAL value2 = _scale_term * dI2.pan_rot[i_pan_rot]; + int idx = i_pan_rot * Npix_to_model + pixIdx; + d_panel_rot_images(idx) = value; + // d2_panel_rot_images(idx) = value2; + } + } // end panel rot deriv image increment + + for (int i_pan_orig = 0; i_pan_orig < 3; i_pan_orig++) { + if (refine_flag & (REFINE_PANEL_ORIGIN1 << i_pan_orig)) { + CUDAREAL value = _scale_term * dI.pan_orig[i_pan_orig]; + CUDAREAL value2 = _scale_term * dI2.pan_orig[i_pan_orig]; + int idx = i_pan_orig * Npix_to_model + pixIdx; + d_panel_orig_images(idx) = value; + // d2_panel_orig_images(idx) = value2; + } + } // end panel orig deriv image increment + }); // end pixIdx loop } // END of GPU kernel + +template void +kokkos_sum_over_steps< + false, // printout, + false, // complex_miller, + false, // compute_curvatures, + REFINE_FCELL, // refine_flag, + false, // use_diffuse, + false, // save_wavelenimage + false, // Fhkl_gradient_mode, + false, // Fhkl_errors_mode, + false, // using_trusted_mask, + true, // Fhkl_channels_empty, + false> // Fhkl_have_scale_factors + ( + int Npix_to_model, + vector_uint_t panels_fasts_slows, + vector_cudareal_t floatimage, + vector_cudareal_t wavelenimage, + vector_cudareal_t d_Umat_images, + vector_cudareal_t d2_Umat_images, + vector_cudareal_t d_Bmat_images, + vector_cudareal_t d2_Bmat_images, + vector_cudareal_t d_Ncells_images, + vector_cudareal_t d2_Ncells_images, + vector_cudareal_t d_fcell_images, + vector_cudareal_t d2_fcell_images, + vector_cudareal_t d_eta_images, + vector_cudareal_t d2_eta_images, + vector_cudareal_t d_lambda_images, + vector_cudareal_t d2_lambda_images, + vector_cudareal_t d_panel_rot_images, + vector_cudareal_t d2_panel_rot_images, + vector_cudareal_t d_panel_orig_images, + vector_cudareal_t d2_panel_orig_images, + vector_cudareal_t d_fp_fdp_images, + vector_manager_t manager_dI, + vector_manager_t manager_dI2, + const int Nsteps, + int printout_fpixel, + int printout_spixel, + /*bool printout,*/ + CUDAREAL default_F, + int oversample, + bool oversample_omega, + CUDAREAL subpixel_size, + CUDAREAL pixel_size, + CUDAREAL detector_thickstep, + CUDAREAL detector_thick, + const vector_cudareal_t close_distances, + CUDAREAL detector_attnlen, + int detector_thicksteps, + int sources, + int phisteps, + int mosaic_domains, + bool use_lambda_coefficients, + CUDAREAL lambda0, + CUDAREAL lambda1, + KOKKOS_MAT3 eig_U, + KOKKOS_MAT3 eig_O, + KOKKOS_MAT3 eig_B, + KOKKOS_MAT3 RXYZ, + vector_vec3_t dF_vecs, + vector_vec3_t dS_vecs, + const vector_mat3_t UMATS_RXYZ, + vector_mat3_t UMATS_RXYZ_prime, + vector_mat3_t UMATS_RXYZ_dbl_prime, + vector_mat3_t RotMats, + vector_mat3_t dRotMats, + vector_mat3_t d2RotMats, + vector_mat3_t UMATS, + vector_mat3_t dB_mats, + vector_mat3_t dB2_mats, + vector_mat3_t Amatrices, + const vector_cudareal_t source_X, + const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_lambda, + const vector_cudareal_t 
source_I, + CUDAREAL kahn_factor, + CUDAREAL Na, + CUDAREAL Nb, + CUDAREAL Nc, + CUDAREAL Nd, + CUDAREAL Ne, + CUDAREAL Nf, + CUDAREAL phi0, + CUDAREAL phistep, + KOKKOS_VEC3 spindle_vec, + KOKKOS_VEC3 polarization_axis, + int h_range, + int k_range, + int l_range, + int h_max, + int h_min, + int k_max, + int k_min, + int l_max, + int l_min, + CUDAREAL dmin, + CUDAREAL fudge, + /*bool complex_miller,*/ + int verbose, + bool only_save_omega_kahn, + bool isotropic_ncells, + /*bool compute_curvatures,*/ + const vector_cudareal_t FhklLinear, + const vector_cudareal_t Fhkl2Linear, + /*const uint32_t refine_flag,*/ + // vector_bool_t refine_Bmat, + // vector_bool_t refine_Ncells, + // bool refine_Ncells_def, + // vector_bool_t refine_panel_origin, + // vector_bool_t refine_panel_rot, + // bool refine_fcell, + // vector_bool_t refine_lambda, + // bool refine_eta, + // vector_bool_t refine_Umat, + const vector_cudareal_t fdet_vectors, + const vector_cudareal_t sdet_vectors, + const vector_cudareal_t odet_vectors, + const vector_cudareal_t pix0_vectors, + bool nopolar, + bool point_pixel, + CUDAREAL fluence, + CUDAREAL r_e_sqr, + CUDAREAL spot_scale, + int Npanels, + bool aniso_eta, + bool no_Nabc_scale, + const vector_cudareal_t fpfdp, + const vector_cudareal_t fpfdp_derivs, + const vector_cudareal_t atom_data, + int num_atoms, + // bool refine_fp_fdp, + const vector_int_t nominal_hkl, + bool use_nominal_hkl, + KOKKOS_MAT3 anisoU, + KOKKOS_MAT3 anisoG, + KOKKOS_MAT3 rotate_principal_axes, + /*bool use_diffuse,*/ + vector_cudareal_t d_diffuse_gamma_images, + vector_cudareal_t d_diffuse_sigma_images, + // bool refine_diffuse, + bool gamma_miller_units, + // bool refine_Icell, + /*bool save_wavelenimage,*/ + int laue_group_num, + int stencil_size, + /*bool Fhkl_gradient_mode,*/ + /*bool Fhkl_errors_mode,*/ + /*bool using_trusted_mask,*/ + /*bool Fhkl_channels_empty,*/ + /*bool Fhkl_have_scale_factors,*/ + int Num_ASU, + const vector_cudareal_t data_residual, + const vector_cudareal_t data_variance, + const vector_int_t data_freq, + const vector_bool_t data_trusted, + const vector_int_t FhklLinear_ASUid, + const vector_int_t Fhkl_channels, + const vector_cudareal_t Fhkl_scale, + vector_cudareal_t Fhkl_scale_deriv); diff --git a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h index 3c4a117693..3586183087 100644 --- a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h +++ b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h @@ -26,6 +26,8 @@ void kokkos_sum_over_steps( vector_cudareal_t d_panel_orig_images, vector_cudareal_t d2_panel_orig_images, vector_cudareal_t d_fp_fdp_images, + vector_manager_t manager_dI, + vector_manager_t manager_dI2, const int Nsteps, int printout_fpixel, int printout_spixel, @@ -96,15 +98,16 @@ void kokkos_sum_over_steps( bool compute_curvatures, const vector_cudareal_t FhklLinear, const vector_cudareal_t Fhkl2Linear, - vector_bool_t refine_Bmat, - vector_bool_t refine_Ncells, - bool refine_Ncells_def, - vector_bool_t refine_panel_origin, - vector_bool_t refine_panel_rot, - bool refine_fcell, - vector_bool_t refine_lambda, - bool refine_eta, - vector_bool_t refine_Umat, + const uint32_t refine_flag, + // vector_bool_t refine_Bmat, + // vector_bool_t refine_Ncells, + // bool refine_Ncells_def, + // vector_bool_t refine_panel_origin, + // vector_bool_t refine_panel_rot, + // bool refine_fcell, + // vector_bool_t refine_lambda, + // bool refine_eta, + // vector_bool_t refine_Umat, const vector_cudareal_t fdet_vectors, const 
vector_cudareal_t sdet_vectors, const vector_cudareal_t odet_vectors, @@ -121,7 +124,7 @@ void kokkos_sum_over_steps( const vector_cudareal_t fpfdp_derivs, const vector_cudareal_t atom_data, int num_atoms, - bool refine_fp_fdp, + // bool refine_fp_fdp, const vector_int_t nominal_hkl, bool use_nominal_hkl, KOKKOS_MAT3 anisoU, @@ -130,9 +133,9 @@ void kokkos_sum_over_steps( bool use_diffuse, vector_cudareal_t d_diffuse_gamma_images, vector_cudareal_t d_diffuse_sigma_images, - bool refine_diffuse, + // bool refine_diffuse, bool gamma_miller_units, - bool refine_Icell, + // bool refine_Icell, bool save_wavelenimage, int laue_group_num, int stencil_size, bool Fhkl_gradient_mode, @@ -146,10 +149,170 @@ void kokkos_sum_over_steps( const vector_int_t data_freq, const vector_bool_t data_trusted, const vector_int_t FhklLinear_ASUid, - const vector_cudareal_t Fhkl_channels, + const vector_int_t Fhkl_channels, const vector_cudareal_t Fhkl_scale, vector_cudareal_t Fhkl_scale_deriv - ); - +); +template < + bool printout, + bool complex_miller, + bool compute_curvatures, + uint32_t refine_fcell, + bool use_diffuse, + bool save_wavelenimage, + bool Fhkl_gradient_mode, + bool Fhkl_errors_mode, + bool using_trusted_mask, + bool Fhkl_channels_empty, + bool Fhkl_have_scale_factors> +void kokkos_sum_over_steps( + int Npix_to_model, + vector_uint_t panels_fasts_slows, + vector_cudareal_t floatimage, + vector_cudareal_t wavelenimage, + vector_cudareal_t d_Umat_images, + vector_cudareal_t d2_Umat_images, + vector_cudareal_t d_Bmat_images, + vector_cudareal_t d2_Bmat_images, + vector_cudareal_t d_Ncells_images, + vector_cudareal_t d2_Ncells_images, + vector_cudareal_t d_fcell_images, + vector_cudareal_t d2_fcell_images, + vector_cudareal_t d_eta_images, + vector_cudareal_t d2_eta_images, + vector_cudareal_t d_lambda_images, + vector_cudareal_t d2_lambda_images, + vector_cudareal_t d_panel_rot_images, + vector_cudareal_t d2_panel_rot_images, + vector_cudareal_t d_panel_orig_images, + vector_cudareal_t d2_panel_orig_images, + vector_cudareal_t d_fp_fdp_images, + vector_manager_t manager_dI, + vector_manager_t manager_dI2, + const int Nsteps, + int printout_fpixel, + int printout_spixel, + /*bool printout,*/ + CUDAREAL default_F, + int oversample, + bool oversample_omega, + CUDAREAL subpixel_size, + CUDAREAL pixel_size, + CUDAREAL detector_thickstep, + CUDAREAL detector_thick, + const vector_cudareal_t close_distances, + CUDAREAL detector_attnlen, + int detector_thicksteps, + int sources, + int phisteps, + int mosaic_domains, + bool use_lambda_coefficients, + CUDAREAL lambda0, + CUDAREAL lambda1, + KOKKOS_MAT3 eig_U, + KOKKOS_MAT3 eig_O, + KOKKOS_MAT3 eig_B, + KOKKOS_MAT3 RXYZ, + vector_vec3_t dF_vecs, + vector_vec3_t dS_vecs, + const vector_mat3_t UMATS_RXYZ, + vector_mat3_t UMATS_RXYZ_prime, + vector_mat3_t UMATS_RXYZ_dbl_prime, + vector_mat3_t RotMats, + vector_mat3_t dRotMats, + vector_mat3_t d2RotMats, + vector_mat3_t UMATS, + vector_mat3_t dB_mats, + vector_mat3_t dB2_mats, + vector_mat3_t Amatrices, + const vector_cudareal_t source_X, + const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_lambda, + const vector_cudareal_t source_I, + CUDAREAL kahn_factor, + CUDAREAL Na, + CUDAREAL Nb, + CUDAREAL Nc, + CUDAREAL Nd, + CUDAREAL Ne, + CUDAREAL Nf, + CUDAREAL phi0, + CUDAREAL phistep, + KOKKOS_VEC3 spindle_vec, + KOKKOS_VEC3 polarization_axis, + int h_range, + int k_range, + int l_range, + int h_max, + int h_min, + int k_max, + int k_min, + int l_max, + int l_min, + 
CUDAREAL dmin, + CUDAREAL fudge, + /*bool complex_miller,*/ + int verbose, + bool only_save_omega_kahn, + bool isotropic_ncells, + /*bool compute_curvatures,*/ + const vector_cudareal_t FhklLinear, + const vector_cudareal_t Fhkl2Linear, + /*const uint32_t refine_flag,*/ + // vector_bool_t refine_Bmat, + // vector_bool_t refine_Ncells, + // bool refine_Ncells_def, + // vector_bool_t refine_panel_origin, + // vector_bool_t refine_panel_rot, + // bool refine_fcell, + // vector_bool_t refine_lambda, + // bool refine_eta, + // vector_bool_t refine_Umat, + const vector_cudareal_t fdet_vectors, + const vector_cudareal_t sdet_vectors, + const vector_cudareal_t odet_vectors, + const vector_cudareal_t pix0_vectors, + bool nopolar, + bool point_pixel, + CUDAREAL fluence, + CUDAREAL r_e_sqr, + CUDAREAL spot_scale, + int Npanels, + bool aniso_eta, + bool no_Nabc_scale, + const vector_cudareal_t fpfdp, + const vector_cudareal_t fpfdp_derivs, + const vector_cudareal_t atom_data, + int num_atoms, + // bool refine_fp_fdp, + const vector_int_t nominal_hkl, + bool use_nominal_hkl, + KOKKOS_MAT3 anisoU, + KOKKOS_MAT3 anisoG, + KOKKOS_MAT3 rotate_principal_axes, + /*bool use_diffuse,*/ + vector_cudareal_t d_diffuse_gamma_images, + vector_cudareal_t d_diffuse_sigma_images, + // bool refine_diffuse, + bool gamma_miller_units, + // bool refine_Icell, + /*bool save_wavelenimage,*/ + int laue_group_num, int stencil_size, + /*bool Fhkl_gradient_mode,*/ + /*bool Fhkl_errors_mode,*/ + /*bool using_trusted_mask,*/ + /*bool Fhkl_channels_empty,*/ + /*bool Fhkl_have_scale_factors,*/ + int Num_ASU, + const vector_cudareal_t data_residual, + const vector_cudareal_t data_variance, + const vector_int_t data_freq, + const vector_bool_t data_trusted, + const vector_int_t FhklLinear_ASUid, + const vector_int_t Fhkl_channels, + const vector_cudareal_t Fhkl_scale, + vector_cudareal_t Fhkl_scale_deriv +); #endif diff --git a/simtbx/diffBragg/src/diffBragg_refine_flag.h b/simtbx/diffBragg/src/diffBragg_refine_flag.h new file mode 100644 index 0000000000..64b250ebf5 --- /dev/null +++ b/simtbx/diffBragg/src/diffBragg_refine_flag.h @@ -0,0 +1,67 @@ +#ifndef SIMTBX_DIFFBRAGG_REFINE_FLAG +#define SIMTBX_DIFFBRAGG_REFINE_FLAG +/* +enum refine_id { + ROTX_ID = (1u << 0), + ROTY_ID = (1u << 1), + ROTZ_ID = (1u << 2), + UCELL_A_ID = (1u << 3), + UCELL_B_ID = (1u << 4), + UCELL_C_ID = (1u << 5), + UCELL_ALPHA_ID = (1u << 6), + UCELL_BETA_ID = (1u << 7), + UCELL_GAMMA_ID = (1u << 8), + NCELLS_ID = (1u << 9), + PANELZ_ID = (1u << 10), + FCELL_ID = (1u << 11), + LAMBDA_OFFSET_ID = (1u << 12), + LAMBDA_SCALE_ID = (1u << 13), + PANEL_ROTO_ID = (1u << 14), + PANELX_ID = (1u << 15), + PANELY_ID = (1u << 16), + PANEL_ROTF_ID = (1u << 17), + PANEL_ROTS_ID = (1u << 18), + ETA_ID = (1u << 19), + NCELLS_OFFDIAG_ID = (1u << 21), + F_PRIME_F_DPRIME_ID = (1u << 22), + DIFFUSE_ID = (1u << 23), +}; +*/ + +enum refine_flag : uint32_t { + REFINE_DIFFUSE = (1u << 0), + REFINE_FP_FDP = (1u << 1), + REFINE_UMAT1 = (1u << 2), + REFINE_UMAT2 = (1u << 3), + REFINE_UMAT3 = (1u << 4), + REFINE_BMAT1 = (1u << 5), + REFINE_BMAT2 = (1u << 6), + REFINE_BMAT3 = (1u << 7), + REFINE_BMAT4 = (1u << 8), + REFINE_BMAT5 = (1u << 9), + REFINE_BMAT6 = (1u << 10), + REFINE_NCELLS1 = (1u << 11), + REFINE_NCELLS2 = (1u << 12), + REFINE_NCELLS3 = (1u << 13), + REFINE_NCELLS_DEF = (1u << 14), + REFINE_PANEL_ORIGIN1 = (1u << 15), + REFINE_PANEL_ORIGIN2 = (1u << 16), + REFINE_PANEL_ORIGIN3 = (1u << 17), + REFINE_PANEL_ROT1 = (1u << 18), + REFINE_PANEL_ROT2 = (1u << 19), + 
REFINE_PANEL_ROT3 = (1u << 20), + REFINE_FCELL = (1u << 21), + REFINE_ETA = (1u << 22), + REFINE_LAMBDA1 = (1u << 23), + REFINE_LAMBDA2 = (1u << 24), + REFINE_ICELL = (1u << 25) +}; + +constexpr unsigned int REFINE_BMAT = REFINE_BMAT1 | REFINE_BMAT2 | REFINE_BMAT3 | REFINE_BMAT4 | REFINE_BMAT5 | REFINE_BMAT6; +constexpr unsigned int REFINE_UMAT = REFINE_UMAT1 | REFINE_UMAT2 | REFINE_UMAT3; +constexpr unsigned int REFINE_NCELLS = REFINE_NCELLS1 | REFINE_NCELLS2 | REFINE_NCELLS3; +constexpr unsigned int REFINE_PANEL_ORIGIN = REFINE_PANEL_ORIGIN1 | REFINE_PANEL_ORIGIN2 | REFINE_PANEL_ORIGIN3; +constexpr unsigned int REFINE_PANEL_ROT = REFINE_PANEL_ROT1 | REFINE_PANEL_ROT2 | REFINE_PANEL_ROT3; +constexpr unsigned int REFINE_LAMBDA = REFINE_LAMBDA1 | REFINE_LAMBDA2; + +#endif diff --git a/simtbx/diffBragg/src/diffuse_util.h b/simtbx/diffBragg/src/diffuse_util.h index 881a4902a9..d556f885d9 100644 --- a/simtbx/diffBragg/src/diffuse_util.h +++ b/simtbx/diffBragg/src/diffuse_util.h @@ -3,21 +3,22 @@ #include -#define CUDA_COMPILE (defined(DIFFBRAGG_HAVE_CUDA) && defined(__CUDACC__)) +#if defined(DIFFBRAGG_HAVE_CUDA) && defined(__CUDACC__) +#define CUDA_COMPILE +#endif #define REAL double -#if CUDA_COMPILE +#ifdef CUDA_COMPILE __device__ __host__ #endif -#if CUDA_COMPILE || not defined(DIFFBRAGG_HAVE_CUDA) + +#if defined(CUDA_COMPILE) || not defined(DIFFBRAGG_HAVE_CUDA) int gen_laue_mats(int laue_group_num, MAT3 *lmats, MAT3 rpa) { - if (laue_group_num < 1 || laue_group_num >14 ){ - printf("Laue group number not in range 1-14; exiting\n"); - exit(1); - } + assert(laue_group_num>0); + assert(laue_group_num<15); - int num_mats; + int num_mats = 0; const double one_over_root2 = 1./sqrt(2.); @@ -561,10 +562,10 @@ int gen_laue_mats(int laue_group_num, MAT3 *lmats, MAT3 rpa) { int gen_laue_mats(int laue_group_num, MAT3 *lmats, MAT3 rpa); #endif -#if CUDA_COMPILE +#ifdef CUDA_COMPILE __device__ __host__ #endif -#if CUDA_COMPILE || not defined(DIFFBRAGG_HAVE_CUDA) +#if defined(CUDA_COMPILE) || not defined(DIFFBRAGG_HAVE_CUDA) void calc_diffuse_at_hkl(VEC3 H_vec, VEC3 H0, VEC3 dHH, VEC3 Hmin, VEC3 Hmax, VEC3 Hrange, MAT3 Ainv, const REAL *FhklLinear, int num_laue_mats, MAT3 *laue_mats, MAT3 anisoG_local, MAT3 anisoU_local, MAT3 *dG_dgam, bool refine_diffuse, REAL *I0, REAL *step_diffuse_param){ REAL four_mpi_sq = 4.*M_PI*M_PI; // loop over laue matrices diff --git a/simtbx/diffBragg/src/diffuse_util_kokkos.h b/simtbx/diffBragg/src/diffuse_util_kokkos.h index 33edd23b94..397db20f73 100644 --- a/simtbx/diffBragg/src/diffuse_util_kokkos.h +++ b/simtbx/diffBragg/src/diffuse_util_kokkos.h @@ -3,14 +3,12 @@ #include -KOKKOS_FUNCTION -int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { +KOKKOS_INLINE_FUNCTION +int gen_laue_mats(int laue_group_num, vector_mat3_t lmats, KOKKOS_MAT3 rpa) { - if (laue_group_num < 1 || laue_group_num >14 ){ - printf("Laue group number not in range 1-14; exiting\n"); - exit(1); - } - int num_mats; + assert(laue_group_num>0); + assert(laue_group_num<15); + int num_mats = 0; const double one_over_root2 = 1./sqrt(2.); @@ -18,7 +16,7 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P -1 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; @@ -28,12 +26,12 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 1 1 2/m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -x,-y,z - lmats[1] << -1, 0, 0, + lmats(1) << -1, 0, 0, 0,-1, 0, 0, 0, 1; @@ -43,12 +41,12 @@ 
int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 1 2/m 1 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -x,y,-z - lmats[1] << -1, 0, 0, + lmats(1) << -1, 0, 0, 0, 1, 0, 0, 0,-1; @@ -58,12 +56,12 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 2/m 1 1 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // x,-y,-z - lmats[1] << 1, 0, 0, + lmats(1) << 1, 0, 0, 0,-1, 0, 0, 0,-1; @@ -73,22 +71,22 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P m m m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // x,-y,-z - lmats[1] << 1, 0, 0, + lmats(1) << 1, 0, 0, 0,-1, 0, 0, 0,-1; // -x,y,-z - lmats[2] << -1, 0, 0, + lmats(2) << -1, 0, 0, 0, 1, 0, 0, 0,-1; // -x,-y,z - lmats[3] << -1, 0, 0, + lmats(3) << -1, 0, 0, 0,-1, 0, 0, 0, 1; @@ -98,22 +96,22 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 4/m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -y,x,z - lmats[1] << 0,-1, 0, + lmats(1) << 0,-1, 0, 1, 0, 0, 0, 0, 1; // y,-x,z - lmats[2] << 0, 1, 0, + lmats(2) << 0, 1, 0, -1, 0, 0, 0, 0, 1; // -x,-y,z - lmats[3] << -1, 0, 0, + lmats(3) << -1, 0, 0, 0,-1, 0, 0, 0, 1; @@ -123,42 +121,42 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 4/m m m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -y,x,z - lmats[1] << 0,-1, 0, + lmats(1) << 0,-1, 0, 1, 0, 0, 0, 0, 1; // y,-x,z - lmats[2] << 0, 1, 0, + lmats(2) << 0, 1, 0, -1, 0, 0, 0, 0, 1; // x,-y,-z - lmats[3] << 1, 0, 0, + lmats(3) << 1, 0, 0, 0,-1, 0, 0, 0,-1; // -x,y,-z - lmats[4] << -1, 0, 0, + lmats(4) << -1, 0, 0, 0, 1, 0, 0, 0,-1; // -x,-y,z - lmats[5] << -1, 0, 0, + lmats(5) << -1, 0, 0, 0,-1, 0, 0, 0, 1; // y,x,-z - lmats[6] << 0, 1, 0, + lmats(6) << 0, 1, 0, 1, 0, 0, 0, 0,-1; // -y,-x,-z - lmats[7] << 0,-1, 0, + lmats(7) << 0,-1, 0, -1, 0, 0, 0, 0,-1; @@ -168,17 +166,17 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P -3 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -y,x-y,z - lmats[1] << 0,-1, 0, + lmats(1) << 0,-1, 0, one_over_root2,-one_over_root2, 0, 0, 0, 1; // -x+y,-x,z - lmats[2] << -one_over_root2, one_over_root2, 0, + lmats(2) << -one_over_root2, one_over_root2, 0, -1, 0, 0, 0, 0, 1; @@ -188,32 +186,32 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P -3 m 1 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -y,x-y,z - lmats[1] << 0,-1, 0, + lmats(1) << 0,-1, 0, one_over_root2,-one_over_root2, 0, 0, 0, 1; // -x+y,-x,z - lmats[2] << -one_over_root2, one_over_root2, 0, + lmats(2) << -one_over_root2, one_over_root2, 0, -1, 0, 0, 0, 0, 1; // x-y,-y,-z - lmats[3] << one_over_root2,-one_over_root2, 0, + lmats(3) << one_over_root2,-one_over_root2, 0, 0,-1, 0, 0, 0,-1; // -x,-x+y,-z - lmats[4] << -1, 0, 0, + lmats(4) << -1, 0, 0, -one_over_root2, one_over_root2, 0, 0, 0,-1; // y,x,-z - lmats[5] << 0, 1, 0, + lmats(5) << 0, 1, 0, 1, 0, 0, 0, 0,-1; @@ -223,32 +221,32 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P -3 1 m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // -y,x-y,z - lmats[1] << 0,-1, 0, + lmats(1) << 0,-1, 0, one_over_root2,-one_over_root2, 0, 0, 0, 1; // -x+y,-x,z - lmats[2] << -one_over_root2, one_over_root2, 0, + lmats(2) << -one_over_root2, 
one_over_root2, 0, -1, 0, 0, 0, 0, 1; // -y,-x,-z - lmats[3] << 0,-1, 0, + lmats(3) << 0,-1, 0, -1, 0, 0, 0, 0,-1; // -x+y,y,-z - lmats[4] << -one_over_root2, one_over_root2, 0, + lmats(4) << -one_over_root2, one_over_root2, 0, 0, 1, 0, 0, 0,-1; // x,x-y,-z - lmats[5] << 1, 0, 0, + lmats(5) << 1, 0, 0, one_over_root2,-one_over_root2, 0, 0, 0,-1; @@ -258,32 +256,32 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 6/m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // x-y,x,z - lmats[1] << one_over_root2,-one_over_root2, 0, + lmats(1) << one_over_root2,-one_over_root2, 0, 1, 0, 0, 0, 0, 1; // y,-x+y,z - lmats[2] << 0, 1, 0, + lmats(2) << 0, 1, 0, -one_over_root2, one_over_root2, 0, 0, 0, 1; // -y,x-y,z - lmats[3] << 0,-1, 0, + lmats(3) << 0,-1, 0, one_over_root2,-one_over_root2, 0, 0, 0, 1; // -x+y,-x,z - lmats[4] << -one_over_root2, one_over_root2, 0, + lmats(4) << -one_over_root2, one_over_root2, 0, -1, 0, 0, 0, 0, 1; // -x,-y,z - lmats[5] << -1, 0, 0, + lmats(5) << -1, 0, 0, 0,-1, 0, 0, 0, 1; @@ -293,62 +291,62 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P 6/m m m // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // x-y,x,z - lmats[1] << one_over_root2,-one_over_root2, 0, + lmats(1) << one_over_root2,-one_over_root2, 0, 1, 0, 0, 0, 0, 1; // y,-x+y,z - lmats[2] << 0, 1, 0, + lmats(2) << 0, 1, 0, -one_over_root2, one_over_root2, 0, 0, 0, 1; // -y,x-y,z - lmats[3] << 0,-1, 0, + lmats(3) << 0,-1, 0, one_over_root2,-one_over_root2, 0, 0, 0, 1; // -x+y,-x,z - lmats[4] << -one_over_root2, one_over_root2, 0, + lmats(4) << -one_over_root2, one_over_root2, 0, -1, 0, 0, 0, 0, 1; // x-y,-y,-z - lmats[5] << one_over_root2,-one_over_root2, 0, + lmats(5) << one_over_root2,-one_over_root2, 0, 0,-1, 0, 0, 0,-1; // -x,-x+y,-z - lmats[6] << -1, 0, 0, + lmats(6) << -1, 0, 0, -one_over_root2, one_over_root2, 0, 0, 0,-1; // -x,-y,z - lmats[7] << -1, 0, 0, + lmats(7) << -1, 0, 0, 0,-1, 0, 0, 0, 1; // y,x,-z - lmats[8] << 0, 1, 0, + lmats(8) << 0, 1, 0, 1, 0, 0, 0, 0,-1; // -y,-x,-z - lmats[9] << 0,-1, 0, + lmats(9) << 0,-1, 0, -1, 0, 0, 0, 0,-1; // -x+y,y,-z - lmats[10] << -one_over_root2, one_over_root2, 0, + lmats(10) << -one_over_root2, one_over_root2, 0, 0, 1, 0, 0, 0,-1; // x,x-y,-z - lmats[11] << 1, 0, 0, + lmats(11) << 1, 0, 0, one_over_root2,-one_over_root2, 0, 0, 0,-1; @@ -358,62 +356,62 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P m -3 // x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // z,x,y - lmats[1] << 0, 0, 1, + lmats(1) << 0, 0, 1, 1, 0, 0, 0, 1, 0; // y,z,x - lmats[2] << 0, 1, 0, + lmats(2) << 0, 1, 0, 0, 0, 1, 1, 0, 0; // -y,-z,x - lmats[3] << 0,-1, 0, + lmats(3) << 0,-1, 0, 0, 0,-1, 1, 0, 0; // z,-x,-y - lmats[4] << 0, 0, 1, + lmats(4) << 0, 0, 1, -1, 0, 0, 0,-1, 0; // -y,z,-x - lmats[5] << 0,-1, 0, + lmats(5) << 0,-1, 0, 0, 0, 1, -1, 0, 0; // -z,-x,y - lmats[6] << 0, 0,-1, + lmats(6) << 0, 0,-1, -1, 0, 0, 0, 1, 0; // -z,x,-y - lmats[7] << 0, 0,-1, + lmats(7) << 0, 0,-1, 1, 0, 0, 0,-1, 0; // y,-z,-x - lmats[8] << 0, 1, 0, + lmats(8) << 0, 1, 0, 0, 0,-1, -1, 0, 0; // x,-y,-z - lmats[9] << 1, 0, 0, + lmats(9) << 1, 0, 0, 0,-1, 0, 0, 0,-1; // -x,y,-z - lmats[10] << -1, 0, 0, + lmats(10) << -1, 0, 0, 0, 1, 0, 0, 0,-1; // -x,-y,z - lmats[11] << -1, 0, 0, + lmats(11) << -1, 0, 0, 0,-1, 0, 0, 0, 1; @@ -423,122 +421,122 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { // P m -3 m // 
x,y,z - lmats[0] << 1, 0, 0, + lmats(0) << 1, 0, 0, 0, 1, 0, 0, 0, 1; // x,-z,y - lmats[1] << 1, 0, 0, + lmats(1) << 1, 0, 0, 0, 0,-1, 0, 1, 0; // x,z,-y - lmats[2] << 1, 0, 0, + lmats(2) << 1, 0, 0, 0, 0, 1, 0,-1, 0; // z,y,-x - lmats[3] << 0, 0, 1, + lmats(3) << 0, 0, 1, 0, 1, 0, -1, 0, 0; // -z,y,x - lmats[4] << 0, 0,-1, + lmats(4) << 0, 0,-1, 0, 1, 0, 1, 0, 0; // -y,x,z - lmats[5] << 0,-1, 0, + lmats(5) << 0,-1, 0, 1, 0, 0, 0, 0, 1; // y,-x,z - lmats[6] << 0, 1, 0, + lmats(6) << 0, 1, 0, -1, 0, 0, 0, 0, 1; // z,x,y - lmats[7] << 0, 0, 1, + lmats(7) << 0, 0, 1, 1, 0, 0, 0, 1, 0; // y,z,x - lmats[8] << 0, 1, 0, + lmats(8) << 0, 1, 0, 0, 0, 1, 1, 0, 0; // -y,-z,x - lmats[9] << 0,-1, 0, + lmats(9) << 0,-1, 0, 0, 0,-1, 1, 0, 0; // z,-x,-y - lmats[10] << 0, 0, 1, + lmats(10) << 0, 0, 1, -1, 0, 0, 0,-1, 0; // -y,z,-x - lmats[11] << 0,-1, 0, + lmats(11) << 0,-1, 0, 0, 0, 1, -1, 0, 0; // -z,-x,y - lmats[12] << 0, 0,-1, + lmats(12) << 0, 0,-1, -1, 0, 0, 0, 1, 0; // -z,x,-y - lmats[13] << 0, 0,-1, + lmats(13) << 0, 0,-1, 1, 0, 0, 0,-1, 0; // y,-z,-x - lmats[14] << 0, 1, 0, + lmats(14) << 0, 1, 0, 0, 0,-1, -1, 0, 0; // x,-y,-z - lmats[15] << 1, 0, 0, + lmats(15) << 1, 0, 0, 0,-1, 0, 0, 0,-1; // -x,y,-z - lmats[16] << -1, 0, 0, + lmats(16) << -1, 0, 0, 0, 1, 0, 0, 0,-1; // -x,-y,z - lmats[17] << -1, 0, 0, + lmats(17) << -1, 0, 0, 0,-1, 0, 0, 0, 1; // y,x,-z - lmats[18] << 0, 1, 0, + lmats(18) << 0, 1, 0, 1, 0, 0, 0, 0,-1; // -y,-x,-z - lmats[19] << 0,-1, 0, + lmats(19) << 0,-1, 0, -1, 0, 0, 0, 0,-1; // z,-y,x - lmats[20] << 0, 0, 1, + lmats(20) << 0, 0, 1, 0,-1, 0, 1, 0, 0; // -z,-y,-x - lmats[21] << 0, 0,-1, + lmats(21) << 0, 0,-1, 0,-1, 0, -1, 0, 0; // -x,z,y - lmats[22] << -1, 0, 0, + lmats(22) << -1, 0, 0, 0, 0, 1, 0, 1, 0; // -x,-z,-y - lmats[23] << -1, 0, 0, + lmats(23) << -1, 0, 0, 0, 0,-1, 0,-1, 0; @@ -546,87 +544,78 @@ int gen_laue_mats(int laue_group_num, KOKKOS_MAT3 *lmats, KOKKOS_MAT3 rpa) { } for (int i_mat=0; i_mat < num_mats; i_mat ++){ - lmats[i_mat] = lmats[i_mat] * rpa; + lmats(i_mat) = lmats(i_mat) * rpa; } return num_mats; }; KOKKOS_FUNCTION -void calc_diffuse_at_hkl(KOKKOS_VEC3 H_vec, KOKKOS_VEC3 H0, KOKKOS_VEC3 dHH, KOKKOS_VEC3 Hmin, KOKKOS_VEC3 Hmax, KOKKOS_VEC3 Hrange, KOKKOS_MAT3 Ainv, const vector_cudareal_t FhklLinear, int num_laue_mats, const KOKKOS_MAT3 *laue_mats, KOKKOS_MAT3 anisoG_local, KOKKOS_MAT3 anisoU_local, const KOKKOS_MAT3 *dG_dgam, bool refine_diffuse, CUDAREAL *I0, CUDAREAL *step_diffuse_param){ - CUDAREAL four_mpi_sq = 4.*M_PI*M_PI; - // loop over laue matrices - int num_stencil_points = (2*dHH[0] + 1) * (2*dHH[1] + 1) * (2*dHH[2] + 1); - bool h_bounded= (H0[0]+dHH[0]<=Hmax[0]) && (H0[0]-dHH[0]>=Hmin[0]) ; - bool k_bounded= (H0[1]+dHH[1]<=Hmax[1]) && (H0[1]-dHH[1]>=Hmin[1]) ; - bool l_bounded= (H0[2]+dHH[2]<=Hmax[2]) && (H0[2]-dHH[2]>=Hmin[2]) ; - if (h_bounded && k_bounded && l_bounded) { - int Fhkl_linear_index_0 = (H0[0]-Hmin[0]) * Hrange[1] * Hrange[2] - + (H0[1]-Hmin[1]) * Hrange[2] + (H0[2]-Hmin[2]); - CUDAREAL _F_cell_0 = FhklLinear(Fhkl_linear_index_0); - KOKKOS_MAT3 Ginv = anisoG_local.inverse(); - CUDAREAL anisoG_determ = anisoG_local.determinant(); - for (int hh=-dHH[0]; hh <= dHH[0]; hh++){ - for (int kk=-dHH[1]; kk <= dHH[1]; kk++){ - for (int ll=-dHH[2]; ll <= dHH[2]; ll++){ - CUDAREAL ID_this = 0; - CUDAREAL step_diffuse_param_this[6] = {0,0,0,0,0,0}; - int Fhkl_linear_index_this = (H0[0]+hh-Hmin[0]) * Hrange[1] * Hrange[2] - + (H0[1]+kk-Hmin[1]) * Hrange[2] + (H0[2]+ll-Hmin[2]); - CUDAREAL _F_cell_this = FhklLinear(Fhkl_linear_index_this); 
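// A minimal, self-contained sketch (hypothetical values and helper names, not part of this patch)
// checking that the incremental FhklLinear index used in the rewritten calc_diffuse_at_hkl further
// below matches the explicit (H0+dH - Hmin)-based form being removed here; the per-offset weight
// itself, (F(H0+dH)/F(H0))^2 / num_laue_mats / num_stencil_points, is unchanged by the rewrite.
#if 0
#include <cassert>
// Same layout assumed as FhklLinear in this file: h varies slowest, l varies fastest.
static int linear_index(int h, int k, int l,
                        int h_min, int k_min, int l_min,
                        int k_range, int l_range) {
    return (h - h_min) * k_range * l_range + (k - k_min) * l_range + (l - l_min);
}
int main() {
    const int h0 = 3, k0 = -2, l0 = 5;       // nearest Bragg index H0 (illustrative)
    const int hh = 1, kk = -1, ll = 2;       // stencil offset dH (illustrative)
    const int h_min = -10, k_min = -10, l_min = -10;
    const int k_range = 21, l_range = 21;
    const int base = linear_index(h0, k0, l0, h_min, k_min, l_min, k_range, l_range);
    const int full = linear_index(h0 + hh, k0 + kk, l0 + ll, h_min, k_min, l_min, k_range, l_range);
    // Incremental form used in the new kernel: offset relative to the index of H0.
    const int incremental = hh * k_range * l_range + kk * l_range + ll + base;
    assert(full == incremental);
    return 0;
}
#endif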
- CUDAREAL _this_diffuse_scale; - if (_F_cell_0 != 0.0) - _this_diffuse_scale = _F_cell_this/_F_cell_0; - else - _this_diffuse_scale = 1.0; - - _this_diffuse_scale *= _this_diffuse_scale/(CUDAREAL)num_laue_mats/ - (CUDAREAL)num_stencil_points; - // Use (a-b, a+b, c) as the principal axes of the diffuse model - // TODO: Add an option to select (a, b, c) as the principal axes - - for ( int iL = 0; iL < num_laue_mats; iL++ ){ - KOKKOS_VEC3 Q0 =Ainv*laue_mats[iL]*H0; - CUDAREAL exparg = four_mpi_sq*Q0.dot(anisoU_local*Q0); - CUDAREAL dwf = exp(-exparg); - KOKKOS_VEC3 H0_offset(H0[0]+hh, H0[1]+kk, H0[2]+ll); - KOKKOS_VEC3 delta_H_offset = H_vec - H0_offset; - KOKKOS_VEC3 delta_Q = Ainv*laue_mats[iL]*delta_H_offset; - KOKKOS_VEC3 anisoG_q = anisoG_local*delta_Q; - - CUDAREAL V_dot_V = anisoG_q.dot(anisoG_q); - CUDAREAL gamma_portion_denom = (1.+ V_dot_V* four_mpi_sq); - gamma_portion_denom *= gamma_portion_denom; - CUDAREAL gamma_portion = 8.*M_PI*anisoG_determ / - gamma_portion_denom; - CUDAREAL this_I_latt_diffuse = dwf*exparg*gamma_portion; - - ID_this += this_I_latt_diffuse; - if (refine_diffuse){ // add the contributions to diffuse scattering gradients here - for (int i_gam=0; i_gam<3; i_gam++){ - KOKKOS_VEC3 dV = dG_dgam[i_gam]*delta_Q; - CUDAREAL V_dot_dV = anisoG_q.dot(dV); - CUDAREAL deriv = (Ginv*dG_dgam[i_gam]).trace() - 4.*four_mpi_sq*V_dot_dV/(1+four_mpi_sq*V_dot_V); - step_diffuse_param_this[i_gam] += gamma_portion*deriv*dwf*exparg; - } - KOKKOS_MAT3 dU_dsigma; - // dU_dsigma << 0,0,0,0,0,0,0,0,0; - for (int i_sig = 0;i_sig<3; i_sig++){ - dU_dsigma(i_sig, i_sig) = 2.*sqrt(anisoU_local(i_sig,i_sig)); - CUDAREAL dexparg = four_mpi_sq*Q0.dot(dU_dsigma*Q0); - dU_dsigma(i_sig, i_sig) = 0.; - step_diffuse_param_this[i_sig+3] += gamma_portion*dwf*dexparg*(1. 
- exparg); - } - } - } // end loop over iL (laue group mats) - // Update the lattice interference term here to include diffuse scattering (F_latt squared) - *I0 += ID_this * _this_diffuse_scale; - - for (int idp=0; idp < 6; idp++) - step_diffuse_param[idp] += step_diffuse_param_this[idp]*_this_diffuse_scale; - } // end ll loop - } // end kk loop - } // end hh loop - } // end if bounded +void calc_diffuse_at_hkl(KOKKOS_VEC3 H_vec, KOKKOS_VEC3 H0, KOKKOS_VEC3 dHH, int h_min, int k_min, int l_min, int h_max, int k_max, int l_max, int h_range, int k_range, int l_range, KOKKOS_MAT3 Ainv, const vector_cudareal_t FhklLinear, int num_laue_mats, const vector_mat3_t laue_mats, KOKKOS_MAT3 anisoG_local, vector_cudareal_t dG_trace, CUDAREAL anisoG_determ, KOKKOS_MAT3 anisoU_local, const vector_vec3_t dG_dgam, bool refine_diffuse, CUDAREAL *I0, CUDAREAL *step_diffuse_param){ + constexpr CUDAREAL four_mpi_sq = 4.*M_PI*M_PI; + // loop over laue matrices + int num_stencil_points = (2*dHH[0] + 1) * (2*dHH[1] + 1) * (2*dHH[2] + 1); + bool h_bounded= (H0[0]+dHH[0]<=h_max) && (H0[0]-dHH[0]>=h_min) ; + bool k_bounded= (H0[1]+dHH[1]<=k_max) && (H0[1]-dHH[1]>=k_min) ; + bool l_bounded= (H0[2]+dHH[2]<=l_max) && (H0[2]-dHH[2]>=l_min) ; + if (h_bounded && k_bounded && l_bounded) { + int Fhkl_linear_index_0 = (H0[0]-h_min) * k_range * l_range + (H0[1]-k_min) * l_range + (H0[2]-l_min); + const CUDAREAL _F_cell_0 = FhklLinear(Fhkl_linear_index_0); + for (int hh=-dHH[0]; hh <= dHH[0]; hh++){ + for (int kk=-dHH[1]; kk <= dHH[1]; kk++){ + for (int ll=-dHH[2]; ll <= dHH[2]; ll++){ + CUDAREAL ID_this = 0; + CUDAREAL step_diffuse_param_this[6] = {0,0,0,0,0,0}; + const int Fhkl_linear_index_this = hh * k_range * l_range + kk * l_range + ll + Fhkl_linear_index_0; + const CUDAREAL _F_cell_this = FhklLinear(Fhkl_linear_index_this); + CUDAREAL _this_diffuse_scale; + if (_F_cell_0 != 0.0) + _this_diffuse_scale = _F_cell_this/_F_cell_0; + else + _this_diffuse_scale = 1.0; + + _this_diffuse_scale *= _this_diffuse_scale/(CUDAREAL)num_laue_mats/(CUDAREAL)num_stencil_points; + + const KOKKOS_VEC3 H0_offset(H0[0]+hh, H0[1]+kk, H0[2]+ll); + const KOKKOS_VEC3 delta_H_offset = H_vec - H0_offset; + + for ( int iL = 0; iL < num_laue_mats; iL++ ){ + const KOKKOS_VEC3 Q0 = laue_mats(iL)*H0; + const CUDAREAL exparg = four_mpi_sq*Q0.dot(anisoU_local*Q0); + const CUDAREAL dwf = exp(-exparg); + const KOKKOS_VEC3 delta_Q = laue_mats(iL)*delta_H_offset; + const KOKKOS_VEC3 anisoG_q = anisoG_local*delta_Q; + + const CUDAREAL V_dot_V = anisoG_q.length_sqr(); + const CUDAREAL gamma_portion_denom = 1 / (1.+ V_dot_V* four_mpi_sq); + const CUDAREAL gamma_portion = 8.*M_PI*anisoG_determ * gamma_portion_denom * gamma_portion_denom; + const CUDAREAL this_I_latt_diffuse = dwf*exparg*gamma_portion; + + ID_this += this_I_latt_diffuse; + if (refine_diffuse){ // add the contributions to diffuse scattering gradients here + for (int i_gam=0; i_gam<3; i_gam++){ + const CUDAREAL dV = dG_dgam(i_gam).dot(delta_Q); + const CUDAREAL V_dot_dV = anisoG_q[i_gam] * dV; + const CUDAREAL deriv = dG_trace(i_gam) - 4.*four_mpi_sq*V_dot_dV*gamma_portion_denom; + step_diffuse_param_this[i_gam] += gamma_portion*deriv*dwf*exparg; + } + for (int i_sig = 0;i_sig<3; i_sig++){ + const CUDAREAL dexparg = 2 * four_mpi_sq * sqrt(anisoU_local(i_sig,i_sig)) * Q0[i_sig] * Q0[i_sig]; + step_diffuse_param_this[i_sig+3] += gamma_portion*dwf*dexparg*(1. 
- exparg); + } + } + } // end loop over iL (laue group mats) + // Update the lattice interference term here to include diffuse scattering (F_latt squared) + *I0 += ID_this * _this_diffuse_scale; + + if (refine_diffuse) { + for (int idp=0; idp < 6; idp++) { + step_diffuse_param[idp] += step_diffuse_param_this[idp]*_this_diffuse_scale; + } + } + } // end ll loop + } // end kk loop + } // end hh loop + } // end if bounded } #endif diff --git a/simtbx/diffBragg/src/util.h b/simtbx/diffBragg/src/util.h index 3cf9360e55..ec197c1dc9 100644 --- a/simtbx/diffBragg/src/util.h +++ b/simtbx/diffBragg/src/util.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifndef CUDAREAL #define CUDAREAL double @@ -19,14 +20,35 @@ typedef Eigen::Matrix MAT3; typedef std::vector > eigMat3_vec; typedef std::vector > eigVec3_vec; +inline void easy_time(double& timer, struct timeval& t, bool recording){ + double before_sec = t.tv_sec; + double before_usec = t.tv_usec; + gettimeofday(&t, 0); + double time = (1000000.0 * (t.tv_sec - before_sec) + t.tv_usec - before_usec) / 1000.0; + if (recording) + timer += time; +} + struct timer_variables{ - CUDAREAL add_spots_pre=0; // times the initializations for add spots kernel - CUDAREAL add_spots_post=0; // times the copies that occur after add spots kernel - CUDAREAL add_spots_kernel_wrapper=0; // times the add spots kernel overall, either CPU or GPU - CUDAREAL cuda_alloc=0; // times the allocation of the device - CUDAREAL cuda_copy_to_dev=0; // times the copying from host to device - CUDAREAL cuda_copy_from_dev=0; // times the copying back from device to host - CUDAREAL cuda_kernel=0; // times the GPU kernel + double add_spots_pre=0; // times the initializations for add spots kernel + double add_spots_post=0; // times the copies that occur after add spots kernel + double add_spots_kernel_wrapper=0; // times the add spots kernel overall, either CPU or GPU + double cuda_alloc=0; // times the allocation of the device + double cuda_copy_to_dev=0; // times the copying from host to device + double cuda_copy_from_dev=0; // times the copying back from device to host + double cuda_kernel=0; // times the GPU kernel + double copy_sources=0; + double copy_Fhkl_scale=0; + double copy_umats=0; + double copy_amats=0; + double copy_bmats=0; + double copy_rotmats=0; + double copy_det=0; + double copy_nomhkl=0; + double copy_flags=0; + double copy_fhkl=0; + double copy_detderiv=0; + double copy_pfs=0; int timings=0; // how many times these variables were incremented bool recording=true; }; @@ -72,16 +94,18 @@ struct cuda_flags{ int Npix_to_allocate; // how much space to allocate for simulating forward model and gradients // these following flags indicate whether to update quantities on the GPU device prior to running the kernel // ( of course they are all set prior to running the kernel for the first time) - bool update_step_positions; // step arrays - bool update_panels_fasts_slows; // pixels to simulatoe (panel id, fast scan, slow scan) - bool update_sources; // beam sources - bool update_umats; // umatrices for mosaic blocks - bool update_dB_mats; // derivative of the orthogonalization matrix (for unit cell derivatives) - bool update_rotmats; // rotation matrices (for Umat derivatives) - bool update_Fhkl; // structure factors - bool update_detector; // detector vectors (origin, slow-axis, fast-axis, orth-axis) - bool update_refine_flags; // refinement flags (in case one is iteratively freezing parameters) - bool update_panel_deriv_vecs; // if one is refining the detector vectors) + 
bool update_step_positions = false; // step arrays + bool update_panels_fasts_slows = false; // pixels to simulatoe (panel id, fast scan, slow scan) + bool update_sources = false; // beam sources + bool update_umats = false; // umatrices for mosaic blocks + bool update_dB_mats = false; // derivative of the orthogonalization matrix (for unit cell derivatives) + bool update_rotmats = false; // rotation matrices (for Umat derivatives) + bool update_Fhkl = false; // structure factors + bool update_Fhkl_scales = false; // structure factors + bool update_Fhkl_channels = false; // structure factors + bool update_detector = false; // detector vectors (origin, slow-axis, fast-axis, orth-axis) + bool update_refine_flags = false; // refinement flags (in case one is iteratively freezing parameters) + bool update_panel_deriv_vecs = false; // if one is refining the detector vectors) }; struct flags{ @@ -91,30 +115,30 @@ struct flags{ bool using_trusted_mask=false; bool Fhkl_gradient_mode=false; bool wavelength_img=false; - bool track_Fhkl; // for CPU kernel only, track the HKLS evaluated in the inner most loop - bool printout; // whether to printout debug info for a pixel - bool nopolar; // disable polarization effects - bool point_pixel; // approximate solid angle effects - bool only_save_omega_kahn; // only save the polarization and solid angle corrections (deprecated) - bool compute_curvatures; // whether to compute the curvatures in addition to gradients - bool isotropic_ncells; // one mosaic domain parameter - bool complex_miller; // is the miller array complex (such thet Fhkl_linear and Fhkl2_linear are both defined) - bool no_Nabc_scale; // no Nabc prefactor - bool refine_diffuse; // flag for computing diffuse gradients + bool track_Fhkl = false; // for CPU kernel only, track the HKLS evaluated in the inner most loop + bool printout = false; // whether to printout debug info for a pixel + bool nopolar = false; // disable polarization effects + bool point_pixel = false; // approximate solid angle effects + bool only_save_omega_kahn = false; // only save the polarization and solid angle corrections (deprecated) + bool compute_curvatures = false; // whether to compute the curvatures in addition to gradients + bool isotropic_ncells = false; // one mosaic domain parameter + bool complex_miller = false; // is the miller array complex (such thet Fhkl_linear and Fhkl2_linear are both defined) + bool no_Nabc_scale = false; // no Nabc prefactor + bool refine_diffuse = false; // flag for computing diffuse gradients std::vector refine_Bmat; // Bmatrix std::vector refine_Ncells; // mosaic domain size - bool refine_Ncells_def; // mosaic domain size off diag + bool refine_Ncells_def = false; // mosaic domain size off diag std::vector refine_panel_origin; // panel shift std::vector refine_panel_rot; // detector panel rotation - bool refine_fcell; // structure factor + bool refine_fcell = false; // structure factor std::vector refine_lambda; // spectrum affine correction - bool refine_eta; // mosaic spread + bool refine_eta = false; // mosaic spread std::vector refine_Umat; // missetting angle umatrix - bool refine_fp_fdp; // fprime and fbl prime - bool use_lambda_coefficients; // affine correction lam0 , lam1 - bool oversample_omega; // omega is computed separately for each sub-pixel - int printout_fpixel, printout_spixel; // debug printout pixel (fast scan, slow scan) // TODO add panel id - int verbose; // nanoBragg verbosity flag + bool refine_fp_fdp = false; // fprime and fbl prime + bool 
use_lambda_coefficients = false; // affine correction lam0 , lam1 + bool oversample_omega = false; // omega is computed separately for each sub-pixel + int printout_fpixel = 0, printout_spixel = 0; // debug printout pixel (fast scan, slow scan) // TODO add panel id + int verbose = 0; // nanoBragg verbosity flag bool use_diffuse = false; // model diffuse bool only_diffuse = false; // model diffuse scattering (experimental) bool refine_Icell = false; // option to refine the structure factor intensity directly (F_cell^2) @@ -203,7 +227,10 @@ struct beam{ struct detector{ std::vector > dF_vecs; // derivative of the panel fast direction std::vector > dS_vecs; // derivative of the panel slow direction - CUDAREAL detector_thickstep, detector_thicksteps, detector_thick, detector_attnlen; + CUDAREAL detector_thickstep; + int detector_thicksteps; + CUDAREAL detector_thick; + CUDAREAL detector_attnlen; std::vector close_distances; // offsets to the detector origins (Z direction) int oversample; // determines the pixel subsampling rate CUDAREAL subpixel_size, pixel_size; diff --git a/simtbx/diffBragg/src/util_kokkos.h b/simtbx/diffBragg/src/util_kokkos.h index f74a6d35be..7f0a8da0df 100644 --- a/simtbx/diffBragg/src/util_kokkos.h +++ b/simtbx/diffBragg/src/util_kokkos.h @@ -23,29 +23,37 @@ inline KOKKOS_VEC3 to_vec3(const Eigen::Vector3d& v) { inline KOKKOS_MAT3 to_mat3(const Eigen::Matrix3d& m) { // Eigen matrix is column-major! - return KOKKOS_MAT3(m(0, 0), m(0, 1), m(0, 2), m(1, 0), m(1, 1), m(1, 2), m(2, 0), m(2, 1), m(2, 2)); + return KOKKOS_MAT3(m(0, 0), m(0, 1), m(0, 2), + m(1, 0), m(1, 1), m(1, 2), + m(2, 0), m(2, 1), m(2, 2)); } template -inline void transfer(T& target, U& source, int size=-1) { - const int length = size>=0 ? size : source.size(); +inline void transfer(T& dst, U& src, int size=-1) { + const int length = size>=0 ? 
size : src.size(); for (int i=0; i -inline void transfer_KOKKOS_VEC3(T& target, U& source) { - for (int i=0; i -inline void transfer_KOKKOS_MAT3(T& target, U& source) { - for (int i=0; i dF_vecs; // derivative of the panel fast direction std::vector dS_vecs; // derivative of the panel slow direction - CUDAREAL detector_thickstep, detector_thicksteps, detector_thick, detector_attnlen; + CUDAREAL detector_thickstep; + int detector_thicksteps; + CUDAREAL detector_thick; + CUDAREAL detector_attnlen; std::vector close_distances; // offsets to the detector origins (Z direction) int oversample; // determines the pixel subsampling rate CUDAREAL subpixel_size, pixel_size; @@ -268,4 +279,34 @@ struct kokkos_detector { }; +struct kokkos_manager { + KOKKOS_VEC3 rot; + double ucell[6] = {0, 0, 0, 0, 0, 0}; + double Ncells[6] = {0, 0, 0, 0, 0, 0}; + KOKKOS_VEC3 pan_orig; + KOKKOS_VEC3 pan_rot; + double fcell = 0; + KOKKOS_VEC3 eta; + double lambda[2] = {0, 0}; + double fp_fdp[2] = {0, 0}; + double diffuse[6] = {0, 0, 0, 0, 0, 0}; + + KOKKOS_INLINE_FUNCTION void reset() { + for (int i=0; i<6; ++i) { + ucell[i] = 0; + Ncells[i] = 0; + diffuse[i] = 0; + } + for (int i=0; i<2; ++i) { + lambda[i] = 0; + fp_fdp[i] = 0; + } + rot.zero(); + pan_orig.zero(); + pan_rot.zero(); + eta.zero(); + fcell = 0; + } +}; + #endif diff --git a/simtbx/diffBragg/stage_two_utils.py b/simtbx/diffBragg/stage_two_utils.py index bc7df9f761..a73c8ac30c 100644 --- a/simtbx/diffBragg/stage_two_utils.py +++ b/simtbx/diffBragg/stage_two_utils.py @@ -113,11 +113,8 @@ def PAR_from_params(params, experiment, best=None): PAR.Ndef = [None]*3 PAR.eta = [None]*3 PAR.RotXYZ_params = [None]*3 - - if not params.use_restraints or params.fix.ucell: - # dummie values: - params.centers.ucell = [1, 1, 1, 1, 1, 1] - params.betas.ucell = [1,1,1,1,1,1] + PAR.diffuse_sigma = [None]*3 + PAR.diffuse_gamma = [None]*3 eta_min = params.mins.eta_abc init_eta = params.init.eta_abc if best is None else best.eta_abc.values[0] @@ -126,28 +123,46 @@ def PAR_from_params(params, experiment, best=None): if not params.simulator.crystal.num_mosaicity_samples == 1: raise ValueError("if all eta_abc are 0,0,0, num_mosaicity_samples should be 1") + # TODO allow setting diffuse gamma/sigma from stage 1 (e.g. 
from the `best` dataframe) + init_diffuse_sigma = params.init.diffuse_sigma + init_diffuse_gamma = params.init.diffuse_gamma + for i in range(3): initN = params.init.Nabc[i] if best is None else best.ncells.values[0][i] PAR.Nabc[i] = ParameterType(init=initN, minval=params.mins.Nabc[i], maxval=params.maxs.Nabc[i], fix=params.fix.Nabc, sigma=params.sigmas.Nabc[i], - center=params.centers.Nabc[i], beta=params.betas.Nabc[i]) + center=params.centers.Nabc[i] if params.centers.Nabc is not None else None, + beta=params.betas.Nabc[i] if params.betas.Nabc is not None else None) initN = params.init.Ndef[i] if best is None else best.ncells_def.values[0][i] PAR.Ndef[i] = ParameterType(init=initN, minval=params.mins.Ndef[i], maxval=params.maxs.Ndef[i], fix=params.fix.Ndef, sigma=params.sigmas.Ndef[i], - center=params.centers.Ndef[i], beta=params.betas.Ndef[i]) + center=params.centers.Ndef[i] if params.centers.Ndef is not None else None, + beta=params.betas.Ndef[i] if params.betas.Ndef is not None else None) PAR.RotXYZ_params[i] = ParameterType(init=0, minval=params.mins.RotXYZ[i], maxval=params.maxs.RotXYZ[i], fix=params.fix.RotXYZ, sigma=params.sigmas.RotXYZ[i], - center=0, beta=params.betas.RotXYZ) + center=0 if params.betas.RotXYZ is not None else None, beta=params.betas.RotXYZ) PAR.eta[i] = ParameterType(init=init_eta[i], minval=eta_min[i], maxval=params.maxs.eta_abc[i], fix=params.fix.eta_abc, sigma=params.sigmas.eta_abc[i], - center=params.betas.eta_abc[i], beta=params.betas.eta_abc[i]) + center=params.betas.eta_abc[i] if params.centers.eta_abc is not None else None, + beta=params.betas.eta_abc[i] if params.betas.eta_abc is not None else None) # TODO: diffuse scattering terms + PAR.diffuse_sigma[i] = ParameterType(init=init_diffuse_sigma[i], minval=params.mins.diffuse_sigma[i], + maxval=params.maxs.diffuse_sigma[i], fix=params.fix.diffuse_sigma, + sigma=params.sigmas.diffuse_sigma[i], + center=params.centers.diffuse_sigma[i] if params.centers.diffuse_sigma is not None else None, + beta=params.betas.diffuse_sigma[i] if params.betas.diffuse_sigma is not None else None) + + PAR.diffuse_gamma[i] = ParameterType(init=init_diffuse_gamma[i], minval=params.mins.diffuse_gamma[i], + maxval=params.maxs.diffuse_gamma[i], fix=params.fix.diffuse_gamma, + sigma=params.sigmas.diffuse_gamma[i], + center=params.centers.diffuse_gamma[i] if params.centers.diffuse_gamma is not None else None, + beta=params.betas.diffuse_gamma[i] if params.betas.diffuse_gamma is not None else None) # unit cell parameters ucell_man = utils.manager_from_crystal(experiment.crystal) # Note ucell man contains the best parameters (if best is not None) @@ -157,12 +172,34 @@ def PAR_from_params(params, experiment, best=None): if "Ang" in name: minval = val - ucell_vary_perc * val maxval = val + ucell_vary_perc * val + if name == 'a_Ang': + cent = params.centers.ucell_a + beta = params.betas.ucell_a + elif name == 'b_Ang': + cent = params.centers.ucell_b + beta = params.betas.ucell_b + else: + cent = params.centers.ucell_c + beta = params.betas.ucell_c else: val_in_deg = val * 180 / np.pi minval = (val_in_deg - params.ucell_ang_abs) * np.pi / 180. maxval = (val_in_deg + params.ucell_ang_abs) * np.pi / 180. 
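The "value if params.centers.X is not None else None" guard introduced here repeats for Nabc, Ndef, eta_abc, the new diffuse_sigma/diffuse_gamma terms and, further down, the spectrum coefficients, so restraint centers and betas simply pass through as None when no restraint is configured. A small helper in that spirit (purely illustrative, not part of the patch) makes the intent explicit; note also that the eta_abc entry reads its center from params.betas.eta_abc rather than params.centers.eta_abc, carried over from the old code, which may be intentional but is worth a second look.

    def restraint_term(values, i):
        # values is e.g. params.centers.Nabc; the phil scope leaves it as None
        # when no restraint is requested for that parameter group
        return None if values is None else values[i]

    # e.g. center=restraint_term(params.centers.Nabc, i),
    #      beta=restraint_term(params.betas.Nabc, i)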
+ if name == 'alpha_rad': + cent = params.centers.ucell_alpha + beta = params.betas.ucell_alpha + elif name == 'beta_rad': + cent = params.centers.ucell_beta + beta = params.betas.ucell_beta + else: + cent = params.centers.ucell_gamma + beta = params.betas.ucell_gamma + assert cent is not None + assert beta is not None + cent = cent * np.pi / 180. + p = ParameterType(init=val, minval=minval, maxval=maxval, fix=params.fix.ucell, sigma=params.sigmas.ucell[i_uc], - center=params.centers.ucell[i_uc], beta=params.betas.ucell[i_uc]) + center=cent, beta=beta) PAR.ucell.append(p) PAR.ucell_man = ucell_man @@ -173,7 +210,7 @@ def PAR_from_params(params, experiment, best=None): center=params.centers.detz_shift, beta=params.betas.detz_shift) PAR.B = ParameterType(init=params.init.B, sigma=params.sigmas.B, minval=params.mins.B, maxval=params.maxs.B, fix=True, - center=0, beta=1e8) + center=params.centers.B, beta=params.betas.B) lam0, lam1 = params.init.spec if best is not None: @@ -181,8 +218,8 @@ def PAR_from_params(params, experiment, best=None): PAR.spec_coef = [] for i_p, init_val in enumerate((lam0, lam1)): p = ParameterType(init=init_val, sigma=params.sigmas.spec[i_p], - center=params.centers.spec[i_p], - beta=params.betas.spec[i_p], + center=params.centers.spec[i_p] if params.centers.spec is not None else None, + beta=params.betas.spec[i_p] if params.betas.spec is not None else None, fix=params.fix.spec, minval=params.mins.spec[i_p], maxval=params.maxs.spec[i_p]) PAR.spec_coef.append(p) @@ -203,3 +240,5 @@ def __init__(self): self.detz_shift = None self.paneRot = None self.PanXYZ = None + self.diffuse_sigma = None + self.diffuse_gamma = None diff --git a/simtbx/diffBragg/tests/tst_diffBragg_Fcell_deriv.py b/simtbx/diffBragg/tests/tst_diffBragg_Fcell_deriv.py index a24b0bee4a..45e0b6ae98 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_Fcell_deriv.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_Fcell_deriv.py @@ -152,6 +152,7 @@ assert l.rvalue > .9999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("Error versus parameter shift fits a line with slope=%2.7g and Correleation Coef=%2.7g" % (l.slope, l.rvalue)) print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_detdist_derivatives.py b/simtbx/diffBragg/tests/tst_diffBragg_detdist_derivatives.py index 55f30ef84f..7c14318a5d 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_detdist_derivatives.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_detdist_derivatives.py @@ -372,6 +372,8 @@ assert l.rvalue > .99 assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope + if args.curvatures: if args.plotlines: @@ -387,5 +389,7 @@ assert l.rvalue > .99 assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope + print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_diffuse_properties.py b/simtbx/diffBragg/tests/tst_diffBragg_diffuse_properties.py index ca64bc1e98..c09f28597f 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_diffuse_properties.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_diffuse_properties.py @@ -122,6 +122,7 @@ assert l.rvalue > .9999 # this is definitely a line! 
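The l.intercept < 0.1*l.slope assertion being added across these derivative tests tightens the usual finite-difference check: each test perturbs one parameter over a range of step sizes, records the discrepancy between the finite-difference and analytical derivatives, and regresses that error against the step size. For a correct first-order derivative the error is, to leading order, proportional to the step, so in addition to a positive slope and r close to 1 the fitted line should pass essentially through the origin. A standalone sketch of the shared pattern, with synthetic numbers rather than values from any of the tests:

    import numpy as np
    from scipy.stats import linregress

    # stand-ins for the per-test data: parameter shifts and |finite diff - analytic| errors
    shifts = np.array([1, 2, 4, 8, 16]) * 1e-4
    errors = 3.0 * shifts + np.random.default_rng(0).normal(0, 1e-7, size=shifts.size)

    l = linregress(shifts, errors)
    assert l.rvalue > 0.99              # error grows linearly with the shift
    assert l.slope > 0
    assert l.pvalue < 1e-6
    assert l.intercept < 0.1 * l.slope  # and the line passes (nearly) through the origin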
assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin det_sh = 1024, 1024 print("OK") diff --git a/simtbx/diffBragg/tests/tst_diffBragg_eta_derivs.py b/simtbx/diffBragg/tests/tst_diffBragg_eta_derivs.py index ab3a3c5dd8..d1c0c79819 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_eta_derivs.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_eta_derivs.py @@ -99,7 +99,7 @@ SIM.add_air = True SIM.add_water = True SIM.include_noise = True - SIM.D.use_cuda = args.kokkos + SIM.D.use_gpu = args.kokkos SIM.D.compute_curvatures = args.curvatures SIM.D.add_diffBragg_spots() @@ -196,10 +196,12 @@ assert l.rvalue > .99, "%f" % l.rvalue assert l.slope > 0, "%f" % l.slope assert l.pvalue < 1e-6, "%f" % l.pvalue + assert l.intercept < 0.1*l.slope, "%f" % l.intercept if args.curvatures: l = linregress(all_shifts2, all_errors2) assert l.rvalue > .9999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine.py b/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine.py index 1609175af1..8c4e0f12ea 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine.py @@ -194,6 +194,13 @@ P.fix.detz_shift = True P.ftol=1e-15 + if args.perturb == ["detz_shift"]: + P.fix.detz_shift = False + P.fix.ucell=True + P.fix.Nabc=True + P.fix.G=True + P.fix.RotXYZ=True + E.detector = SIM.detector E.beam = SIM.D.beam E.imageset = make_imageset([img], E.beam, E.detector) @@ -219,6 +226,7 @@ P.simulator.structure_factors.mtz_name = mtz_name P.simulator.structure_factors.mtz_column = "F(+),F(-)" P.niter = 0 + P.sigmas.RotXYZ = [1,1,1] P.logging.parameters=True P.niter_per_J = 1 P.method="L-BFGS-B" @@ -268,7 +276,10 @@ # TODO open the pandas output file and optimized expt in outdir and verify the optimized parameters are similar to ground exit() + P.record_device_timings = True Eopt,_, Mod, SIM_used_by_hopper, x = hopper_utils.refine(E, refls, P, spec=spec, return_modeler=True) + if SIM_used_by_hopper.D.record_timings: + SIM_used_by_hopper.D.show_timings(MPI_RANK=0) G, rotX,rotY, rotZ, Na,Nb,Nc,_,_,_,_,_,_,_,_,_,a,b,c,al,be,ga,detz_shift = hopper_utils.get_param_from_x(x, Mod) eta_abc_opt = hopper_utils.get_mosaicity_from_x(x, Mod, SIM_used_by_hopper) diff --git a/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine_Fhkl.py b/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine_Fhkl.py index 5846564feb..5be8e6b274 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine_Fhkl.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_hopper_refine_Fhkl.py @@ -129,6 +129,7 @@ SIM.D.raw_pixels *= 0 P = phil_scope.extract() + P.debug_mode=True E = Experiment() P.init.G = SIM.D.spot_scale @@ -215,7 +216,9 @@ P.outdir="_temp_fhkl_refine" if args.maxiter is not None: P.lbfgs_maxiter = args.maxiter + P.record_device_timings = True Eopt,_, Mod,SIM_from_hopper, x = hopper_utils.refine(E, refls, P, return_modeler=True, free_mem=False) + SIM_from_hopper.D.show_timings(0) logging.disable() print("\nResults\n<><><><><><>") @@ -331,10 +334,18 @@ def compute_r_factor_with_gt(corrections): df[refl_col] = [input_refl] P.refiner.load_data_from_refl = True P.refiner.check_expt_format = False + + #from simtbx.diffBragg import mpi_logger + #P.logging.rank0_level="high" + #mpi_logger.setup_logging_from_params(P) modelers = 
load_inputs(df, P, exper_key="opt_exp_name", refls_key=refl_col) modelers.outdir=P.outdir modelers.prep_for_refinement() + print("Minimizing using hopper_ensemble_utils...") modelers.Minimize(save=True) + if modelers.SIM.D.record_timings: + modelers.SIM.D.show_timings(MPI_RANK=0) + print("Done!") from iotbx.reflection_file_reader import any_reflection_file opt_F = any_reflection_file("_temp_fhkl_refine/optimized_channel0.mtz").as_miller_arrays()[0] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_lambda_coefficients.py b/simtbx/diffBragg/tests/tst_diffBragg_lambda_coefficients.py index 386f6cd212..da7557b3b4 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_lambda_coefficients.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_lambda_coefficients.py @@ -155,6 +155,7 @@ assert l.rvalue > .9999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py b/simtbx/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py index 9bf3dfd801..4084080cf6 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py @@ -136,11 +136,13 @@ assert l.rvalue > .999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-5 + assert l.intercept < 0.1*l.slope # line should go through origin if args.curvatures: l = linregress(shifts2, all_error2) assert l.rvalue > .999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-5 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_ncells_property.py b/simtbx/diffBragg/tests/tst_diffBragg_ncells_property.py index 70e3d28eb7..c3310fb5ca 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_ncells_property.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_ncells_property.py @@ -128,11 +128,13 @@ assert l.rvalue > .9999, l # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin if args.curvatures: l = linregress(shifts2, all_error2) assert l.rvalue > .9999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py b/simtbx/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py index 8c9dee4013..42a2011ecf 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py @@ -137,11 +137,13 @@ assert l.rvalue > .9999 # this is definitely a line! assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin if args.curvatures: l = linregress(shifts2, all_error2) assert l.rvalue > .9999 # this is definitely a line! 
assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_panelXY_derivs.py b/simtbx/diffBragg/tests/tst_diffBragg_panelXY_derivs.py index 2578fa7136..f542ed9372 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_panelXY_derivs.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_panelXY_derivs.py @@ -121,6 +121,7 @@ assert l.rvalue > .99 assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope # line should go through origin print("OK!") for name in find_diffBragg_instances(globals()): del globals()[name] diff --git a/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ.py b/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ.py index 730f8f9db9..2e93bc87e1 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ.py @@ -24,7 +24,7 @@ def main(): print (angles_XYZ*180 / np.pi) D = get_diffBragg_instance() - D.use_cuda = args.kokkos + D.use_gpu = args.kokkos rotX, rotY, rotZ = 0, 1, 2 D.refine(rotX) # rotX diff --git a/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ_deriv.py b/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ_deriv.py index 0bd0b89f9f..d8009ed6fa 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ_deriv.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_rotXYZ_deriv.py @@ -59,7 +59,7 @@ D.refine(rot_idx) D.initialize_managers() D.set_value(rot_idx, 0) - D.use_cuda = args.kokkos + D.use_gpu = args.kokkos #D.printout_pixel_fastslow = 786, 567 D.add_diffBragg_spots() img0 = D.raw_pixels_roi.as_numpy_array() @@ -157,11 +157,13 @@ assert l.rvalue > .9999, "%2.7g" % l.rvalue assert l.slope > 0, "%2.7g" % l.slope assert l.pvalue < 1e-6, "%2.7g" % l.pvalue + assert l.intercept < 0.1*l.slope, "%2.7g" % l.intercept if args.curvatures: l = stats.linregress(delta_h2, error_vals2) assert l.rvalue > .9999, "2nd deriv rvalue %2.7g" % l.rvalue assert l.slope > 0, "2nd deriv slope %2.7g" % l.slope assert l.pvalue < 1e-6, "2nd deriv pvalue %2.7g" % l.pvalue + assert l.intercept < 0.1*l.slope, "2nd deriv pvalue %2.7g" % l.intercept if args.kokkos: D.gpu_free() print("OK!") diff --git a/simtbx/diffBragg/tests/tst_diffBragg_unitcell_property.py b/simtbx/diffBragg/tests/tst_diffBragg_unitcell_property.py index d138a671db..90ef89420d 100644 --- a/simtbx/diffBragg/tests/tst_diffBragg_unitcell_property.py +++ b/simtbx/diffBragg/tests/tst_diffBragg_unitcell_property.py @@ -1,12 +1,10 @@ from __future__ import division -##from simtbx.kokkos import gpu_instance -#kokkos_run = gpu_instance(deviceId = 0) from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument("--plot", action='store_true') parser.add_argument("--crystalsystem", default='tetragonal', - choices=["monoclinic", "tetragonal"]) + choices=["monoclinic", "tetragonal", "hexagonal"]) parser.add_argument("--curvatures", action='store_true') parser.add_argument("--kokkos", action="store_true") args = parser.parse_args() @@ -38,6 +36,9 @@ if args.crystalsystem=="tetragonal": ucell = (55, 55, 77, 90, 90, 90) symbol = "P43212" + elif args.crystalsystem=="hexagonal": + ucell = (55, 55, 77, 90, 90, 120) + symbol = "P6522" else: # args.crystalsystem == "monoclinic" ucell = (70, 60, 50, 90.0, 110, 90.0) symbol = "C121" @@ -76,6 +77,7 @@ # and our dxtbx crystal created above D = SIM.D D.progress_meter = True + D.compute_curvatures = args.curvatures # STEP6: # initialize the derivative managers for the unit cell 
parameters @@ -178,26 +180,26 @@ cc_vals2.append(r2) if args.plot: plt.subplot(121) - plt.imshow(finite_deriv) + plt.imshow(finite_deriv.reshape((img_sh))) plt.title("finite diff") plt.subplot(122) - plt.imshow(analy_deriv) + plt.imshow(analy_deriv.reshape((img_sh))) plt.title("analytical") plt.draw() plt.suptitle("Shift %d / %d" % (i_shift+1, len(shifts))) - plt.pause(0.8) + plt.pause(1.2) if args.curvatures: plt.subplot(121) - plt.imshow(finite_second_deriv) + plt.imshow(finite_second_deriv.reshape((img_sh))) plt.title("finite second diff") plt.subplot(122) - plt.imshow(second_derivs[i_param]) + plt.imshow(second_derivs[i_param].reshape((img_sh))) plt.title("analytical") plt.draw() plt.suptitle("Shift %d / %d" % (i_shift + 1, len(shifts))) - plt.pause(0.8) + plt.pause(1.2) l = linregress(h_vals, error) @@ -205,6 +207,7 @@ assert l.rvalue > .99 assert l.slope > 0 assert l.pvalue < 1e-6 + assert l.intercept < 0.1*l.slope if args.curvatures: l2 = linregress(np.array(h_vals)**2, error2) @@ -212,6 +215,7 @@ assert l2.rvalue > .99 assert l2.slope > 0 assert l2.pvalue < 1e-6 + assert l2.intercept < 0.1*l2.slope if args.plot: plt.close() diff --git a/simtbx/diffBragg/utils.py b/simtbx/diffBragg/utils.py index 3692633de0..28e7857820 100644 --- a/simtbx/diffBragg/utils.py +++ b/simtbx/diffBragg/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, print_function import os +import socket import sys import re from io import StringIO @@ -29,6 +30,8 @@ from cctbx.eltbx import henke from simtbx.diffBragg import psf from dials.algorithms.shoebox import MaskCode +from xfel.merging.application.utils.memory_usage import get_memory_usage + import logging MAIN_LOGGER = logging.getLogger("diffBragg.main") @@ -81,14 +84,22 @@ def label_background_pixels(roi_img, thresh=3.5, iterations=1, only_high=True): while iterations > 0: if background_pixels is None: outliers = is_outlier(img1, thresh) - m = np.median(img1[~outliers]) + inlier_vals = img1[~outliers] + if inlier_vals.size: + m = np.median(inlier_vals) + else: + m = np.nan if only_high: outliers = np.logical_and(outliers, img1 > m) background_pixels = ~outliers else: where_bg = np.where(background_pixels)[0] outliers = is_outlier(img1[background_pixels], thresh) - m = np.median(img1[background_pixels][~outliers]) + inlier_vals = img1[background_pixels][~outliers] + if inlier_vals.size: + m = np.median(inlier_vals) + else: + m = np.nan if only_high: outliers = np.logical_and(outliers, img1[background_pixels] > m) background_pixels[where_bg[outliers]] = False @@ -101,10 +112,16 @@ def is_outlier(points, thresh=3.5): """http://stackoverflow.com/a/22357811/2077270""" if len(points.shape) == 1: points = points[:, None] - median = np.median(points, axis=0) + if points.size: + median = np.median(points, axis=0) + else: + median = np.nan diff = np.sum((points - median) ** 2, axis=-1) diff = np.sqrt(diff) - med_abs_deviation = np.median(diff) + if diff.size: + med_abs_deviation = np.median(diff) + else: + med_abs_deviation = np.nan if med_abs_deviation == 0: return np.zeros(points.shape[0], bool) @@ -452,13 +469,13 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed selection_flags = [] num_roi_negative_bg = 0 num_roi_nan_bg = 0 - background = np.ones(imgs.shape)*-1 + background = np.full_like(imgs, -1, dtype=float) i_roi = 0 while i_roi < len(rois): roi = rois[i_roi] i1, i2, j1, j2 = roi is_selected = True - MAIN_LOGGER.debug("Reflection %d bounded by x1=%d,x2=%d,y1=%d,y2=%d" % (i_roi, i1,i2,j1,j2)) + 
refl_bbox_str = "Reflection %d bounded by x1=%d,x2=%d,y1=%d,y2=%d" % (i_roi, i1,i2,j1,j2) if is_on_edge[i_roi] and reject_edge_reflections: MAIN_LOGGER.debug("Reflection %d is on edge" % i_roi) is_selected = False @@ -474,6 +491,14 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed MAIN_LOGGER.debug("reflection %d has too many (%d) hot pixels (%d allowed)!" % (i_roi, num_hotpix, min_trusted_pix_per_roi)) is_selected = False + # Before padding and fitting, test for overlaps and shrink if needed + is_overlapping = not np.all(background[pid, j1:j2, i1:i2] == -1) + if not allow_overlaps and is_overlapping: + MAIN_LOGGER.debug("region of interest already accounted for roi size= %d %d" % (i2-i1, j2-j1)) + rois[i_roi] = (i1 + 1, i2, j1 + 1, j2) if (i1 + i2) % 2 \ + else (i1, i2 - 1, j1, j2 - 1) # shrink alternately from corners + continue + dimY, dimX = imgs[pid].shape j1_nopad = j1 i1_nopad = i1 @@ -487,6 +512,10 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed shoebox = imgs[pid, j1:j2, i1:i2] + if shoebox.size < 4: + MAIN_LOGGER.debug("reflection %d has negative background" % i_roi) + is_selected = False + if not isinstance(sigma_rdout, float) and not isinstance(sigma_rdout, int): shoebox_sigma_readout = sigma_rdout[pid, j1:j2, i1:i2] else: @@ -495,13 +524,19 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed if background_mask is not None: is_background = background_mask[pid, j1:j2, i1:i2] else: - is_background = label_background_pixels(shoebox,thresh=bg_thresh, iterations=2, only_high=only_high) + if shoebox.shape== (0,0): + is_background = shoebox.copy().astype(bool) + else: + is_background = label_background_pixels(shoebox,thresh=bg_thresh, iterations=2, only_high=only_high) Ycoords, Xcoords = np.indices((j2-j1, i2-i1)) if use_robust_estimation: bg_pixels = shoebox[is_background] - bg_signal = np.median(bg_pixels) + if not bg_pixels.size: + bg_signal = np.nan + else: + bg_signal = np.median(bg_pixels) if bg_signal < 0: num_roi_negative_bg += 1 if set_negative_bg_to_zero: @@ -509,6 +544,8 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed elif skip_roi_with_negative_bg: MAIN_LOGGER.debug("reflection %d has negative background" % i_roi) is_selected = False + elif np.isnan(bg_signal): + is_selected = False tilt_a, tilt_b, tilt_c = 0, 0, bg_signal covariance = None @@ -524,7 +561,7 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed MAIN_LOGGER.debug("tilt fit failed for reflection %d, probably too few pixels" % i_roi) tilt_plane = np.zeros_like(Xcoords) else: - MAIN_LOGGER.debug("successfully fit tilt plane") + #MAIN_LOGGER.debug("successfully fit tilt plane") (tilt_a, tilt_b, tilt_c), covariance = fit_results tilt_plane = tilt_a * Xcoords + tilt_b * Ycoords + tilt_c if np.any(np.isnan(tilt_plane)) and is_selected: @@ -536,14 +573,6 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed MAIN_LOGGER.debug("reflection %d has tilt plane that dips below 0" % i_roi) is_selected = False - is_overlapping = not np.all(background[pid, j1_nopad:j2_nopad, i1_nopad:i2_nopad] == -1) - - if not allow_overlaps and is_overlapping: - # NOTE : move away from this option, it potentially moves the pixel centroid outside of the ROI (in very rare instances) - MAIN_LOGGER.debug("region of interest already accounted for roi size= %d %d" % (i2_nopad-i1_nopad, j2_nopad-j1_nopad)) - rois[i_roi] = 
i1_nopad+1,i2_nopad,j1_nopad+1,j2_nopad - continue - # unpadded ROI dimension roi_dimY = j2_nopad-j1_nopad roi_dimX = i2_nopad-i1_nopad @@ -561,6 +590,8 @@ def get_roi_background_and_selection_flags(refls, imgs, shoebox_sz=10, reject_ed kept_rois.append(roi) panel_ids.append(pid) selection_flags.append(is_selected) + if not is_selected: + MAIN_LOGGER.debug("--> %s was not selected for above reasons" % refl_bbox_str) i_roi += 1 MAIN_LOGGER.debug("Number of skipped ROI with negative BGs: %d / %d" % (num_roi_negative_bg, len(rois))) @@ -650,7 +681,7 @@ def fit_plane_equation_to_background_pixels(shoebox_img, fit_sel, sigma_rdout=3, # vector of residuals r = rho_bg - np.dot(A, (t1, t2, t3)) Nbg = len(rho_bg) - with np.errstate(invalid='ignore'): + with np.errstate(divide='ignore', invalid='ignore'): r_fact = np.dot(r.T, np.dot(W, r)) / (Nbg - 3) # 3 parameters fit var_covar = AWA_inv * r_fact # TODO: check for correlations in the off diagonal elems @@ -705,6 +736,7 @@ def image_data_from_expt(expt, as_double=True): raise ValueError("imageset should have only 1 shot. This expt has imageset with %d shots" % len(iset)) try: flex_data = iset.get_raw_data(0) + except Exception as err: assert str(type(err)) == "", "something weird going on with imageset data" flex_data = iset.get_raw_data() @@ -748,6 +780,9 @@ def simulator_for_refinement(expt, params): else: MAIN_LOGGER.info("Will not use mosaic models, as simulator.crystal.num_mosaicity_samples=1") + if not params.fix.eta_abc: + assert SIM.D.mosaic_domains > 1 + if not params.fix.diffuse_gamma or not params.fix.diffuse_sigma: assert params.use_diffuse_models SIM.D.use_diffuse = params.use_diffuse_models @@ -1385,15 +1420,20 @@ def refls_to_hkl(refls, detector, beam, crystal, return np.vstack(HKL).T, np.vstack(HKLi).T -def get_panels_fasts_slows(expt, pids, rois): +def get_panels_fasts_slows(expt, pids, rois, img_sh=None): """ :param expt: dxtbx experiment :param pids: panel ids :param rois: regions of interest + :param img_sh: 3-tuple Npan, Nslow, Nfast :return: """ - npan = len(expt.detector) - nfast, nslow = expt.detector[0].get_image_size() + if expt is not None: + npan = len(expt.detector) + nfast, nslow = expt.detector[0].get_image_size() + else: + assert img_sh is not None + npan, nslow, nfast = img_sh MASK = np.zeros((npan, nslow, nfast), bool) ROI_ID = np.zeros((npan, nslow, nfast), 'uint16') #ROI_ID = NP_ONES((npan, nslow, nfast), 'uint16') * mx @@ -1727,3 +1767,52 @@ def find_diffBragg_instances(globe_objs): if "simtbx_diffBragg_ext.diffBragg" in str(obj): inst_names.append(name) return inst_names + + +def memory_report(prefix='Memory usage'): + """Return a string documenting memory usage; to be used with LOGGER.info""" + memory_usage_in_gb = get_memory_usage() / 1024. + host = socket.gethostname() + return "%s: %f GB on node %s" % (prefix, memory_usage_in_gb, host) + + +def smooth(x, beta=10.0, window_size=11): + """ + https://glowingpython.blogspot.com/2012/02/convolution-with-numpy.html + + Apply a Kaiser window smoothing convolution. + + Parameters + ---------- + x : ndarray, float + The array to smooth. + + Optional Parameters + ------------------- + beta : float + Parameter controlling the strength of the smoothing -- bigger beta + results in a smoother function. + window_size : int + The size of the Kaiser window to apply, i.e. the number of neighboring + points used in the smoothing. + + Returns + ------- + smoothed : ndarray, float + A smoothed version of `x`. 
+ """ + + # make sure the window size is odd + if window_size % 2 == 0: + window_size += 1 + + # apply the smoothing function + s = np.r_[x[window_size - 1:0:-1], x, x[-1:-window_size:-1]] + w = np.kaiser(window_size, beta) + y = np.convolve(w / w.sum(), s, mode='valid') + + # remove the extra array length convolve adds + b = int((window_size - 1) / 2) + smoothed = y[b:len(y) - b] + + return smoothed diff --git a/simtbx/gpu/simulation.cu b/simtbx/gpu/simulation.cu index af29f670e7..24b2274cb5 100644 --- a/simtbx/gpu/simulation.cu +++ b/simtbx/gpu/simulation.cu @@ -92,7 +92,8 @@ namespace af = scitbx::af; simtbx::gpu::gpu_detector & gdt, double const& weight ){ - cudaSafeCall(cudaSetDevice(SIM.device_Id)); + SCITBX_ASSERT(SIM.device_Id == stash_device_Id); + cudaSafeCall(cudaSetDevice(stash_device_Id)); // transfer source_I, source_lambda // the int arguments are for sizes of the arrays @@ -107,7 +108,7 @@ namespace af = scitbx::af; cu_current_channel_Fhkl = gec.d_channel_Fhkl[ichannel]; cudaDeviceProp deviceProps = { 0 }; - cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + cudaSafeCall(cudaGetDeviceProperties(&deviceProps, stash_device_Id)); int smCount = deviceProps.multiProcessorCount; dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); dim3 numBlocks(smCount * 8, 1); @@ -185,7 +186,8 @@ namespace af = scitbx::af; simtbx::gpu::gpu_detector & gdt, af::shared const active_pixel_list ){ - cudaSafeCall(cudaSetDevice(SIM.device_Id)); + SCITBX_ASSERT(SIM.device_Id == stash_device_Id); + cudaSafeCall(cudaSetDevice(stash_device_Id)); gdt.set_active_pixels_on_GPU(active_pixel_list); @@ -198,7 +200,7 @@ namespace af = scitbx::af; cu_current_channel_Fhkl = gec.d_channel_Fhkl[ichannel]; cudaDeviceProp deviceProps = { 0 }; - cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + cudaSafeCall(cudaGetDeviceProperties(&deviceProps, stash_device_Id)); int smCount = deviceProps.multiProcessorCount; dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); dim3 numBlocks(smCount * 8, 1); @@ -249,7 +251,8 @@ namespace af = scitbx::af; void exascale_api::add_background(simtbx::gpu::gpu_detector & gdt, int const& override_source){ - cudaSafeCall(cudaSetDevice(SIM.device_Id)); + SCITBX_ASSERT(SIM.device_Id == stash_device_Id); + cudaSafeCall(cudaSetDevice(stash_device_Id)); // transfer source_I, source_lambda // the int arguments are for sizes of the arrays @@ -265,7 +268,7 @@ namespace af = scitbx::af; cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_Fbg_of, SIM.Fbg_of, SIM.stols)); cudaDeviceProp deviceProps = { 0 }; - cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id)); + cudaSafeCall(cudaGetDeviceProperties(&deviceProps, stash_device_Id)); int smCount = deviceProps.multiProcessorCount; dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); dim3 numBlocks(smCount * 8, 1); @@ -314,7 +317,8 @@ namespace af = scitbx::af; void exascale_api::allocate(){ - cudaSafeCall(cudaSetDevice(SIM.device_Id)); + SCITBX_ASSERT(SIM.device_Id == stash_device_Id); + cudaSafeCall(cudaSetDevice(stash_device_Id)); /* water_size not defined in class, CLI argument, defaults to 0 */ double water_size = 0.0; @@ -386,7 +390,7 @@ namespace af = scitbx::af; }; exascale_api::~exascale_api(){ - cudaSafeCall(cudaSetDevice(SIM.device_Id)); + cudaSafeCall(cudaSetDevice(stash_device_Id)); cudaSafeCall(cudaFree(cu_beam_vector)); cudaSafeCall(cudaFree(cu_spindle_vector)); diff --git a/simtbx/gpu/simulation.h b/simtbx/gpu/simulation.h index 8cf384f862..b518196494 100644 
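The smooth() helper added to simtbx/diffBragg/utils.py above mirrors the signal at both ends before convolving with a normalized Kaiser window, so the output keeps the length of the input. A quick sanity check, assuming the patched module is importable:

    import numpy as np
    from simtbx.diffBragg.utils import smooth  # helper added in this patch

    rng = np.random.default_rng(0)
    noisy = np.sin(np.linspace(0, 4 * np.pi, 200)) + 0.3 * rng.normal(size=200)
    smoothed = smooth(noisy, beta=10.0, window_size=11)

    # the reflection padding preserves the length; the Kaiser window mainly
    # attenuates the high-frequency noise while leaving the slow sine intact
    assert smoothed.shape == noisy.shape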
--- a/simtbx/gpu/simulation.h +++ b/simtbx/gpu/simulation.h @@ -14,7 +14,7 @@ namespace af = scitbx::af; struct exascale_api { inline exascale_api(const simtbx::nanoBragg::nanoBragg& nB): - SIM(nB){ + SIM(nB),stash_device_Id(nB.device_Id){ } void show(); @@ -38,6 +38,7 @@ struct exascale_api { ~exascale_api(); const simtbx::nanoBragg::nanoBragg& SIM; + const int stash_device_Id; // must remain the same after initialization CUDAREAL * cu_current_channel_Fhkl; CUDAREAL cu_subpixel_size; diff --git a/simtbx/kokkos/SConscript b/simtbx/kokkos/SConscript index e9bcae54b6..2407718563 100644 --- a/simtbx/kokkos/SConscript +++ b/simtbx/kokkos/SConscript @@ -1,5 +1,4 @@ import os -import subprocess from shutil import copy, which import libtbx.load_env diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 85715e3077..0c310551ca 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -16,50 +16,46 @@ using Kokkos::deep_copy; using Kokkos::create_mirror_view; using Kokkos::parallel_for; +auto get_kokkos_vec3 = [](auto&& src) { return vec3(src[0], src[1], src[2]); }; + namespace simtbx { namespace Kokkos { packed_metrology::packed_metrology(dxtbx::model::Detector const & arg_detector, dxtbx::model::Beam const & arg_beam) { for (std::size_t panel_id = 0; panel_id < arg_detector.size(); panel_id++){ - // helper code arising from the nanoBragg constructor, with user_beam=True - typedef scitbx::vec3 vec3; + // helper code arising from the nanoBragg constructor, with user_beam=True // DETECTOR properties // typically: 1 0 0 - vec3 fdet_vector = arg_detector[panel_id].get_fast_axis(); - fdet_vector = fdet_vector.normalize(); + vec3 fdet_vector = get_kokkos_vec3(arg_detector[panel_id].get_fast_axis()); + fdet_vector.normalize(); // typically: 0 -1 0 - vec3 sdet_vector = arg_detector[panel_id].get_slow_axis(); - sdet_vector = sdet_vector.normalize(); + vec3 sdet_vector = get_kokkos_vec3(arg_detector[panel_id].get_slow_axis()); + sdet_vector.normalize(); // set orthogonal vector to the detector pixel array vec3 odet_vector = fdet_vector.cross(sdet_vector); - odet_vector = odet_vector.normalize(); + odet_vector.normalize(); // dxtbx origin is location of outer corner of the first pixel - vec3 pix0_vector = arg_detector[panel_id].get_origin()/1000.0; + vec3 pix0_vector = get_kokkos_vec3(arg_detector[panel_id].get_origin()/1000.0); // what is the point of closest approach between sample and detector? - double close_distance = pix0_vector * odet_vector; + double close_distance = pix0_vector.dot(odet_vector); if (close_distance < 0){ bool verbose = false; if(verbose)printf("WARNING: dxtbx model is lefthanded. Inverting odet_vector.\n"); odet_vector = -1. 
* odet_vector; - close_distance = -1*close_distance; + close_distance = -1 * close_distance; } - sdet.push_back(sdet_vector.length()); - fdet.push_back(fdet_vector.length()); - odet.push_back(odet_vector.length()); - pix0.push_back(0.); - for (std::size_t idx_vec = 0; idx_vec < 3; idx_vec++){ - sdet.push_back(sdet_vector[idx_vec]); - fdet.push_back(fdet_vector[idx_vec]); - odet.push_back(odet_vector[idx_vec]); - pix0.push_back(pix0_vector[idx_vec]); - } + sdet.push_back(sdet_vector); + fdet.push_back(fdet_vector); + odet.push_back(odet_vector); + pix0.push_back(pix0_vector); + // set beam centre scitbx::vec2 dials_bc=arg_detector[panel_id].get_beam_centre(arg_beam.get_s0()); dists.push_back(close_distance); @@ -69,28 +65,28 @@ namespace simtbx { namespace Kokkos { }; packed_metrology::packed_metrology(const simtbx::nanoBragg::nanoBragg& nB){ - for (std::size_t idx_vec = 0; idx_vec < 4; idx_vec++){ - sdet.push_back(nB.sdet_vector[idx_vec]); - fdet.push_back(nB.fdet_vector[idx_vec]); - odet.push_back(nB.odet_vector[idx_vec]); - pix0.push_back(nB.pix0_vector[idx_vec]); - } - dists.push_back(nB.close_distance); - Xbeam.push_back(nB.Xbeam); - Ybeam.push_back(nB.Ybeam); + // Careful, 4-vectors! [length, x, y, z] + auto get_kokkos_vec3 = [](auto& src) { return vec3(src[1], src[2], src[3]); }; + sdet.push_back( get_kokkos_vec3(nB.sdet_vector) ); + fdet.push_back( get_kokkos_vec3(nB.fdet_vector) ); + odet.push_back( get_kokkos_vec3(nB.odet_vector) ); + pix0.push_back( get_kokkos_vec3(nB.pix0_vector) ); + dists.push_back(nB.close_distance); + Xbeam.push_back(nB.Xbeam); + Ybeam.push_back(nB.Ybeam); } void packed_metrology::show() const { for (std::size_t idx_p = 0; idx_p < Xbeam.size(); idx_p++){ printf(" Panel %3ld\n",idx_p); - printf(" Panel %3ld sdet %9.6f %9.6f %9.6f %9.6f fdet %9.6f %9.6f %9.6f %9.6f\n", - idx_p,sdet[4*idx_p+0],sdet[4*idx_p+1],sdet[4*idx_p+2],sdet[4*idx_p+3], - fdet[4*idx_p+0],fdet[4*idx_p+1],fdet[4*idx_p+2],fdet[4*idx_p+3] + printf(" Panel %3ld sdet %9.6f %9.6f %9.6f fdet %9.6f %9.6f %9.6f\n", + idx_p,sdet[idx_p][0],sdet[idx_p][1],sdet[idx_p][2], + fdet[idx_p][0],fdet[idx_p][1],fdet[idx_p][2] ); - printf(" Panel %3ld odet %9.6f %9.6f %9.6f %9.6f pix0 %9.6f %9.6f %9.6f %9.6f\n", - idx_p,odet[4*idx_p+0],odet[4*idx_p+1],odet[4*idx_p+2],odet[4*idx_p+3], - pix0[4*idx_p+0],pix0[4*idx_p+1],pix0[4*idx_p+2],pix0[4*idx_p+3] + printf(" Panel %3ld odet %9.6f %9.6f %9.6f pix0 %9.6f %9.6f %9.6f\n", + idx_p,odet[idx_p][0],odet[idx_p][1],odet[idx_p][2], + pix0[idx_p][0],pix0[idx_p][1],pix0[idx_p][2] ); printf(" Panel %3ld beam %11.8f %11.8f\n",idx_p,Xbeam[idx_p],Ybeam[idx_p]); } diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index ced5c4b87d..03e3913e17 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -7,7 +7,7 @@ // 3) the associated simulated data, in process of being accumulated by kernel calls // 4) mask data // 5) possibly other metadata - +#include #include "scitbx/array_family/shared.h" #include "scitbx/array_family/flex_types.h" @@ -15,8 +15,12 @@ #include "dxtbx/model/beam.h" #include "simtbx/nanoBragg/nanoBragg.h" #include "kokkostbx/kokkos_types.h" +#include "kokkostbx/kokkos_vector3.h" +#include "kokkostbx/kokkos_matrix3.h" + +using vec3 = kokkostbx::vector3; +using mat3 = kokkostbx::matrix3; -#include namespace simtbx { namespace Kokkos { @@ -27,10 +31,10 @@ struct packed_metrology{ packed_metrology(dxtbx::model::Detector const &,dxtbx::model::Beam const &); packed_metrology(const simtbx::nanoBragg::nanoBragg& nB); void show() const; - 
af::sharedsdet; - af::sharedfdet; - af::sharedodet; - af::sharedpix0; + af::sharedsdet; + af::sharedfdet; + af::sharedodet; + af::sharedpix0; af::shareddists; af::sharedXbeam; af::sharedYbeam; @@ -83,10 +87,10 @@ struct kokkos_detector{ vector_float_t m_floatimage = vector_float_t("m_floatimage", 0); // all-panel packed GPU representation of the multi-panel metrology - vector_cudareal_t m_sdet_vector = vector_cudareal_t("m_sdet_vector", 0); - vector_cudareal_t m_fdet_vector = vector_cudareal_t("m_fdet_vector", 0); - vector_cudareal_t m_odet_vector = vector_cudareal_t("m_odet_vector", 0); - vector_cudareal_t m_pix0_vector = vector_cudareal_t("m_pix0_vector", 0); + view_1d_t m_sdet_vector = view_1d_t("m_sdet_vector", 0); + view_1d_t m_fdet_vector = view_1d_t("m_fdet_vector", 0); + view_1d_t m_odet_vector = view_1d_t("m_odet_vector", 0); + view_1d_t m_pix0_vector = view_1d_t("m_pix0_vector", 0); vector_cudareal_t m_distance = vector_cudareal_t("m_distance", 0); vector_cudareal_t m_Xbeam = vector_cudareal_t("m_Xbeam", 0); vector_cudareal_t m_Ybeam = vector_cudareal_t("m_Ybeam", 0); diff --git a/simtbx/kokkos/kernel_math.h b/simtbx/kokkos/kernel_math.h index b140ec91d9..f55e38013e 100644 --- a/simtbx/kokkos/kernel_math.h +++ b/simtbx/kokkos/kernel_math.h @@ -24,6 +24,7 @@ KOKKOS_INLINE_FUNCTION static void cross_product(CUDAREAL *x, CUDAREAL *y, CUDAR /* rotate a 3-vector about a unit vector axis */ KOKKOS_INLINE_FUNCTION static CUDAREAL *rotate_axis(const vector_cudareal_t v, CUDAREAL * newv, const vector_cudareal_t axis, const CUDAREAL phi); KOKKOS_INLINE_FUNCTION static CUDAREAL *rotate_axis(const CUDAREAL * v, CUDAREAL * newv, const vector_cudareal_t axis, const CUDAREAL phi); +KOKKOS_INLINE_FUNCTION static vector3 rotate_axis(const vector3& v, vector3& newv, const vector3& axis, const CUDAREAL phi); /* rotate a 3-vector using a 9-element unitary matrix */ KOKKOS_INLINE_FUNCTION static void rotate_umat(CUDAREAL * v, CUDAREAL *newv, const CUDAREAL * __restrict__ umat); // measure magnitude of vector and put it in 0th element @@ -201,6 +202,18 @@ KOKKOS_INLINE_FUNCTION CUDAREAL *rotate_axis(const CUDAREAL * v, CUDAREAL * newv return newv; } +/* rotate a point about a unit vector axis */ +KOKKOS_INLINE_FUNCTION vector3 rotate_axis(const vector3& v, vector3& newv, const vector3& axis, const CUDAREAL phi) { + + const CUDAREAL sinphi = sin(phi); + const CUDAREAL cosphi = cos(phi); + const CUDAREAL dot = axis.dot(v) * (1.0 - cosphi); + + newv = axis * dot + v * cosphi + axis.cross(v) * sinphi; + + return newv; +} + /* rotate a vector using a 9-element unitary matrix */ KOKKOS_INLINE_FUNCTION void rotate_umat(CUDAREAL * v, CUDAREAL *newv, const CUDAREAL * __restrict__ umat) { diff --git a/simtbx/kokkos/kokkos_instance.cpp b/simtbx/kokkos/kokkos_instance.cpp index 2624459b8e..d0b467501d 100644 --- a/simtbx/kokkos/kokkos_instance.cpp +++ b/simtbx/kokkos/kokkos_instance.cpp @@ -1,6 +1,6 @@ #include "simtbx/kokkos/kokkos_instance.h" -using Kokkos::InitArguments; +using Kokkos::InitializationSettings; using Kokkos::initialize; using Kokkos::finalize; @@ -15,11 +15,9 @@ namespace Kokkos { } kokkos_instance::kokkos_instance(int const& t_deviceID) { - InitArguments kokkos_init; - kokkos_init.device_id = t_deviceID; - if (!m_isInitialized) { - initialize(kokkos_init); + initialize(InitializationSettings() + .set_device_id(t_deviceID)); m_isInitialized = true; m_isFinalized = false; diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index dc3d5f9aa4..23f960ff08 100644 --- 
a/simtbx/kokkos/simulation.cpp
+++ b/simtbx/kokkos/simulation.cpp
@@ -100,8 +100,6 @@ namespace Kokkos {
   simtbx::Kokkos::kokkos_detector & kdt,
   double const& weight
   ){
-    // cudaSafeCall(cudaSetDevice(SIM.device_Id));
-
    // transfer source_I, source_lambda
    // the int arguments are for sizes of the arrays
    int source_count = SIM.sources;
@@ -110,18 +108,15 @@ namespace Kokkos {
    for (std::size_t iwt = 0; iwt < source_count; iwt++){wptr[iwt] = weight*(SIM.source_I[iwt]);}
    kokkostbx::transfer_double2kokkos(m_source_I, wptr, source_count);
    kokkostbx::transfer_double2kokkos(m_source_lambda, SIM.source_lambda, source_count);
-    // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_I, SIM.source_I, SIM.sources));
-    // cudaSafeCall(cudaMemcpyVectorDoubleToDevice(cu_source_lambda, SIM.source_lambda, SIM.sources));
+
+    ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3);
+    calc_CrystalOrientations(
+      SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread,
+      SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation);
    // magic happens here(?): take pointer from singleton, temporarily use it for add Bragg iteration:
    vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannel];
-    // cudaDeviceProp deviceProps = { 0 };
-    // cudaSafeCall(cudaGetDeviceProperties(&deviceProps, SIM.device_Id));
-    // int smCount = deviceProps.multiProcessorCount;
-    // dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);
-    // dim3 numBlocks(smCount * 8, 1);
-
    std::size_t panel_size = kdt.m_slow_dim_size * kdt.m_fast_dim_size;

    // the for loop around panels. Offsets given.
@@ -132,16 +127,14 @@ namespace Kokkos {
      SIM.roi_xmax, SIM.roi_ymin, SIM.roi_ymax, SIM.oversample, SIM.point_pixel,
      SIM.pixel_size, m_subpixel_size, m_steps, SIM.detector_thickstep,
      SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen,
-      extract_subview(kdt.m_sdet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_fdet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_odet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_pix0_vector, panel_id, m_vector_length),
+      extract_subview(kdt.m_sdet_vector, panel_id, 1),
+      extract_subview(kdt.m_fdet_vector, panel_id, 1),
+      extract_subview(kdt.m_odet_vector, panel_id, 1),
+      extract_subview(kdt.m_pix0_vector, panel_id, 1),
      SIM.curved_detector, kdt.metrology.dists[panel_id], kdt.metrology.dists[panel_id],
      m_beam_vector, kdt.metrology.Xbeam[panel_id], kdt.metrology.Ybeam[panel_id],
-      SIM.dmin, SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector,
-      SIM.sources, m_source_X, m_source_Y, m_source_Z,
-      m_source_I, m_source_lambda, m_a0, m_b0,
-      m_c0, SIM.xtal_shape, SIM.mosaic_spread, SIM.mosaic_domains, m_mosaic_umats,
+      SIM.dmin, SIM.phisteps, SIM.sources, m_source_X, m_source_Y, m_source_Z,
+      m_source_I, m_source_lambda, SIM.xtal_shape, SIM.mosaic_domains, m_crystal_orientation,
      SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell,
      m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr,
      SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, SIM.default_F,
@@ -380,10 +373,10 @@ namespace Kokkos {
      SIM.oversample, override_source, SIM.pixel_size,
      kdt.m_slow_dim_size, kdt.m_fast_dim_size,
      SIM.detector_thicksteps, SIM.detector_thickstep, SIM.detector_attnlen,
-      extract_subview(kdt.m_sdet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_fdet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_odet_vector, panel_id, m_vector_length),
-      extract_subview(kdt.m_pix0_vector, panel_id, m_vector_length),
+      extract_subview(kdt.m_sdet_vector, panel_id, 1),
+      extract_subview(kdt.m_fdet_vector, panel_id, 1),
+      extract_subview(kdt.m_odet_vector, panel_id, 1),
+      extract_subview(kdt.m_pix0_vector, panel_id, 1),
      kdt.metrology.dists[panel_id],
      SIM.point_pixel, SIM.detector_thick, m_source_X, m_source_Y, m_source_Z,
      m_source_lambda, m_source_I,
diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h
index 23f8cc0ddb..f1fc1c68ca 100644
--- a/simtbx/kokkos/simulation.h
+++ b/simtbx/kokkos/simulation.h
@@ -7,6 +7,12 @@
 #include "simtbx/kokkos/structure_factors.h"
 #include "simtbx/kokkos/detector.h"
 #include "kokkostbx/kokkos_types.h"
+#include "kokkostbx/kokkos_vector3.h"
+#include "kokkostbx/kokkos_matrix3.h"
+
+using vec3 = kokkostbx::vector3<CUDAREAL>;
+using mat3 = kokkostbx::matrix3<CUDAREAL>;
+using crystal_orientation_t = Kokkos::View; // [phisteps, domains, 3]

 namespace simtbx { namespace Kokkos {

@@ -62,6 +68,7 @@ struct exascale_api {
   vector_cudareal_t m_source_I = vector_cudareal_t("m_source_I", 0);
   vector_cudareal_t m_source_lambda = vector_cudareal_t("m_source_lambda", 0);
   vector_cudareal_t m_mosaic_umats = vector_cudareal_t("m_mosaic_umats", 0);
+  crystal_orientation_t m_crystal_orientation = crystal_orientation_t("m_crystal_orientation", 0, 0, 3);
   CUDAREAL m_water_size = 0;
   CUDAREAL m_water_F = 0;
   CUDAREAL m_water_MW = 0;
diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h
index 84c4152423..eb368a95db 100644
--- a/simtbx/kokkos/simulation_kernels.h
+++ b/simtbx/kokkos/simulation_kernels.h
@@ -16,25 +16,72 @@ using simtbx::nanoBragg::GAUSS;
 using simtbx::nanoBragg::GAUSS_ARGCHK;
 using simtbx::nanoBragg::TOPHAT;

+void calc_CrystalOrientations(CUDAREAL phi0,
+                              CUDAREAL phistep,
+                              int phisteps,
+                              const vector_cudareal_t spindle_vector,
+                              const vector_cudareal_t a0,
+                              const vector_cudareal_t b0,
+                              const vector_cudareal_t c0,
+                              CUDAREAL mosaic_spread,
+                              int mosaic_domains,
+                              const vector_cudareal_t mosaic_umats,
+                              crystal_orientation_t crystal_orientation) {
+
+  Kokkos::parallel_for("calc_CrystalOrientation", phisteps, KOKKOS_LAMBDA(const int& phi_tic) {
+    // sweep over phi angles
+    CUDAREAL phi = phistep * phi_tic + phi0;
+
+    vec3 spindle_vector_tmp {spindle_vector(1), spindle_vector(2), spindle_vector(3)};
+    vec3 a0_tmp {a0(1), a0(2), a0(3)};
+    vec3 b0_tmp {b0(1), b0(2), b0(3)};
+    vec3 c0_tmp {c0(1), c0(2), c0(3)};
+
+    // rotate about spindle if necessary
+    vec3 ap = a0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
+    vec3 bp = b0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
+    vec3 cp = c0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
+
+    // enumerate mosaic domains
+    for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) {
+      // apply mosaic rotation after phi rotation
+      vec3 a, b, c;
+
+      if (mosaic_spread > 0.0) {
+        mat3 umat;
+        for (int i=0; i<9; ++i) {
+          umat[i] = mosaic_umats(mos_tic * 9 + i);
+        }
+        a = umat.dot(ap);
+        b = umat.dot(bp);
+        c = umat.dot(cp);
+      } else {
+        a = ap;
+        b = bp;
+        c = cp;
+      }
+      crystal_orientation(phi_tic, mos_tic, 0) = a;
+      crystal_orientation(phi_tic, mos_tic, 1) = b;
+      crystal_orientation(phi_tic, mos_tic, 2) = c;
+    }
+  });
+}
+
 void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
               int roi_ymin, int roi_ymax, int oversample, int point_pixel,
               CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps,
               CUDAREAL detector_thickstep, int detector_thicksteps,
               CUDAREAL detector_thick, CUDAREAL detector_mu,
-              const vector_cudareal_t sdet_vector, const vector_cudareal_t fdet_vector,
-              const vector_cudareal_t odet_vector, const vector_cudareal_t pix0_vector,
+              const view_1d_t sdet_vector, const view_1d_t fdet_vector,
+              const view_1d_t odet_vector, const view_1d_t pix0_vector,
               int curved_detector, CUDAREAL distance, CUDAREAL close_distance,
               const vector_cudareal_t beam_vector,
-              CUDAREAL Xbeam, CUDAREAL Ybeam, CUDAREAL dmin, CUDAREAL phi0, CUDAREAL phistep,
-              int phisteps, const vector_cudareal_t spindle_vector, int sources,
+              CUDAREAL Xbeam, CUDAREAL Ybeam, CUDAREAL dmin, int phisteps, int sources,
               const vector_cudareal_t source_X, const vector_cudareal_t source_Y,
               const vector_cudareal_t source_Z,
               const vector_cudareal_t source_I, const vector_cudareal_t source_lambda,
-              const vector_cudareal_t a0, const vector_cudareal_t b0,
-              const vector_cudareal_t c0, shapetype xtal_shape, CUDAREAL mosaic_spread,
-              int mosaic_domains, const vector_cudareal_t mosaic_umats,
-              CUDAREAL Na, CUDAREAL Nb,
-              CUDAREAL Nc, CUDAREAL V_cell,
+              shapetype xtal_shape, int mosaic_domains, crystal_orientation_t crystal_orientation,
+              CUDAREAL Na, CUDAREAL Nb, CUDAREAL Nc, CUDAREAL V_cell,
               CUDAREAL water_size, CUDAREAL water_F, CUDAREAL water_MW, CUDAREAL r_e_sqr,
               CUDAREAL fluence, CUDAREAL Avogadro, CUDAREAL spot_scale, int integral_form,
               CUDAREAL default_F,
@@ -57,23 +104,17 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,

   const int total_pixels = spixels * fpixels;

-  // add background from something amorphous
-  CUDAREAL F_bg = water_F;
-  CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW;
+  const CUDAREAL distance_r = 1 / distance;
+  const CUDAREAL dmin_r = (dmin > 0.0) ? 1/dmin : 0.0;

-  Kokkos::parallel_for("kokkosSpotsKernel", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) {
+  // add background from something amorphous, precalculate scaling
+  const CUDAREAL F_bg = water_F;
+  const CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW;
+  const CUDAREAL I_factor = r_e_sqr * spot_scale * fluence / steps;

-    vec3 sdet_tmp {sdet_vector(1), sdet_vector(2), sdet_vector(3)};
-    vec3 fdet_tmp {fdet_vector(1), fdet_vector(2), fdet_vector(3)};
-    vec3 odet_tmp {odet_vector(1), odet_vector(2), odet_vector(3)};
-    vec3 pix0_tmp {pix0_vector(1), pix0_vector(2), pix0_vector(3)};
+  Kokkos::parallel_for("kokkosSpotsKernel", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) {
     vec3 beam_vector_tmp {beam_vector(1), beam_vector(2), beam_vector(3)};
-    vec3 spindle_vector_tmp {spindle_vector(1), spindle_vector(2), spindle_vector(3)};
-
-    vec3 a0_tmp {a0(1), a0(2), a0(3)};
-    vec3 b0_tmp {b0(1), b0(2), b0(3)};
-    vec3 c0_tmp {c0(1), c0(2), c0(3)};

     vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)};

@@ -127,38 +168,35 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
          // pixel_Y = Sdet-Ybeam;
          // pixel_Z = Fdet-Xbeam;
          vec3 pixel_pos;
-          pixel_pos += Fdet * fdet_tmp;
-          pixel_pos += Sdet * sdet_tmp;
-          pixel_pos += Odet * odet_tmp;
-          pixel_pos += pix0_tmp;
-
-          CUDAREAL pixel_pos_tmp[] = {0, pixel_pos[0], pixel_pos[1], pixel_pos[2]};
+          pixel_pos += Fdet * fdet_vector(0);
+          pixel_pos += Sdet * sdet_vector(0);
+          pixel_pos += Odet * odet_vector(0);
+          pixel_pos += pix0_vector(0);

          if (curved_detector) {
            // construct detector pixel that is always "distance" from the sample
            vec3 dbvector = distance * vec3{beam_vector(1), beam_vector(2), beam_vector(3)};
            // treat detector pixel coordinates as radians
-            vec3 newvector = dbvector.rotate_around_axis(sdet_tmp, pixel_pos.y_val() / distance );
-            pixel_pos = newvector.rotate_around_axis(fdet_tmp, pixel_pos.z_val() / distance );
+            vec3 newvector = dbvector.rotate_around_axis(sdet_vector(0), pixel_pos.y_val() * distance_r );
+            pixel_pos = newvector.rotate_around_axis(fdet_vector(0), pixel_pos.z_val() * distance_r );
          }

          // construct the diffracted-beam unit vector to this sub-pixel
-          CUDAREAL airpath = pixel_pos.length();
+          CUDAREAL airpath_r = 1 / pixel_pos.length();
          vec3 diffracted = pixel_pos.get_unit_vector();

          // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta)
-          CUDAREAL omega_pixel = pixel_size * pixel_size / airpath / airpath * close_distance / airpath;
+          CUDAREAL omega_pixel = pixel_size * pixel_size * airpath_r * airpath_r * close_distance * airpath_r;
          // option to turn off obliquity effect, inverse-square-law only
          if (point_pixel) {
-            omega_pixel = 1.0 / airpath / airpath;
+            omega_pixel = airpath_r * airpath_r;
          }

          // now calculate detector thickness effects
          CUDAREAL capture_fraction = 1.0;
          if (detector_thick > 0.0 && detector_mu> 0.0) {
            // inverse of effective thickness increase
-            vec3 odet{odet_vector(1), odet_vector(2), odet_vector(3)};
-            CUDAREAL parallax = odet.dot(diffracted);
+            CUDAREAL parallax = odet_vector(0).dot(diffracted);
            capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax) - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax);
          }
@@ -182,7 +220,8 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
            // rough cut to speed things up when we aren't using whole detector
            if (dmin > 0.0 && stol > 0.0) {
-              if (dmin > 0.5 / stol) {
+              // use reciprocal of (dmin > 0.5 / stol)
+              if (dmin_r <= 2 * stol) {
                continue;
              }
            }
@@ -197,34 +236,12 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
            // sweep over phi angles
            for (int phi_tic = 0; phi_tic < phisteps; ++phi_tic) {
-              CUDAREAL phi = phistep * phi_tic + phi0;
-
-              // rotate about spindle if necessary
-              vec3 ap = a0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
-              vec3 bp = b0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
-              vec3 cp = c0_tmp.rotate_around_axis(spindle_vector_tmp, phi);
-
              // enumerate mosaic domains
              for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) {
                // apply mosaic rotation after phi rotation
-                // CUDAREAL a[4];
-                // CUDAREAL b[4];
-                // CUDAREAL c[4];
-                vec3 a, b, c;
-
-                if (mosaic_spread > 0.0) {
-                  mat3 umat;
-                  for (int i=0; i<9; ++i) {
-                    umat[i] = mosaic_umats(mos_tic * 9 + i);
-                  }
-                  a = umat.dot(ap);
-                  b = umat.dot(bp);
-                  c = umat.dot(cp);
-                } else {
-                  a = ap;
-                  b = bp;
-                  c = cp;
-                }
+                auto a = crystal_orientation(phi_tic, mos_tic, 0);
+                auto b = crystal_orientation(phi_tic, mos_tic, 1);
+                auto c = crystal_orientation(phi_tic, mos_tic, 2);

                // construct fractional Miller indicies
                CUDAREAL h = a.dot(scattering);
@@ -257,7 +274,10 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
                  }
                } else {
                  // handy radius in reciprocal space, squared
-                  hrad_sqr = (h - h0) * (h - h0) * Na * Na + (k - k0) * (k - k0) * Nb * Nb + (l - l0) * (l - l0) * Nc * Nc;
+                  const CUDAREAL hrad = (h - h0) * Na;
+                  const CUDAREAL krad = (k - k0) * Nb;
+                  const CUDAREAL lrad = (l - l0) * Nc;
+                  hrad_sqr = hrad * hrad + krad * krad + lrad * lrad;
                }
                if (xtal_shape == ROUND) {
                  // use sinc3 for elliptical xtal shape,
@@ -304,19 +324,13 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
                // convert amplitudes into intensity (photons per steradian)
                I += F_cell * F_cell * F_latt * F_latt * source_fraction * capture_fraction * omega_pixel;
                omega_sub_reduction += omega_pixel;
-              }
-              // end of mosaic loop
-            }
-            // end of phi loop
-          }
-          // end of source loop
-        }
-        // end of detector thickness loop
-      }
-      // end of sub-pixel y loop
-    }
-    // end of sub-pixel x loop
-    const double photons = I_bg + (r_e_sqr * spot_scale * fluence * polar * I) / steps;
+              } // end of mosaic loop
+            } // end of phi loop
+          } // end of source loop
+        } // end of detector thickness loop
+      } // end of sub-pixel y loop
+    } // end of sub-pixel x loop
+    const double photons = I_bg + I_factor * polar * I;
    floatimage( pixIdx ) = photons;
    omega_reduction( pixIdx ) = omega_sub_reduction; // shared contention
    max_I_x_reduction( pixIdx ) = max_I_x_sub_reduction;
@@ -330,8 +344,8 @@ void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi
              CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps,
              CUDAREAL detector_thickstep, int detector_thicksteps,
              CUDAREAL detector_thick, CUDAREAL detector_mu, const int vec_len,
-              const vector_cudareal_t sdet_vector, const vector_cudareal_t fdet_vector,
-              const vector_cudareal_t odet_vector, const vector_cudareal_t pix0_vector,
+              const view_1d_t sdet_vector, const view_1d_t fdet_vector,
+              const view_1d_t odet_vector, const view_1d_t pix0_vector,
              const vector_cudareal_t distance, const vector_cudareal_t close_distance,
              const vector_cudareal_t beam_vector,
              const vector_cudareal_t Xbeam, const vector_cudareal_t Ybeam, // not even used, after all the work
@@ -419,19 +433,18 @@ void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi
          // pixel_Z = Fdet-Xbeam;
          //CUDAREAL * pixel_pos = tmpVector1;
          CUDAREAL pixel_pos[4];
-          int iVL = vec_len * i_panel;
-          pixel_pos[1] = Fdet * fdet_vector(iVL+1)
-                       + Sdet * sdet_vector(iVL+1)
-                       + Odet * odet_vector(iVL+1)
-                       + pix0_vector(iVL+1); // X
-          pixel_pos[2] = Fdet * fdet_vector(iVL+2)
-                       + Sdet * sdet_vector(iVL+2)
-                       + Odet * odet_vector(iVL+2)
-                       + pix0_vector(iVL+2); // Y
-          pixel_pos[3] = Fdet * fdet_vector(iVL+3)
-                       + Sdet * sdet_vector(iVL+3)
-                       + Odet * odet_vector(iVL+3)
-                       + pix0_vector(iVL+3); // Z
+          pixel_pos[1] = Fdet * fdet_vector(i_panel)[0]
+                       + Sdet * sdet_vector(i_panel)[0]
+                       + Odet * odet_vector(i_panel)[0]
+                       + pix0_vector(i_panel)[0]; // X
+          pixel_pos[2] = Fdet * fdet_vector(i_panel)[1]
+                       + Sdet * sdet_vector(i_panel)[1]
+                       + Odet * odet_vector(i_panel)[1]
+                       + pix0_vector(i_panel)[1]; // Y
+          pixel_pos[3] = Fdet * fdet_vector(i_panel)[2]
+                       + Sdet * sdet_vector(i_panel)[2]
+                       + Odet * odet_vector(i_panel)[2]
+                       + pix0_vector(i_panel)[2]; // Z

          // construct the diffracted-beam unit vector to this sub-pixel
          //CUDAREAL * diffracted = tmpVector2;
@@ -450,9 +463,9 @@ void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi
          if (detector_thick > 0.0 && detector_mu> 0.0) {
            // inverse of effective thickness increase
            CUDAREAL odet[4];
-            odet[1] = odet_vector(iVL+1);
-            odet[2] = odet_vector(iVL+2);
-            odet[3] = odet_vector(iVL+3);
+            odet[1] = odet_vector(i_panel)[0];
+            odet[2] = odet_vector(i_panel)[1];
+            odet[3] = odet_vector(i_panel)[2];
            CUDAREAL parallax = dot_product(odet, diffracted);
            capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax) - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax);
@@ -615,8 +628,8 @@ void add_array( view_1d_t lhs, const view_1d_t rhs ) {

 void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source,
              CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps,
              CUDAREAL detector_thickstep, CUDAREAL detector_attnlen,
-              const vector_cudareal_t sdet_vector, const vector_cudareal_t fdet_vector,
-              const vector_cudareal_t odet_vector, const vector_cudareal_t pix0_vector,
+              const view_1d_t sdet_vector, const view_1d_t fdet_vector,
+              const view_1d_t odet_vector, const view_1d_t pix0_vector,
              CUDAREAL close_distance, int point_pixel, CUDAREAL detector_thick,
              const vector_cudareal_t source_X, const vector_cudareal_t source_Y,
              const vector_cudareal_t source_Z,
@@ -653,11 +666,6 @@ void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int ove
   // const int stride = fstride * sstride;

   Kokkos::parallel_for("add_background", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) {
-    vec3 sdet_tmp {sdet_vector(1), sdet_vector(2), sdet_vector(3)};
-    vec3 fdet_tmp {fdet_vector(1), fdet_vector(2), fdet_vector(3)};
-    vec3 odet_tmp {odet_vector(1), odet_vector(2), odet_vector(3)};
-    vec3 pix0_tmp {pix0_vector(1), pix0_vector(2), pix0_vector(3)};
-
    vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)};

    const int fpixel = pixIdx % fpixels;
@@ -675,19 +683,11 @@ void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int ove
      for(int thick_tic=0; thick_tic 0.0){
        // inverse of effective thickness increase
        // CUDAREAL parallax = diffracted[1] * odet_vector(1) + diffracted[2] * odet_vector(2) + diffracted[3] * odet_vector(3);
-        CUDAREAL parallax = diffracted.dot(odet_tmp);
+        CUDAREAL parallax = diffracted.dot(odet_vector(0));
        capture_fraction = exp(-thick_tic*detector_thickstep/detector_attnlen/parallax) -exp(-(thick_tic+1)*detector_thickstep/detector_attnlen/parallax);
      }
@@ -710,26 +710,15 @@ void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int ove
      for(int source=source_start; source 0 : value of the multiplicative factor
+        intfile_scale = 1 (default): do not apply a factor
+        intfile_scale = 0 : compute a reasonable scale factor to set max pixel to 55000; given by get_intfile_scale()
+        cbf_int: boolean flag, write the cbf using 32-bit int precision
+        """
+        temp = self.cbf_int
+        self.cbf_int = cbf_int
+
+        if intfile_scale != 1.0:
+            cache_pixels = self.raw_pixels
+            if intfile_scale > 0: self.raw_pixels = self.raw_pixels * intfile_scale
+            else: self.raw_pixels = self.raw_pixels * self.get_intfile_scale()
+            # print("switch to scaled")
+
+        if toggle_conventions:
+            # switch to DIALS convention before writing CBF
+            CURRENT_CONV = self.beamcenter_convention
+            self.beamcenter_convention=DIALS
+
+        imgset = self.imageset
+        writer = cbf_writer.FullCBFWriter(imageset=imgset)
+        cbf = writer.get_cbf_handle(index=0, header_only=True)
+        data = imgset.get_raw_data(0)
+        writer.add_data_to_cbf(cbf, data=data)
+        writer.write_cbf(cbf_filename, cbf=cbf)
+
+        if toggle_conventions:
+            self.beamcenter_convention=CURRENT_CONV
+
+        if intfile_scale != 1.0:
+            self.raw_pixels = cache_pixels
+            # print("switch back to cached")
+
+        self.cbf_int = temp
+
+    def to_nexus_nxmx(self, nxmx_filename, toggle_conventions=False, intfile_scale=1.0):
+        """write a NeXus NXmx-format image file to disk from the raw pixel array
+        intfile_scale: multiplicative factor applied to raw pixels before output
+        intfile_scale > 0 : value of the multiplicative factor
+        intfile_scale = 1 (default): do not apply a factor
@@ -285,8 +354,18 @@ def to_cbf(self, cbf_filename, toggle_conventions=False, intfile_scale=1.0):
      CURRENT_CONV = self.beamcenter_convention
      self.beamcenter_convention=DIALS

-    writer = cbf_writer.FullCBFWriter(imageset=self.imageset)
-    writer.write_cbf(cbf_filename, index=0)
+    params = nxmx_writer.phil_scope.fetch(parse("""
+      output_file=%s
+      nexus_details {
+        instrument_name=nanoBragg
+        source_name=nanoBragg
+        start_time=NA
+        end_time_estimated=NA
+        sample_name=nanoBragg
+      }
+      """%nxmx_filename)).extract()
+    writer = nxmx_writer.NXmxWriter(params)
+    writer(imageset=self.imageset)

    if toggle_conventions:
      self.beamcenter_convention=CURRENT_CONV
@@ -295,6 +374,22 @@ def to_cbf(self, cbf_filename, toggle_conventions=False, intfile_scale=1.0):
      self.raw_pixels = cache_pixels
      # print("switch back to cached")

+def nexus_factory(nxmx_filename):
+  params = nxmx_writer.phil_scope.fetch(parse("""
+    output_file=%s
+    nexus_details {
+      instrument_name=nanoBragg
+      source_name=nanoBragg
+      start_time=NA
+      end_time_estimated=NA
+      sample_name=nanoBragg
+    }
+    dtype=int32
+    """%nxmx_filename)).extract()
+  writer = nxmx_writer.NXmxWriter(params)
+  return writer
+
+
 def make_imageset(data, beam, detector):
   format_class = FormatBraggInMemoryMultiPanel(data)
   reader = MemReaderNamedPath("virtual_Bragg_path", [format_class])
@@ -332,7 +427,7 @@ def get_mask(self, goniometer=None):
    """dummie place holder for mask, consider using internal nanoBragg mask"""
    return self.mask

-class FormatBraggInMemory:
+class FormatBraggInMemory(Format):

  def __init__(self, raw_pixels):
    self.raw_pixels = raw_pixels
@@ -358,6 +453,10 @@ def get_mask(self, goniometer=None):
    """dummie place holder for mask, consider using internal nanoBragg mask"""
    return self.mask,

+  @classmethod
+  def get_instance(Class, filename, **kwargs):
+    return Class(raw_pixels = kwargs.pop('raw_pixels'), **kwargs)
+
  #def paths(self):
  #  return ["InMemoryBraggPath"] # TODO: CBFLib complains if no datablock path provided which comes from path
diff --git a/simtbx/nanoBragg/anisotropic_mosaicity.py b/simtbx/nanoBragg/anisotropic_mosaicity.py
index edf3d20643..3934117b47 100644
--- a/simtbx/nanoBragg/anisotropic_mosaicity.py
+++ b/simtbx/nanoBragg/anisotropic_mosaicity.py
@@ -72,8 +72,8 @@ def _compute(rot_ax, ang_idx, eta_eff, Cvec, derivs=None, second_derivs=None):
    dU_d_theta = rot_ax.axis_and_angle_as_r3_derivative_wrt_angle(rot_sign*rot_ang, deg=False)  # 1st deriv
    d2U_d_theta2 = rot_ax.axis_and_angle_as_r3_derivative_wrt_angle(rot_sign*rot_ang, deg=False, second_order=True)  # second deriv
    for d, d2 in zip(d_theta_d_eta, dsquared_theta_d_eta_squared):
-      dU_d_eta = rot_sign*dU_d_theta*d
-      d2U_d_eta2 = d2U_d_theta2*(d**2) + dU_d_theta*d2
+      dU_d_eta = dU_d_theta*(rot_sign*d)
+      d2U_d_eta2 = d2U_d_theta2*(d**2) + dU_d_theta*(rot_sign*d2)
      Uprimes.append(dU_d_eta)
      Udblprimes.append(d2U_d_eta2)
diff --git a/simtbx/nanoBragg/nanoBragg.cpp b/simtbx/nanoBragg/nanoBragg.cpp
index 2c4948c4d9..14ba8b26b8 100644
--- a/simtbx/nanoBragg/nanoBragg.cpp
+++ b/simtbx/nanoBragg/nanoBragg.cpp
@@ -216,6 +216,9 @@ nanoBragg::init_defaults()
        exit(9);
    };

+    /* write cbf files in int precision? */
+    cbf_int = false;
+
    /* optional file stuff, to be removed eventually? */
    matfilename = NULL;
    hklfilename = NULL;
diff --git a/simtbx/nanoBragg/nanoBragg.h b/simtbx/nanoBragg/nanoBragg.h
index 00963071c5..bd1c1a5cc6 100644
--- a/simtbx/nanoBragg/nanoBragg.h
+++ b/simtbx/nanoBragg/nanoBragg.h
@@ -366,6 +366,7 @@ class nanoBragg {
    af::flex_double raw_pixels;
    unsigned short int *intimage;
    unsigned char *pgmimage;
+    bool cbf_int; // if saving the cbf file raw pixels will be converted to int
//    char *byte_order; // = get_byte_order();

    /* optional input image to extract background? */
//    SMVinfo imginfile;
diff --git a/simtbx/nanoBragg/nanoBragg_ext.cpp b/simtbx/nanoBragg/nanoBragg_ext.cpp
index 197170d696..269c11634f 100644
--- a/simtbx/nanoBragg/nanoBragg_ext.cpp
+++ b/simtbx/nanoBragg/nanoBragg_ext.cpp
@@ -1915,6 +1915,11 @@ printf("DEBUG: pythony_stolFbg[1]=(%g,%g)\n",nanoBragg.pythony_stolFbg[1][0],nan
                    make_setter(&nanoBragg::raw_pixels,dcp()),
                    "2D flex array representing floating-point pixel values, this is expected photons before you call add_noise(), which converts it into detector pixel values, or ADU")

+      .add_property("cbf_int",
+                    make_getter(&nanoBragg::cbf_int,rbv()),
+                    make_setter(&nanoBragg::cbf_int,dcp()),
+                    "Write the cbf file using to_cbf with int32 precision")
+
      /* print to screen a summary of all initialized parameters */
      .def("show_params",&nanoBragg::show_params,
           "print out all simulation parameters, just like the standalone program")
diff --git a/simtbx/nanoBragg/sim_data.py b/simtbx/nanoBragg/sim_data.py
index ac7d5cc56b..f40c2581da 100644
--- a/simtbx/nanoBragg/sim_data.py
+++ b/simtbx/nanoBragg/sim_data.py
@@ -360,6 +360,8 @@ def get_detector_corner_res(self):

  def update_Fhkl_tuple(self):
    if self.crystal.miller_array is not None:
+      if np.all(self.crystal.miller_array.data().as_numpy_array()==0):
+        raise ValueError("Seems all miller indices are 0")
      d_max, _ = self.crystal.miller_array.resolution_range()
      d_min = self.get_detector_corner_res()
      ma_on_detector = self.crystal.miller_array.resolution_filter(d_min=d_min, d_max=d_max)
@@ -470,8 +472,12 @@ def _init_diffBragg_umats(self):
    if self.crystal.anisotropic_mos_spread_deg is not None:
      if tuple(self.crystal.anisotropic_mos_spread_deg) == (0,0,0) and self.crystal.n_mos_domains != 1:
        raise ValueError("If more than 1 mosaic domain are passed, must set a positive value for anisotropic_mos_spread")
-      self.D.has_anisotropic_mosaic_spread = True
-      mosaicity = self.crystal.anisotropic_mos_spread_deg
+      if len(set(self.crystal.anisotropic_mos_spread_deg))==1:
+        self.D.has_anisotropic_mosaic_spread = False
+        mosaicity = self.crystal.anisotropic_mos_spread_deg[0]
+      else:
+        self.D.has_anisotropic_mosaic_spread = True
+        mosaicity = self.crystal.anisotropic_mos_spread_deg
    else:
      self.D.has_anisotropic_mosaic_spread = False
      mosaicity = self.crystal.mos_spread_deg
diff --git a/simtbx/nanoBragg/tst_gauss_argchk.py b/simtbx/nanoBragg/tst_gauss_argchk.py
index 87368ac24f..91ba3043ff 100644
--- a/simtbx/nanoBragg/tst_gauss_argchk.py
+++ b/simtbx/nanoBragg/tst_gauss_argchk.py
@@ -231,6 +231,7 @@ def simple_monochromatic_case(bragg_engine, BEAM, DETECTOR, CRYSTAL, SF_model, a
    SIM.to_smv_format(fileout="test_full_001.img", intfile_scale=output_scale)
    assert approx_equal(SIM.raw_pixels, SIM2.raw_pixels)
    SIM.to_cbf("test_full_001.cbf", intfile_scale=output_scale)
+    SIM.to_nexus_nxmx("test_full_001.h5", intfile_scale=output_scale)

 if runmode=="GPU":
  bragg_engine = nanoBragg.add_nanoBragg_spots_cuda
diff --git a/simtbx/run_tests.py b/simtbx/run_tests.py
index e333d6ca87..7add96ae7c 100644
--- a/simtbx/run_tests.py
+++ b/simtbx/run_tests.py
@@ -33,6 +33,7 @@
  ["$D/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py", "--idx 1"],
  ["$D/diffBragg/tests/tst_diffBragg_ncells_property_anisotropic.py", "--idx 2"],
  ["$D/diffBragg/tests/tst_diffBragg_unitcell_property.py", "--crystalsystem tetragonal" ],
+  ["$D/diffBragg/tests/tst_diffBragg_unitcell_property.py", "--crystalsystem hexagonal" ],
  ["$D/diffBragg/tests/tst_diffBragg_unitcell_property.py", "--crystalsystem monoclinic" ],
["$D/diffBragg/tests/tst_diffBragg_lambda_coefficients.py", "--idx 0"], ["$D/diffBragg/tests/tst_diffBragg_lambda_coefficients.py", "--idx 1"], diff --git a/xfel/merging/application/errors/error_modifier_ev11.py b/xfel/merging/application/errors/error_modifier_ev11.py index 004c9539d8..cf72dbb1c6 100644 --- a/xfel/merging/application/errors/error_modifier_ev11.py +++ b/xfel/merging/application/errors/error_modifier_ev11.py @@ -325,7 +325,7 @@ def calculate_initial_ev11_parameters(self): # Calculate initial EV11 parameters self.sfac = 1/slope self.sadd = offset - self.sb = math.sqrt(self.sadd) + self.sb = math.sqrt(self.sadd) if self.sadd > 0 else 0 ''' if True: