diff --git a/shared/libebm/bridge/bridge.h b/shared/libebm/bridge/bridge.h
index ed6293c17..35a7f2b65 100644
--- a/shared/libebm/bridge/bridge.h
+++ b/shared/libebm/bridge/bridge.h
@@ -26,10 +26,10 @@ static_assert(sizeof(UIntSmall) < sizeof(UIntBig), "UIntBig must be able to cont
 static_assert(sizeof(FloatSmall) < sizeof(FloatBig), "FloatBig must be able to contain FloatSmall");
 
 
-// This is large enough that with 1024 bins using AVX512F that the entire 16 SIMD tensors can fit into memory
-// It is larger than most L1 caches, so some of it will need to go into L2, but even with a lot of threads which 
-// share the L2/L3 cache, I do not think it should exceed the full L2/L3 caches, so it shouldn't use main memory.
-#define PARALLEL_BINS_BYTES_MAX (STATIC_CAST(size_t, (131072)))
+// Benchmarking suggests that things run faster when our data fits into the L1 data cache and get
+// slower once we depend on L2, so keep this within typical L1 data cache sizes, which can be as
+// small as 16kB (16384 bytes) on smaller mobile laptops and ARM.
+#define PARALLEL_BINS_BYTES_MAX (STATIC_CAST(size_t, (16384)))
 
 struct ApplyUpdateBridge {
    size_t m_cScores;