diff --git a/shared/libebm/bridge/bridge.h b/shared/libebm/bridge/bridge.h index ed6293c17..35a7f2b65 100644 --- a/shared/libebm/bridge/bridge.h +++ b/shared/libebm/bridge/bridge.h @@ -26,10 +26,10 @@ static_assert(sizeof(UIntSmall) < sizeof(UIntBig), "UIntBig must be able to cont static_assert(sizeof(FloatSmall) < sizeof(FloatBig), "FloatBig must be able to contain FloatSmall"); -// This is large enough that with 1024 bins using AVX512F that the entire 16 SIMD tensors can fit into memory -// It is larger than most L1 caches, so some of it will need to go into L2, but even with a lot of threads which -// share the L2/L3 cache, I do not think it should exceed the full L2/L3 caches, so it shouldn't use main memory. -#define PARALLEL_BINS_BYTES_MAX (STATIC_CAST(size_t, (131072))) +// from benchmarking, it seems that things are faster if our data can fit into L1 data cache, but get +// slower if we depend on L2, so keep it under typical L1 data cache sizes, which for smaller mobile +// laptops and ARM is 16kB +#define PARALLEL_BINS_BYTES_MAX (STATIC_CAST(size_t, (16384))) struct ApplyUpdateBridge { size_t m_cScores;