diff --git a/abacusnbody/hod/prepare_sim.py b/abacusnbody/hod/prepare_sim.py
index 3ae7fe02..51171338 100644
--- a/abacusnbody/hod/prepare_sim.py
+++ b/abacusnbody/hod/prepare_sim.py
@@ -1094,9 +1094,14 @@ def main(
     else:
         shearmark = None
     # N_dim = config['HOD_params']['Ndim']
-    nthread = int(
-        np.floor(multiprocessing.cpu_count() / config['prepare_sim']['Nparallel_load'])
-    )
+    nthread = config['prepare_sim'].get('Nthread_per_load', 'auto')
+    if nthread == 'auto':
+        nthread = (
+            len(os.sched_getaffinity(0)) // config['prepare_sim']['Nparallel_load']
+        )
+        print(f'prepare_sim inferred Nthread_per_load = {nthread}')
+    else:
+        nthread = int(nthread)
 
     p = multiprocessing.Pool(config['prepare_sim']['Nparallel_load'])
     p.starmap(
diff --git a/scripts/hod/config/abacus_hod.yaml b/scripts/hod/config/abacus_hod.yaml
index 68eb9442..d94f1027 100644
--- a/scripts/hod/config/abacus_hod.yaml
+++ b/scripts/hod/config/abacus_hod.yaml
@@ -13,7 +13,8 @@ sim_params:
     cleaned_halos: True                                                     # load cleaned halos?
 
 prepare_sim:
-    Nparallel_load: 5                                                          # number of thread for organizing simulation outputs (prepare_sim)
+    Nparallel_load: 5                                          # number of worker processes; peak memory usage scales by this factor.
+    Nthread_per_load: 'auto'                                   # number of threads per process ('auto' = CPUs in the affinity mask // Nparallel_load)
 
 # HOD parameters
 HOD_params:
diff --git a/scripts/hod/config/lc_hod.yaml b/scripts/hod/config/lc_hod.yaml
index d20487ba..7c88dc49 100644
--- a/scripts/hod/config/lc_hod.yaml
+++ b/scripts/hod/config/lc_hod.yaml
@@ -14,6 +14,7 @@ sim_params:
 
 prepare_sim:
     Nparallel_load: 1 # not sure if this makes a difference since we have a single slab
+    Nthread_per_load: 'auto'
 
 # HOD parameters
 HOD_params: