perf: reduce threading overhead (#50)

This redesign enlarges each thread's task from a single row to a 2D slice. This vastly reduces the ratio of threading overhead to actual work and makes even single-threaded mode faster, so threading is more effective overall. Thanks @Pavlik1400 for the contribution!
seung-lab · Feb 18, 2024 · 12c9c98 · 12c9c98
1 parent dbe109a
commit 12c9c98
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 61 deletions.
diff --git a/cpp/edt.hpp b/cpp/edt.hpp
@@ -407,45 +407,45 @@ float* _edt3dsq(
   ThreadPool pool(parallel);
 
   for (size_t z = 0; z < sz; z++) {
-    for (size_t y = 0; y < sy; y++) {
-      pool.enqueue([labels, y, z, sx, sxy, wx, workspace, black_border](){
+    pool.enqueue([labels, sy, z, sx, sxy, wx, workspace, black_border](){
+      for (size_t y = 0; y < sy; y++) {
         squared_edt_1d_multi_seg<T>(
           (labels + sx * y + sxy * z), 
           (workspace + sx * y + sxy * z), 
           sx, 1, wx, black_border
         ); 
-      });
-    }
+      }
+    });
   }
 
   pool.join();
   pool.start(parallel);
 
   for (size_t z = 0; z < sz; z++) {
-    for (size_t x = 0; x < sx; x++) {
-      pool.enqueue([labels, x, sxy, z, workspace, sx, sy, wy, black_border](){
+    pool.enqueue([labels, sxy, z, workspace, sx, sy, wy, black_border](){
+      for (size_t x = 0; x < sx; x++) {
         squared_edt_1d_parabolic_multi_seg<T>(
           (labels + x + sxy * z),
           (workspace + x + sxy * z), 
           sy, sx, wy, black_border
         );
-      });
-    }
+      }
+    });
   }
 
   pool.join();
   pool.start(parallel);
 
   for (size_t y = 0; y < sy; y++) {
-    for (size_t x = 0; x < sx; x++) {
-      pool.enqueue([labels, x, sx, y, workspace, sz, sxy, wz, black_border](){
+    pool.enqueue([labels, sx, y, workspace, sz, sxy, wz, black_border](){
+      for (size_t x = 0; x < sx; x++) {
         squared_edt_1d_parabolic_multi_seg<T>(
           (labels + x + sx * y), 
           (workspace + x + sx * y), 
           sz, sxy, wz, black_border
         );
-      });
-    }
+      }
+    });
   }
 
   pool.join();
@@ -475,47 +475,48 @@ float* _binary_edt3dsq(
   ThreadPool pool(parallel);
 
   for (z = 0; z < sz; z++) {
-    for (y = 0; y < sy; y++) { 
-      pool.enqueue([binaryimg, sx, y, sxy, z, workspace, wx, black_border](){
+    pool.enqueue([binaryimg, sy, sx, sxy, z, workspace, wx, black_border](){
+      for (size_t y = 0; y < sy; y++) { 
         squared_edt_1d_multi_seg<T>(
           (binaryimg + sx * y + sxy * z), 
           (workspace + sx * y + sxy * z), 
           sx, 1, wx, black_border
         ); 
-      });
-    }
+      }
+    });
   }
 
   pool.join();
   pool.start(parallel);
 
   size_t offset;
   for (z = 0; z < sz; z++) {
-    for (x = 0; x < sx; x++) {
-      offset = x + sxy * z;
-      for (y = 0; y < sy; y++) {
-        if (workspace[offset + sx*y]) {
-          break;
+    pool.enqueue([sx, sy, sxy, z, workspace, wy, black_border, offset](){
+      for (size_t x = 0; x < sx; x++) {
+        offset = x + sxy * z;
+        size_t y;
+        for (y = 0; y < sy; y++) {
+          if (workspace[offset + sx*y]) {
+            break;
+          }
         }
-      }
 
-      pool.enqueue([sx, sy, y, workspace, wy, black_border, offset](){
         _squared_edt_1d_parabolic(
           (workspace + offset + sx * y), 
           sy - y, sx, wy, 
           black_border || (y > 0), black_border
         );
-      });
-    }
+      }
+    });
   }
 
   pool.join();
   pool.start(parallel);
 
   for (y = 0; y < sy; y++) {
-    for (x = 0; x < sx; x++) {
-      offset = x + sx * y;
-      pool.enqueue([sz, sxy, workspace, wz, black_border, offset](){
+    pool.enqueue([y, sx, sz, sxy, workspace, wz, black_border, offset](){
+      for (size_t x = 0; x < sx; x++) {
+        offset = x + sx * y;
         size_t z = 0;
         for (z = 0; z < sz; z++) {
           if (workspace[offset + sxy*z]) {
@@ -527,8 +528,8 @@ float* _binary_edt3dsq(
           sz - z, sxy, wz, 
           black_border || (z > 0), black_border
         );
-      });
-    }
+      }
+    });
   }
 
   pool.join();

diff --git a/cpp/test.cpp b/cpp/test.cpp
@@ -1,6 +1,7 @@
 #include "edt.hpp"
 
 #include <chrono>
+#include <thread>
 
 using namespace pyedt;
 using namespace edt;
@@ -66,7 +67,7 @@ void test2d(int n) {
   delete [] input;
 }
 
-double test3d(int n) {
+void test3d(int n) {
   int N = n*n*n;
   int* input = new int[N]();
 
@@ -80,33 +81,24 @@ double test3d(int n) {
 
   input[N / 2] = 0;
 
-  auto begin = std::chrono::high_resolution_clock::now();
+  printf("Warm up");
+  float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true); // Warmp up. 
+  const auto processor_count = std::thread::hardware_concurrency();
+  for (int nw = 1; nw <= processor_count; ++nw) {
+    auto begin = std::chrono::high_resolution_clock::now();
 
-  float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true);
+    float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true, nw);
 
-  auto end = std::chrono::high_resolution_clock::now();
+    auto end = std::chrono::high_resolution_clock::now();
 
-  if (n < 20) {
-    for (int i = 0; i < n*n*n; i++) {
-      if (i % n == 0 && i > 0) {
-        printf("\n");
-      }
-      if (i % (n*n) == 0 && i > 0) {
-        printf("\n");
-      }
-      printf("%.2f, ", dest[i]);
-    }
+    delete []dest;
 
-    printf("\n\n\n");
+    auto duration =
+          std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
+              .count();
+    auto secs = static_cast<double>(duration) / 1000. / 1000.;
+    printf("Took %.3f sec. with nw=%d\n", secs, nw);
   }
-
-  delete []dest;
-
-  auto duration =
-        std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
-            .count();
-  auto secs = static_cast<double>(duration) / 1000. / 1000.;
-  return secs;
 }
 
 void print(int *in, float* f, float* ans, int n) {
@@ -223,12 +215,5 @@ void test_two_d_parabola () {
 }
 
 int main () {
-  // try {
-  //  test_two_d_parabola();
-  // }
-  // catch (char const *c) {
-  //  printf("%s", c);
-  // }
-  auto secs = test3d(512);
-  printf("Took %.3f sec.\n", secs);
+  test3d(512);
 }