Skip to content

Commit

Permalink
perf: reduce threading overhead (#50)
Browse files Browse the repository at this point in the history
This redesign enlarges each thread pool task from a single row to a full 2D slice of the volume. This vastly reduces the ratio of threading overhead to actual work, which makes threading more effective overall and even makes single-threaded mode faster. Thanks @Pavlik1400 for the contribution!
  • Loading branch information
Pavlik1400 authored Feb 18, 2024
1 parent dbe109a commit 12c9c98
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 61 deletions.
61 changes: 31 additions & 30 deletions cpp/edt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,45 +407,45 @@ float* _edt3dsq(
ThreadPool pool(parallel);

for (size_t z = 0; z < sz; z++) {
for (size_t y = 0; y < sy; y++) {
pool.enqueue([labels, y, z, sx, sxy, wx, workspace, black_border](){
pool.enqueue([labels, sy, z, sx, sxy, wx, workspace, black_border](){
for (size_t y = 0; y < sy; y++) {
squared_edt_1d_multi_seg<T>(
(labels + sx * y + sxy * z),
(workspace + sx * y + sxy * z),
sx, 1, wx, black_border
);
});
}
}
});
}

pool.join();
pool.start(parallel);

for (size_t z = 0; z < sz; z++) {
for (size_t x = 0; x < sx; x++) {
pool.enqueue([labels, x, sxy, z, workspace, sx, sy, wy, black_border](){
pool.enqueue([labels, sxy, z, workspace, sx, sy, wy, black_border](){
for (size_t x = 0; x < sx; x++) {
squared_edt_1d_parabolic_multi_seg<T>(
(labels + x + sxy * z),
(workspace + x + sxy * z),
sy, sx, wy, black_border
);
});
}
}
});
}

pool.join();
pool.start(parallel);

for (size_t y = 0; y < sy; y++) {
for (size_t x = 0; x < sx; x++) {
pool.enqueue([labels, x, sx, y, workspace, sz, sxy, wz, black_border](){
pool.enqueue([labels, sx, y, workspace, sz, sxy, wz, black_border](){
for (size_t x = 0; x < sx; x++) {
squared_edt_1d_parabolic_multi_seg<T>(
(labels + x + sx * y),
(workspace + x + sx * y),
sz, sxy, wz, black_border
);
});
}
}
});
}

pool.join();
Expand Down Expand Up @@ -475,47 +475,48 @@ float* _binary_edt3dsq(
ThreadPool pool(parallel);

for (z = 0; z < sz; z++) {
for (y = 0; y < sy; y++) {
pool.enqueue([binaryimg, sx, y, sxy, z, workspace, wx, black_border](){
pool.enqueue([binaryimg, sy, sx, sxy, z, workspace, wx, black_border](){
for (size_t y = 0; y < sy; y++) {
squared_edt_1d_multi_seg<T>(
(binaryimg + sx * y + sxy * z),
(workspace + sx * y + sxy * z),
sx, 1, wx, black_border
);
});
}
}
});
}

pool.join();
pool.start(parallel);

size_t offset;
for (z = 0; z < sz; z++) {
for (x = 0; x < sx; x++) {
offset = x + sxy * z;
for (y = 0; y < sy; y++) {
if (workspace[offset + sx*y]) {
break;
pool.enqueue([sx, sy, sxy, z, workspace, wy, black_border, offset](){
for (size_t x = 0; x < sx; x++) {
offset = x + sxy * z;
size_t y;
for (y = 0; y < sy; y++) {
if (workspace[offset + sx*y]) {
break;
}
}
}

pool.enqueue([sx, sy, y, workspace, wy, black_border, offset](){
_squared_edt_1d_parabolic(
(workspace + offset + sx * y),
sy - y, sx, wy,
black_border || (y > 0), black_border
);
});
}
}
});
}

pool.join();
pool.start(parallel);

for (y = 0; y < sy; y++) {
for (x = 0; x < sx; x++) {
offset = x + sx * y;
pool.enqueue([sz, sxy, workspace, wz, black_border, offset](){
pool.enqueue([y, sx, sz, sxy, workspace, wz, black_border, offset](){
for (size_t x = 0; x < sx; x++) {
offset = x + sx * y;
size_t z = 0;
for (z = 0; z < sz; z++) {
if (workspace[offset + sxy*z]) {
Expand All @@ -527,8 +528,8 @@ float* _binary_edt3dsq(
sz - z, sxy, wz,
black_border || (z > 0), black_border
);
});
}
}
});
}

pool.join();
Expand Down
47 changes: 16 additions & 31 deletions cpp/test.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "edt.hpp"

#include <chrono>
#include <thread>

using namespace pyedt;
using namespace edt;
Expand Down Expand Up @@ -66,7 +67,7 @@ void test2d(int n) {
delete [] input;
}

double test3d(int n) {
void test3d(int n) {
int N = n*n*n;
int* input = new int[N]();

Expand All @@ -80,33 +81,24 @@ double test3d(int n) {

input[N / 2] = 0;

auto begin = std::chrono::high_resolution_clock::now();
printf("Warm up");
float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true); // Warmp up.
const auto processor_count = std::thread::hardware_concurrency();
for (int nw = 1; nw <= processor_count; ++nw) {
auto begin = std::chrono::high_resolution_clock::now();

float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true);
float* dest = edtsq<int>(input, n,n,n, 1.,1.,1., true, nw);

auto end = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();

if (n < 20) {
for (int i = 0; i < n*n*n; i++) {
if (i % n == 0 && i > 0) {
printf("\n");
}
if (i % (n*n) == 0 && i > 0) {
printf("\n");
}
printf("%.2f, ", dest[i]);
}
delete []dest;

printf("\n\n\n");
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
.count();
auto secs = static_cast<double>(duration) / 1000. / 1000.;
printf("Took %.3f sec. with nw=%d\n", secs, nw);
}

delete []dest;

auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
.count();
auto secs = static_cast<double>(duration) / 1000. / 1000.;
return secs;
}

void print(int *in, float* f, float* ans, int n) {
Expand Down Expand Up @@ -223,12 +215,5 @@ void test_two_d_parabola () {
}

int main () {
// try {
// test_two_d_parabola();
// }
// catch (char const *c) {
// printf("%s", c);
// }
auto secs = test3d(512);
printf("Took %.3f sec.\n", secs);
test3d(512);
}

0 comments on commit 12c9c98

Please sign in to comment.