diff --git a/ANSWER.md b/ANSWER.md
index 83349d8..46e8375 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -1,45 +1,134 @@
+# Environment
+
+entry | info
+--- | ---
+OS | Ubuntu 20.04 on WSL2
+Architecture | x86_64
+CPU(s) | 8
+Thread(s) per core | 2
+L1d cache | 128 KiB
+L1i cache | 128 KiB
+L2 cache | 1 MiB
+L3 cache | 6 MiB
+
 # Before optimization
 
 ```
-Paste the run results from before the optimization here.
-matrix_randomize: 100s
+t=0: n=1120
+matrix_randomize: 0.0036149s
+matrix_randomize: 0.0073586s
+matrix_transpose: 0.0058029s
+matrix_multiply: 1.57465s
+matrix_multiply: 2.1565s
+matrix_RtAR: 3.73763s
+matrix_trace: 1.49e-05s
+1.75932e+08
+test_func: 3.75655s
+t=1: n=928
+matrix_randomize: 0.0038179s
+matrix_randomize: 0.0058834s
+matrix_transpose: 0.0037703s
+matrix_multiply: 0.463579s
+matrix_multiply: 0.385973s
+matrix_RtAR: 0.854033s
+matrix_trace: 2.59e-05s
+1.00156e+08
+test_func: 0.86811s
+t=2: n=1024
+matrix_randomize: 0.0049764s
+matrix_randomize: 0.0043571s
+matrix_transpose: 0.0112257s
+matrix_multiply: 1.86208s
+matrix_multiply: 1.89948s
+matrix_RtAR: 3.77345s
+matrix_trace: 1.25e-05s
+1.34324e+08
+test_func: 3.78877s
+t=3: n=1056
+matrix_randomize: 0.0032734s
+matrix_randomize: 0.0041083s
+matrix_transpose: 0.0088871s
+matrix_multiply: 1.37759s
+matrix_multiply: 1.42704s
+matrix_RtAR: 2.81431s
+matrix_trace: 1.3e-05s
+1.47405e+08
+test_func: 2.82567s
+overall: 11.2438s
 ```
 
 # After optimization
 
 ```
-Paste the run results from after the optimization here.
-matrix_randomize: 0.01s
+t=0: n=1120
+matrix_randomize: 0.0044706s
+matrix_randomize: 0.0009012s
+matrix_transpose: 0.004934s
+matrix_multiply: 1.09293s
+matrix_multiply: 1.22578s
+matrix_RtAR: 2.32385s
+matrix_trace: 0.0004732s
+1.75932e+08
+test_func: 2.34133s
+t=1: n=928
+matrix_randomize: 0.0010455s
+matrix_randomize: 0.0007095s
+matrix_transpose: 0.0026342s
+matrix_multiply: 0.274808s
+matrix_multiply: 0.29029s
+matrix_RtAR: 0.56793s
+matrix_trace: 0.0004422s
+1.00156e+08
+test_func: 0.573571s
+t=2: n=1024
+matrix_randomize: 0.0029974s
+matrix_randomize: 0.0044855s
+matrix_transpose: 0.001516s
+matrix_multiply: 1.6364s
+matrix_multiply: 1.66931s
+matrix_RtAR: 3.30743s
+matrix_trace: 0.0011572s
+1.34324e+08
+test_func: 3.32015s
+t=3: n=1056
+matrix_randomize: 0.0014983s
+matrix_randomize: 0.0014983s
+matrix_transpose: 0.0028232s
+matrix_multiply: 0.700534s
+matrix_multiply: 0.729549s
+matrix_RtAR: 1.43312s
+matrix_trace: 0.0017723s
+1.47405e+08
+test_func: 1.44506s
+overall: 7.68174s
 ```
 
 # Speedup
 
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
-
-> If you recorded more than one optimization approach, you can compare them in a table
+function | time (before) | time (after) | speedup
+--- | --- | --- | ---
+matrix_randomize | 0.00467375s | 0.0022007875s | 2.124x
+matrix_transpose | 0.00742150s | 0.00297685s | 2.493x
+matrix_multiply | 1.3933615s | 0.9524501s | 1.463x
+matrix_RtAR | 2.79485575s | 1.90808250s | 1.465x
+
+Times are averages over one whole run: eight calls for matrix_randomize and matrix_multiply, four calls for matrix_transpose and matrix_RtAR.
 
 # Optimization methods
 
-How did you optimize each of the functions below? What was the idea? Which technique from the lectures did you use?
-
 > matrix_randomize
 
-Please answer.
+Changed the traversal to YX order (y outer, x inner) so that writes are contiguous, and switched to `_mm256_stream_ps` so results are streamed straight to memory with non-temporal stores.
 
 > matrix_transpose
 
-Please answer.
+Optimized with loop tiling plus a Morton-order traversal: the block size is chosen so that `BLOCK_SIZE^2` elements stay well below the L1 cache size, and TBB's `tbb::simple_partitioner` is used to drive the Morton-order traversal of the tiles.
 
 > matrix_multiply
 
-Please answer.
+Same optimization as `matrix_transpose`, plus a local variable that holds the running sum, so the result is written to `out(x, y)` only once at the end.
 
 > matrix_RtAR
 
-Please answer.
+Made the two temporaries function-local statics, i.e. manual pooling, so their memory is not reallocated on every call. P.S. The function is not called from multiple threads here, but they are still marked `thread_local` as shown in class, which keeps it thread-safe.
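+
+Below is a minimal sketch of this pooling pattern. The `compute` helper is hypothetical (not part of main.cpp), and it assumes that `Matrix` is the alias used in `main.cpp` and that `Matrix::reshape` keeps its existing buffer when the shape does not change; that assumption is not verified here.
+
+```cpp
+// Hypothetical sketch, not the homework code itself.
+void compute(Matrix &out, Matrix const &in) {
+    // Allocated once per thread on first use, then reused by every later call.
+    static thread_local Matrix scratch;
+    scratch.reshape(in.shape(0), in.shape(1)); // assumed to reuse the old buffer when the shape matches
+    // ... use scratch as workspace, then write the final result into out ...
+}
+```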
 
 # My innovations
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d76276..3cd661c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
 find_package(OpenMP REQUIRED)
 target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
 
-#find_package(TBB REQUIRED)
-#target_link_libraries(main PUBLIC TBB::tbb)
+find_package(TBB REQUIRED)
+target_link_libraries(main PUBLIC TBB::tbb)
 
 if (MSVC)
     target_compile_options(main PUBLIC /fp:fast /arch:AVX)
diff --git a/main.cpp b/main.cpp
index d5af053..9047381 100644
--- a/main.cpp
+++ b/main.cpp
@@ -8,8 +8,12 @@
 // Parallelism can be done with either OpenMP or TBB
 #include <omp.h>
-//#include <x86intrin.h>          // the _mm intrinsics all come from this header
+#include <immintrin.h>
+#include <x86intrin.h>            // the _mm intrinsics all come from this header
 //#include <intrin.h>              // if the one above does not work, try this one
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
+#include <tbb/partitioner.h>
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
@@ -24,11 +28,22 @@ static void matrix_randomize(Matrix &out) {
     size_t ny = out.shape(1);
     // Why is this loop not efficient? How can it be optimized? (10 points)
+    // Answer: the inner loop runs over y, so writes to out(x, y) are strided rather than
+    // contiguous, which wastes memory bandwidth. Fix: swap the loop axes and use streaming (write-through) stores.
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            float val = wangsrng(x, y).next_float();
-            out(x, y) = val;
+    for (int y = 0; y < ny; ++y) {
+        for (int x = 0; x < nx; x += 8) {
+            __m256 res = {
+                wangsrng(x, y).next_float(),
+                wangsrng(x+1, y).next_float(),
+                wangsrng(x+2, y).next_float(),
+                wangsrng(x+3, y).next_float(),
+                wangsrng(x+4, y).next_float(),
+                wangsrng(x+5, y).next_float(),
+                wangsrng(x+6, y).next_float(),
+                wangsrng(x+7, y).next_float()
+            };
+            _mm256_stream_ps(&out(x, y), res);
         }
     }
     TOCK(matrix_randomize);
 }
@@ -41,12 +56,20 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
 
     // Why is this loop not efficient? How can it be optimized? (15 points)
-#pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            out(y, x) = in(x, y);
-        }
-    }
+    // Answer: the inner loop writes out(y, x) contiguously, but reads in(x, y) with a stride of nx.
+    // Optimize with loop tiling plus a Morton-order traversal.
+
+    const int BLOCK_SIZE = 32; // 32 * 32 * 4 B = 4 KB = one page, and every n is a multiple of 32
+    tbb::parallel_for(tbb::blocked_range2d<int>(0, nx, BLOCK_SIZE, 0, ny, BLOCK_SIZE),
+    [&in, &out](const tbb::blocked_range2d<int> &r) {
+        for (int x = r.rows().begin(); x != r.rows().end(); ++x) {
+            for (int y = r.cols().begin(); y != r.cols().end(); ++y) {
+                out(y, x) = in(x, y);
+            }
+        }
+    },
+    tbb::simple_partitioner{}
+    );
     TOCK(matrix_transpose);
 }
@@ -62,15 +85,24 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
     out.reshape(nx, ny);
 
     // Why is this loop not efficient? How can it be optimized? (15 points)
-#pragma omp parallel for collapse(2)
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = 0; // Is it necessary to initialize this manually? (5 points)
-            for (int t = 0; t < nt; t++) {
-                out(x, y) += lhs(x, t) * rhs(t, y);
+    // Answer: the innermost loop reads rhs(t, y) contiguously, but lhs(x, t) is strided and out(x, y) is touched on every t iteration.
+    // Optimize with loop tiling plus a Morton-order traversal.
+    const int BLOCK_SIZE = 32; // 32 * 32 * 4 B = 4 KB = one page, and every n is a multiple of 32
+    tbb::parallel_for(tbb::blocked_range2d<int>(0, nx, BLOCK_SIZE, 0, ny, BLOCK_SIZE),
+    [&out, &lhs, &rhs, nt](const tbb::blocked_range2d<int> &r) {
+        for (int x = r.rows().begin(); x != r.rows().end(); ++x) {
+            for (int y = r.cols().begin(); y != r.cols().end(); ++y) {
+                float tmp = 0.0f;
+                for (int t = 0; t < nt; ++t) {
+                    tmp += lhs(x, t) * rhs(t, y);
+                }
+                out(x, y) = tmp; // Is it necessary to initialize out(x, y) manually? (5 points)
+                // No need: a local accumulator holds the sum, and each element is written exactly once at the end.
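+                // Keeping the sum in tmp lets it live in a register, so the t loop issues
+                // no loads or stores of out(x, y); the 32x32 tile also means the lhs rows and
+                // rhs columns it touches get reused while they are still in cache.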
+            }
+        }
-            }
-        }
-    }
+    },
+    tbb::simple_partitioner{}
+    );
     TOCK(matrix_multiply);
 }
@@ -78,7 +110,9 @@ static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
     TICK(matrix_RtAR);
     // These two are temporaries; what can be optimized about them? (5 points)
-    Matrix Rt, RtA;
+    // Answer: make them function-local statics (manual pooling) so they are not reallocated on every call; thread_local keeps this thread-safe.
+    static thread_local Matrix Rt, RtA;
+
     matrix_transpose(Rt, R);
     matrix_multiply(RtA, Rt, A);
     matrix_multiply(RtAR, RtA, R);