finish hw07 #9

Open · wants to merge 1 commit into base: main
121 changes: 105 additions & 16 deletions ANSWER.md
@@ -1,45 +1,134 @@
# Environment

entry | info
--- | ---
OS | Ubuntu 20.04 (WSL2)
Architecture | x86_64
CPU(s) | 8
Thread(s) per core | 2
L1d cache | 128 KiB
L1i cache | 128 KiB
L2 cache | 1 MiB
L3 cache | 6 MiB

# Before optimization

```
t=0: n=1120
matrix_randomize: 0.0036149s
matrix_randomize: 0.0073586s
matrix_transpose: 0.0058029s
matrix_multiply: 1.57465s
matrix_multiply: 2.1565s
matrix_RtAR: 3.73763s
matrix_trace: 1.49e-05s
1.75932e+08
test_func: 3.75655s
t=1: n=928
matrix_randomize: 0.0038179s
matrix_randomize: 0.0058834s
matrix_transpose: 0.0037703s
matrix_multiply: 0.463579s
matrix_multiply: 0.385973s
matrix_RtAR: 0.854033s
matrix_trace: 2.59e-05s
1.00156e+08
test_func: 0.86811s
t=2: n=1024
matrix_randomize: 0.0049764s
matrix_randomize: 0.0043571s
matrix_transpose: 0.0112257s
matrix_multiply: 1.86208s
matrix_multiply: 1.89948s
matrix_RtAR: 3.77345s
matrix_trace: 1.25e-05s
1.34324e+08
test_func: 3.78877s
t=3: n=1056
matrix_randomize: 0.0032734s
matrix_randomize: 0.0041083s
matrix_transpose: 0.0088871s
matrix_multiply: 1.37759s
matrix_multiply: 1.42704s
matrix_RtAR: 2.81431s
matrix_trace: 1.3e-05s
1.47405e+08
test_func: 2.82567s
overall: 11.2438s
```

# After optimization

```
t=0: n=1120
matrix_randomize: 0.0044706s
matrix_randomize: 0.0009012s
matrix_transpose: 0.004934s
matrix_multiply: 1.09293s
matrix_multiply: 1.22578s
matrix_RtAR: 2.32385s
matrix_trace: 0.0004732s
1.75932e+08
test_func: 2.34133s
t=1: n=928
matrix_randomize: 0.0010455s
matrix_randomize: 0.0007095s
matrix_transpose: 0.0026342s
matrix_multiply: 0.274808s
matrix_multiply: 0.29029s
matrix_RtAR: 0.56793s
matrix_trace: 0.0004422s
1.00156e+08
test_func: 0.573571s
t=2: n=1024
matrix_randomize: 0.0029974s
matrix_randomize: 0.0044855s
matrix_transpose: 0.001516s
matrix_multiply: 1.6364s
matrix_multiply: 1.66931s
matrix_RtAR: 3.30743s
matrix_trace: 0.0011572s
1.34324e+08
test_func: 3.32015s
t=3: n=1056
matrix_randomize: 0.0014983s
matrix_randomize: 0.0014983s
matrix_transpose: 0.0028232s
matrix_multiply: 0.700534s
matrix_multiply: 0.729549s
matrix_RtAR: 1.43312s
matrix_trace: 0.0017723s
1.47405e+08
test_func: 1.44506s
overall: 7.68174s
```

# Speedup

Per-function averages over the four test sizes:

function | time (pre) | time (optimized) | speedup
--- | --- | --- | ---
matrix_randomize | 0.00467375s | 0.0022007875s | 2.124x
matrix_transpose | 0.00742150s | 0.00297685s | 2.493x
matrix_multiply | 1.3933615s | 0.9524501s | 1.463x
matrix_RtAR | 2.79485575s | 1.90808250s | 1.465x

# Optimization methods

How was each of the functions below optimized? What was the idea, and which technique from the lectures was used?

> matrix_randomize

Switched the loop nest to YX order so the inner loop walks the contiguous axis, and used `_mm256_stream_ps` to write the results straight to memory with non-temporal stores.
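A minimal sketch of this idea, with a deterministic stand-in for the repo's `wangsrng` generator and a hypothetical flat buffer (`buf`, `fill_yx` are illustrative names, not from the homework code):

```cpp
#include <x86intrin.h>  // AVX intrinsics (_mm256_*)
#include <cassert>

constexpr int NX = 16, NY = 4;
alignas(32) float buf[NX * NY];  // _mm256_stream_ps needs 32-byte alignment

// YX order: the inner loop walks the contiguous x axis, 8 floats at a time,
// and each vector is written with a non-temporal (cache-bypassing) store.
__attribute__((target("avx")))  // lets GCC/Clang compile this without -mavx
void fill_yx() {
    for (int y = 0; y < NY; ++y) {
        for (int x = 0; x < NX; x += 8) {
            // stand-in values; the real code calls wangsrng(x + i, y).next_float()
            __m256 v = _mm256_set_ps(float(x + 7), float(x + 6), float(x + 5),
                                     float(x + 4), float(x + 3), float(x + 2),
                                     float(x + 1), float(x + 0));
            _mm256_stream_ps(&buf[y * NX + x], v);  // streaming store
        }
    }
    _mm_sfence();  // make the non-temporal stores globally visible
}
```

Note that streaming stores only pay off when the whole output is written once and not read back soon, which is exactly the randomize pattern.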

> matrix_transpose

Loop tiling + Morton ordering: the block size is chosen so that `BLOCKSIZE ^ 2` floats stay below the L1 cache size, and TBB's `tbb::simple_partitioner` is used to get the Morton-order traversal of the tiles.

> matrix_multiply

Same tiling as `matrix_transpose`, but the inner product is accumulated in a local temporary and only written to `out(x, y)` once at the end.
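The accumulator point in isolation, as a scalar sketch without the tiling (`matmul_acc` is a made-up name):

```cpp
#include <cassert>

// Why the local accumulator helps: `acc` can live in a register for the whole
// inner loop instead of out[x*n + y] being re-read and re-written on every
// iteration; a single final store also makes manual pre-zeroing unnecessary.
void matmul_acc(float *out, const float *a, const float *b, int n) {
    for (int x = 0; x < n; ++x)
        for (int y = 0; y < n; ++y) {
            float acc = 0.0f;                     // register accumulator
            for (int t = 0; t < n; ++t)
                acc += a[x * n + t] * b[t * n + y];
            out[x * n + y] = acc;                 // one store at the end
        }
}
```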

> matrix_RtAR

Made the two temporaries function-local statics, i.e. manual pooling, to avoid repeated allocation. P.S. Even though this code is not multithreaded, they are also marked `thread_local` as shown in class, to keep it thread-safe.
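A sketch of the pooling pattern on its own, with a hypothetical `get_scratch` helper (not from the homework code):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Manual pooling: a function-local static keeps its storage alive across
// calls, so the buffer is allocated once and only regrown when the requested
// size exceeds the current one; thread_local gives each thread its own pool.
float *get_scratch(std::size_t n) {
    static thread_local std::vector<float> pool;
    if (pool.size() < n) pool.resize(n);  // grows, never shrinks
    return pool.data();
}
```

The same reasoning applies to `Rt` and `RtA` above: `static thread_local Matrix` means their backing storage is reused on every call to `matrix_RtAR` instead of being reallocated.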

# My innovations

4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

#find_package(TBB REQUIRED)
#target_link_libraries(main PUBLIC TBB::tbb)
find_package(TBB REQUIRED)
target_link_libraries(main PUBLIC TBB::tbb)

if (MSVC)
target_compile_options(main PUBLIC /fp:fast /arch:AVX)
74 changes: 54 additions & 20 deletions main.cpp
@@ -8,8 +8,12 @@
// Parallelism can be implemented with either OpenMP or TBB

#include <iostream>
//#include <x86intrin.h> // the _mm intrinsic family comes from this header
#include <cstring>
#include <x86intrin.h> // the _mm intrinsic family comes from this header
//#include <xmmintrin.h> // if the one above doesn't work, try this
#include <tbb/parallel_for.h>
#include <tbb/blocked_range2d.h>
#include <tbb/partitioner.h>
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"
@@ -24,11 +28,22 @@ static void matrix_randomize(Matrix &out) {
size_t ny = out.shape(1);

// Why is this loop inefficient? How can it be optimized? (10 points)
// Answer: the inner loop iterated over y, so the memory accesses were strided rather than contiguous, hurting access efficiency.
// Fix: swap the loop axes and use streaming (write-through) stores.
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
for (int y = 0; y < ny; ++y) {
for (int x = 0; x < nx; x += 8) {
__m256 res = {
wangsrng(x, y).next_float(),
wangsrng(x+1, y).next_float(),
wangsrng(x+2, y).next_float(),
wangsrng(x+3, y).next_float(),
wangsrng(x+4, y).next_float(),
wangsrng(x+5, y).next_float(),
wangsrng(x+6, y).next_float(),
wangsrng(x+7, y).next_float()
};
_mm256_stream_ps(&out(x, y), res);
}
}
TOCK(matrix_randomize);
@@ -41,12 +56,20 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
out.reshape(ny, nx);

// Why is this loop inefficient? How can it be optimized? (15 points)
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
out(y, x) = in(x, y);
}
}
// Answer: in the inner loop the writes to out(y, x) are contiguous, but the reads from in(x, y) are not.
// Can be optimized with loop tiling + Morton ordering.

const int BLOCK_SIZE = 32; // 32 * 32 * 4B = 4KB = 1 page; assumes n is a multiple of 32
tbb::parallel_for(tbb::blocked_range2d<size_t>(0, nx, BLOCK_SIZE, 0, ny, BLOCK_SIZE),
[&in, &out](const tbb::blocked_range2d<size_t> &r) {
for (int x = r.rows().begin(); x != r.rows().end(); ++x) {
for (int y = r.cols().begin(); y != r.cols().end(); ++y) {
out(y, x) = in(x, y);
}
}
},
tbb::simple_partitioner{}
);
TOCK(matrix_transpose);
}

@@ -62,23 +85,34 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
out.reshape(nx, ny);

// Why is this loop inefficient? How can it be optimized? (15 points)
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
out(x, y) = 0; // Is manual initialization necessary? (5 points)
for (int t = 0; t < nt; t++) {
out(x, y) += lhs(x, t) * rhs(t, y);
// Answer: in the innermost loop the accesses to rhs(t, y) are contiguous, but out(x, y) and lhs(x, t) are not.
// Can be optimized with loop tiling + Morton ordering.
const int BLOCK_SIZE = 32; // 32 * 32 * 4B = 4KB = 1 page; assumes n is a multiple of 32
tbb::parallel_for(tbb::blocked_range2d<size_t>(0, nx, BLOCK_SIZE, 0, ny, BLOCK_SIZE),
[&out, &lhs, &rhs, nt](const tbb::blocked_range2d<size_t> &r) {
for (int x = r.rows().begin(); x != r.rows().end(); ++x) {
for (int y = r.cols().begin(); y != r.cols().end(); ++y) {
float tmp = 0.0f;
for (int t = 0; t < nt; ++t) {
tmp += lhs(x, t) * rhs(t, y);
}
out(x, y) = tmp; // Is manual initialization necessary? (5 points)
// No: with a temporary accumulating the sum, no manual zeroing is needed.
}
}
}
}
},
tbb::simple_partitioner{}
);
TOCK(matrix_multiply);
}

// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
TICK(matrix_RtAR);
// These two are temporaries; what can be optimized here? (5 points)
Matrix Rt, RtA;
// Make them function-local statics to avoid repeated allocation; thread_local keeps this thread-safe.
static thread_local Matrix Rt, RtA;

matrix_transpose(Rt, R);
matrix_multiply(RtA, Rt, A);
matrix_multiply(RtAR, RtA, R);