finish hw07 #11

Open · wants to merge 1 commit into main
115 changes: 99 additions & 16 deletions ANSWER.md
@@ -1,23 +1,110 @@
# Experiment environment
* 16 cores / 32 threads
```
L1d: 512 KiB (16 instances)
L1i: 512 KiB (16 instances)
L2: 8 MiB (16 instances)
L3: 64 MiB (2 instances)
```

# Before optimization

```
t=0: n=1120
matrix_randomize: 0.00971097s
matrix_randomize: 0.00703878s
matrix_transpose: 0.00927667s
matrix_multiply: 0.117476s
matrix_multiply: 0.117298s
matrix_RtAR: 0.244081s
matrix_trace: 0.00835175s
1.75932e+08
test_func: 0.274363s
t=1: n=928
matrix_randomize: 0.006546s
matrix_randomize: 0.00750128s
matrix_transpose: 0.010777s
matrix_multiply: 0.0455778s
matrix_multiply: 0.0537599s
matrix_RtAR: 0.110256s
matrix_trace: 0.00894227s
1.00156e+08
test_func: 0.141392s
t=2: n=1024
matrix_randomize: 0.00713252s
matrix_randomize: 0.00754496s
matrix_transpose: 0.00931327s
matrix_multiply: 0.194533s
matrix_multiply: 0.187665s
matrix_RtAR: 0.391531s
matrix_trace: 0.00910213s
1.34324e+08
test_func: 0.419415s
t=3: n=1056
matrix_randomize: 0.0118236s
matrix_randomize: 0.00950726s
matrix_transpose: 0.00668107s
matrix_multiply: 0.0764283s
matrix_multiply: 0.0805108s
matrix_RtAR: 0.163727s
matrix_trace: 0.00896033s
1.47405e+08
test_func: 0.197871s
overall: 1.0342s
```

# After optimization

```
t=0: n=1120
matrix_randomize: 0.000273793s
matrix_randomize: 0.000278233s
matrix_transpose: 0.00315305s
matrix_multiply: 0.0296057s
matrix_multiply: 0.0279403s
matrix_RtAR: 0.060762s
matrix_trace: 0.00917836s
1.75932e+08
test_func: 0.0749736s
t=1: n=928
matrix_randomize: 0.000221557s
matrix_randomize: 0.000185628s
matrix_transpose: 0.000508399s
matrix_multiply: 0.0319896s
matrix_multiply: 0.0188709s
matrix_RtAR: 0.0516253s
matrix_trace: 0.00931379s
1.00156e+08
test_func: 0.0648803s
t=2: n=1024
matrix_randomize: 0.000262414s
matrix_randomize: 0.000265054s
matrix_transpose: 0.000564065s
matrix_multiply: 0.0198131s
matrix_multiply: 0.0238569s
matrix_RtAR: 0.0443844s
matrix_trace: 0.0134706s
1.34324e+08
test_func: 0.062279s
t=3: n=1056
matrix_randomize: 0.000309561s
matrix_randomize: 0.000290673s
matrix_transpose: 0.000471451s
matrix_multiply: 0.0261711s
matrix_multiply: 0.0282188s
matrix_RtAR: 0.0549805s
matrix_trace: 0.00934559s
1.47405e+08
test_func: 0.0692714s
overall: 0.273484s
```

# Speedup

matrix_randomize: 50x
matrix_transpose: 20x
matrix_multiply: 6x
matrix_RtAR: 4x
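
For example, taking the t=0 pass from the logs above: matrix_RtAR went from 0.244081 s to 0.060762 s, i.e. 0.244081 / 0.060762 ≈ 4x.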

> If you recorded several optimization methods, you can compare them in a table.

@@ -26,20 +113,16 @@ matrix_RtAR: 10000x
How did you optimize each of the functions below? What was the idea, and which technique from the lectures did you use?

> matrix_randomize

Swap the loop order from XY to YX so the inner loop over x walks memory contiguously (Matrix is YX-ordered), which raises the cache hit rate.
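
A minimal sketch of the reordered loop (it mirrors the matrix_randomize change in the main.cpp diff further down):

```
for (int y = 0; y < ny; y++) {         // y outer: each y selects one contiguous row
    for (int x = 0; x < nx; x++) {     // x inner: out(x, y) advances by one float per step
        out(x, y) = wangsrng(x, y).next_float();
    }
}
```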

> matrix_transpose

TBB parallel_for over a blocked_range2d, combined with loop tiling (blocking); see the matrix_transpose change in the main.cpp diff below.

> matrix_multiply

Register blocking (processing x in tiles of 32) plus unrolling of the inner loop; see the matrix_multiply change in the main.cpp diff below.

> matrix_RtAR

Declare the two temporaries as static thread_local so each thread reuses its own buffers across calls instead of reallocating them every time.
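
A before/after sketch (both declarations are taken from the main.cpp diff further down):

```
// before: allocated on every call
// Matrix Rt, RtA;
// after: one set of buffers per thread, reused across calls
static thread_local Matrix Rt, RtA;
```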

# My innovations

4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

#find_package(TBB REQUIRED)
#target_link_libraries(main PUBLIC TBB::tbb)
find_package(TBB REQUIRED)
target_link_libraries(main PUBLIC TBB::tbb)

if (MSVC)
target_compile_options(main PUBLIC /fp:fast /arch:AVX)
60 changes: 57 additions & 3 deletions main.cpp
@@ -8,39 +8,60 @@
// Parallelization can use either OpenMP or TBB

#include <iostream>
//#include <x86intrin.h> // the _mm intrinsics all come from this header
#include <x86intrin.h> // the _mm intrinsics all come from this header
//#include <xmmintrin.h> // if the header above is unavailable, try this one
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"
#include <tbb/blocked_range.h>
#include <tbb/blocked_range2d.h>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>


// Matrix is a YX-ordered 2D float array: mat(x, y) = mat.data()[y * mat.shape(0) + x]
using Matrix = ndarray<2, float>;
// Note: aligned to 64 bytes by default; if 4096-byte alignment is needed, use ndarray<2, float, AlignedAllocator<4096, float>>

// using Matrix = ndarray<2, float, 0, 0, AlignedAllocator<float, 4096>>;
static void matrix_randomize(Matrix &out) {
TICK(matrix_randomize);
size_t nx = out.shape(0);
size_t ny = out.shape(1);

// Why is this loop inefficient? How can it be optimized? (10 points)
// Matrix is YX-ordered, but this loop traverses it in XY order, so the writes are not sequential in memory and the cache is poorly utilized.
/*
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
}
}
*/
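// Optimized version: with y outer and x inner, out(x, y) = data[y * nx + x] is written with unit stride.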

for (int y = 0; y < ny; y ++) {
for (int x = 0; x < nx; x ++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
}
}
TOCK(matrix_randomize);
}



static void matrix_transpose(Matrix &out, Matrix const &in) {
TICK(matrix_transpose);
size_t nx = in.shape(0);
size_t ny = in.shape(1);
out.reshape(ny, nx);

// 这个循环为什么不够高效?如何优化? 15 分
// Matrix是YX序列,此循环按照XY顺序访问二维数组,虽然out(y,x)是顺序访问,但是in(x, y)仍然是跳跃访问,
// 违背了空间局域性,每次跳跃了ny,所以只要缓存容量小于ny就无法命中,因此采用循环分块的方法,使得
// block_size^2的大小小于缓存容量,即可全部命中
/*
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
@@ -49,6 +70,22 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
}
TOCK(matrix_transpose);
}
*/
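// Tiled version: split the 2D index space into 64x64 blocks. Each block touches 64*64*4 B = 16 KiB
// of in plus 16 KiB of out, small enough to stay cache-resident while the tile is processed, and
// simple_partitioner guarantees chunks no larger than the grain size, so the tiling is preserved.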

constexpr int block_size = 64;
tbb::parallel_for(tbb::blocked_range2d<size_t>(0,ny, block_size, 0, nx, block_size),
[&](const tbb::blocked_range2d<size_t> &r){
for(size_t y=r.cols().begin(); y<r.cols().end(); y++){
for(size_t x=r.rows().begin(); x<r.rows().end(); x++){
out(x,y) = in(y,x);
}
}
},
tbb::simple_partitioner{}
);

TOCK(matrix_transpose);
}

static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
TICK(matrix_multiply);
@@ -62,6 +99,8 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
out.reshape(nx, ny);

// Why is this loop inefficient? How can it be optimized? (15 points)
// lhs(x, t) is read with a large stride in the innermost t loop, so the loop cannot be vectorized and
// the CPU cannot exploit instruction-level parallelism either.
/*
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
@@ -73,12 +112,27 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
}
TOCK(matrix_multiply);
}
*/
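// Register-blocked version: fix y and walk x in tiles of 32 (this assumes nx is a multiple of 32;
// the test sizes 1120/928/1024/1056 all are). The 32 partial sums out(xBase..xBase+31, y) are reused
// across every t iteration, out(x, y) and lhs(x, t) are unit-stride in x, and rhs(t, y) is
// loop-invariant in x, so the unrolled inner loop can be vectorized by the compiler.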
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y ++) {
for (int xBase = 0; xBase < nx; xBase += 32) {
for (int t = 0; t < nt; t ++) {
#pragma GCC unroll 4
for (int x = xBase; x < xBase + 32; x ++) {
out(x, y) += lhs(x, t) * rhs(t, y);
}
}
}
}
TOCK(matrix_multiply);
}

// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
TICK(matrix_RtAR);
// These two are temporaries; what can be optimized here? (5 points)
Matrix Rt, RtA;
// Declared static thread_local: each thread keeps and reuses its own buffers across calls instead of
// reallocating them every time.
static thread_local Matrix Rt, RtA;
matrix_transpose(Rt, R);
matrix_multiply(RtA, Rt, A);
matrix_multiply(RtAR, RtA, R);