finish hw07 #11

Open · wants to merge 1 commit into main
115 changes: 99 additions & 16 deletions ANSWER.md
@@ -1,23 +1,110 @@
# Experiment environment
* 16 cores / 32 threads
```
L1d: 512 KiB (16 instances)
L1i: 512 KiB (16 instances)
L2: 8 MiB (16 instances)
L3: 64 MiB (2 instances)
```

# Before optimization

```
t=0: n=1120
matrix_randomize: 0.00971097s
matrix_randomize: 0.00703878s
matrix_transpose: 0.00927667s
matrix_multiply: 0.117476s
matrix_multiply: 0.117298s
matrix_RtAR: 0.244081s
matrix_trace: 0.00835175s
1.75932e+08
test_func: 0.274363s
t=1: n=928
matrix_randomize: 0.006546s
matrix_randomize: 0.00750128s
matrix_transpose: 0.010777s
matrix_multiply: 0.0455778s
matrix_multiply: 0.0537599s
matrix_RtAR: 0.110256s
matrix_trace: 0.00894227s
1.00156e+08
test_func: 0.141392s
t=2: n=1024
matrix_randomize: 0.00713252s
matrix_randomize: 0.00754496s
matrix_transpose: 0.00931327s
matrix_multiply: 0.194533s
matrix_multiply: 0.187665s
matrix_RtAR: 0.391531s
matrix_trace: 0.00910213s
1.34324e+08
test_func: 0.419415s
t=3: n=1056
matrix_randomize: 0.0118236s
matrix_randomize: 0.00950726s
matrix_transpose: 0.00668107s
matrix_multiply: 0.0764283s
matrix_multiply: 0.0805108s
matrix_RtAR: 0.163727s
matrix_trace: 0.00896033s
1.47405e+08
test_func: 0.197871s
overall: 1.0342s
```

# After optimization

```
t=0: n=1120
matrix_randomize: 0.000273793s
matrix_randomize: 0.000278233s
matrix_transpose: 0.00315305s
matrix_multiply: 0.0296057s
matrix_multiply: 0.0279403s
matrix_RtAR: 0.060762s
matrix_trace: 0.00917836s
1.75932e+08
test_func: 0.0749736s
t=1: n=928
matrix_randomize: 0.000221557s
matrix_randomize: 0.000185628s
matrix_transpose: 0.000508399s
matrix_multiply: 0.0319896s
matrix_multiply: 0.0188709s
matrix_RtAR: 0.0516253s
matrix_trace: 0.00931379s
1.00156e+08
test_func: 0.0648803s
t=2: n=1024
matrix_randomize: 0.000262414s
matrix_randomize: 0.000265054s
matrix_transpose: 0.000564065s
matrix_multiply: 0.0198131s
matrix_multiply: 0.0238569s
matrix_RtAR: 0.0443844s
matrix_trace: 0.0134706s
1.34324e+08
test_func: 0.062279s
t=3: n=1056
matrix_randomize: 0.000309561s
matrix_randomize: 0.000290673s
matrix_transpose: 0.000471451s
matrix_multiply: 0.0261711s
matrix_multiply: 0.0282188s
matrix_RtAR: 0.0549805s
matrix_trace: 0.00934559s
1.47405e+08
test_func: 0.0692714s
overall: 0.273484s
```

# Speedup

matrix_randomize: 50x
matrix_transpose: 20x
matrix_multiply: 6x
matrix_RtAR: 4x
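
For example, taking the t=0 pass from the logs above: matrix_RtAR went from 0.244081 s to 0.060762 s, i.e. 0.244081 / 0.060762 ≈ 4x.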

> If you recorded several optimization methods, you can compare them in a table.

@@ -26,20 +113,16 @@ matrix_RtAR: 10000x
How did you optimize each of the functions below? What was the idea, and which technique from the lectures did you use?

> matrix_randomize

Swap the loop order from XY to YX so the inner loop over x walks memory contiguously (Matrix is YX-ordered), which raises the cache hit rate.
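
A minimal sketch of the reordered loop (it mirrors the matrix_randomize change in the main.cpp diff further down):

```
for (int y = 0; y < ny; y++) {         // y outer: each y selects one contiguous row
    for (int x = 0; x < nx; x++) {     // x inner: out(x, y) advances by one float per step
        out(x, y) = wangsrng(x, y).next_float();
    }
}
```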

> matrix_transpose

TBB parallel_for over a blocked_range2d, combined with loop tiling (blocking); see the matrix_transpose change in the main.cpp diff below.

> matrix_multiply

Register blocking (processing x in tiles of 32) plus unrolling of the inner loop; see the matrix_multiply change in the main.cpp diff below.

> matrix_RtAR

Declare the two temporaries as static thread_local so each thread reuses its own buffers across calls instead of reallocating them every time.
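
A before/after sketch (both declarations are taken from the main.cpp diff further down):

```
// before: allocated on every call
// Matrix Rt, RtA;
// after: one set of buffers per thread, reused across calls
static thread_local Matrix Rt, RtA;
```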

# My innovations

4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

#find_package(TBB REQUIRED)
#target_link_libraries(main PUBLIC TBB::tbb)
find_package(TBB REQUIRED)
target_link_libraries(main PUBLIC TBB::tbb)

if (MSVC)
target_compile_options(main PUBLIC /fp:fast /arch:AVX)
60 changes: 57 additions & 3 deletions main.cpp
@@ -8,39 +8,60 @@
// Parallelization can use either OpenMP or TBB

#include <iostream>
//#include <x86intrin.h> // the _mm intrinsics all come from this header
#include <x86intrin.h> // the _mm intrinsics all come from this header
//#include <xmmintrin.h> // if the header above is unavailable, try this one
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"
#include <tbb/blocked_range.h>
#include <tbb/blocked_range2d.h>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>


// Matrix is a YX-ordered 2D float array: mat(x, y) = mat.data()[y * mat.shape(0) + x]
using Matrix = ndarray<2, float>;
// Note: aligned to 64 bytes by default; if 4096-byte alignment is needed, use ndarray<2, float, AlignedAllocator<4096, float>>

// using Matrix = ndarray<2, float, 0, 0, AlignedAllocator<float, 4096>>;
static void matrix_randomize(Matrix &out) {
TICK(matrix_randomize);
size_t nx = out.shape(0);
size_t ny = out.shape(1);

// Why is this loop inefficient? How can it be optimized? (10 points)
// Matrix is YX-ordered, but this loop traverses it in XY order, so the writes are not sequential in memory and the cache is poorly utilized.
/*
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
}
}
*/
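// Optimized version: with y outer and x inner, out(x, y) = data[y * nx + x] is written with unit stride.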

for (int y = 0; y < ny; y ++) {
for (int x = 0; x < nx; x ++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
}
}
TOCK(matrix_randomize);
}



static void matrix_transpose(Matrix &out, Matrix const &in) {
TICK(matrix_transpose);
size_t nx = in.shape(0);
size_t ny = in.shape(1);
out.reshape(ny, nx);

// 这个循环为什么不够高效?如何优化? 15 分
// Matrix是YX序列,此循环按照XY顺序访问二维数组,虽然out(y,x)是顺序访问,但是in(x, y)仍然是跳跃访问,
// 违背了空间局域性,每次跳跃了ny,所以只要缓存容量小于ny就无法命中,因此采用循环分块的方法,使得
// block_size^2的大小小于缓存容量,即可全部命中
/*
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
@@ -49,6 +70,22 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
}
TOCK(matrix_transpose);
}
*/
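// Tiled version: split the 2D index space into 64x64 blocks. Each block touches 64*64*4 B = 16 KiB
// of in plus 16 KiB of out, small enough to stay cache-resident while the tile is processed, and
// simple_partitioner guarantees chunks no larger than the grain size, so the tiling is preserved.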

constexpr int block_size = 64;
tbb::parallel_for(tbb::blocked_range2d<size_t>(0,ny, block_size, 0, nx, block_size),
[&](const tbb::blocked_range2d<size_t> &r){
for(size_t y=r.cols().begin(); y<r.cols().end(); y++){
for(size_t x=r.rows().begin(); x<r.rows().end(); x++){
out(x,y) = in(y,x);
}
}
},
tbb::simple_partitioner{}
);

TOCK(matrix_transpose);
}

static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
TICK(matrix_multiply);
@@ -62,6 +99,8 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
out.reshape(nx, ny);

// Why is this loop inefficient? How can it be optimized? (15 points)
// lhs(x, t) is read with a large stride in the innermost t loop, so the loop cannot be vectorized and
// the CPU cannot exploit instruction-level parallelism either.
/*
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
@@ -73,12 +112,27 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
}
TOCK(matrix_multiply);
}
*/
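// Register-blocked version: fix y and walk x in tiles of 32 (this assumes nx is a multiple of 32;
// the test sizes 1120/928/1024/1056 all are). The 32 partial sums out(xBase..xBase+31, y) are reused
// across every t iteration, out(x, y) and lhs(x, t) are unit-stride in x, and rhs(t, y) is
// loop-invariant in x, so the unrolled inner loop can be vectorized by the compiler.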
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y ++) {
for (int xBase = 0; xBase < nx; xBase += 32) {
for (int t = 0; t < nt; t ++) {
#pragma GCC unroll 4
for (int x = xBase; x < xBase + 32; x ++) {
out(x, y) += lhs(x, t) * rhs(t, y);
}
}
}
}
TOCK(matrix_multiply);
}

// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
TICK(matrix_RtAR);
// These two are temporaries; what can be optimized here? (5 points)
Matrix Rt, RtA;
// Declared static thread_local: each thread keeps and reuses its own buffers across calls instead of
// reallocating them every time.
static thread_local Matrix Rt, RtA;
matrix_transpose(Rt, R);
matrix_multiply(RtA, Rt, A);
matrix_multiply(RtAR, RtA, R);