forked from mratsim/laser
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathex05_tensor_parallel_reduction.nim
95 lines (85 loc) · 2.68 KB
/
ex05_tensor_parallel_reduction.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# ################################################################
#
# Example of using the fine-grained
# parallel forEachStaged iterator
# for parallel reduction on arbitrary number of tensors
#
# ################################################################
import ../laser/strided_iteration/[foreach, foreach_staged]
import ../laser/tensor/[datatypes, allocator, initialization]
import ../laser/[compiler_optim_hints, dynamic_stack_arrays]
import ../laser/[openmp, cpuinfo]
import sequtils
proc reduction_localsum_critical[T](x, y: Tensor[T]): T =
  ## Returns the sum of every element of `x` plus every element of `y`.
  ##
  ## The loop body only ever adds to a local accumulator declared in
  ## `before_loop`; that accumulator is folded into `result` once, in
  ## `after_loop`, under `omp_critical` so that concurrent merges cannot
  ## interleave.
  forEachStaged xi in x, yi in y:
    openmp_config:
      use_openmp: true
      use_simd: false
      nowait: true
      omp_grain_size: OMP_MEMORY_BOUND_GRAIN_SIZE
    iteration_kind:
      # Default, "contiguous", "strided" are also possible
      {contiguous, strided}
    before_loop:
      # Partial sum for this pass over the data (presumably one per OpenMP
      # thread), so the hot loop never writes to the shared `result`.
      var thread_sum = T(0)
    in_loop:
      thread_sum += xi + yi
    after_loop:
      # Only this final merge is serialized.
      omp_critical:
        result += thread_sum
proc reduction_localsum_omp_atomic[T](x, y: Tensor[T]): T =
  ## Returns the sum of every element of `x` plus every element of `y`.
  ##
  ## Same local-accumulator scheme as the other `reduction_localsum_*`
  ## examples, but the final merge into `result` is protected by an
  ## `omp atomic` pragma emitted directly into the generated backend code.
  forEachStaged xi in x, yi in y:
    openmp_config:
      use_simd: false
      nowait: true
    iteration_kind:
      contiguous
    before_loop:
      # Partial sum accumulated by the loop body; merged once in `after_loop`.
      var local_sum = 0.T
    in_loop:
      local_sum += xi + yi
    after_loop:
      # These two emits must stay adjacent and in this order: the pragma
      # applies to the statement that immediately follows it in the generated
      # code. The backticked names are resolved to the Nim symbols `result`
      # and `local_sum`, so renaming the local requires editing the string.
      {.emit: "#pragma omp atomic".}
      {.emit: "`result` += `local_sum`;".}
proc reduction_localsum_system_atomic[T](x, y: Tensor[T]): T =
  ## Returns the sum of every element of `x` plus every element of `y`.
  ##
  ## The loop body adds to a local accumulator; the accumulator is merged
  ## into `result` with `atomicInc` instead of an OpenMP construct.
  ##
  ## NOTE(review): `system.atomicInc` is declared for `int`, so this variant
  ## presumably only instantiates for `T = int` - confirm before using it
  ## with other element types.
  forEachStaged xi in x, yi in y:
    openmp_config:
      use_simd: false
      nowait: true
    iteration_kind:
      contiguous
    before_loop:
      # Partial sum accumulated by the loop body; merged once in `after_loop`.
      var thread_sum = T(0)
    in_loop:
      thread_sum += xi + yi
    after_loop:
      # This requires --threads:on
      atomicInc(result, thread_sum)
proc reduction_padding[T](x, y: Tensor[T]): T =
  ## Returns the sum of every element of `x` plus every element of `y`.
  ##
  ## Instead of a local accumulator plus a synchronized merge, every thread
  ## adds directly into its own slot of a shared `buffer`. Slots are spaced
  ## `padding` elements apart, where `padding` is the number of `T` values
  ## that fit in one L1 data cache line, with the intent that two threads do
  ## not write to the same cache line. The slots are summed sequentially once
  ## the staged loop has finished.
  let cache_line_size = int cpuinfo_get_l1d_caches().line_size
  # Clamp to at least one element per slot. Without this, an element type
  # larger than the reported cache line (or a reported line size of 0) gives
  # `padding == 0`: the buffer would be empty, every write below would be out
  # of range, and `countup` would be handed a zero step.
  let padding = max(1, cache_line_size div sizeof(T))
  var buffer: seq[T]

  forEachStaged xi in x, yi in y:
    openmp_config:
      use_simd: false
      nowait: true
    iteration_kind:
      contiguous
    before_loop:
      # Exactly one thread allocates the shared buffer ...
      omp_master:
        buffer = newSeq[T](omp_get_num_threads() * padding)
      # ... and nobody proceeds until the allocation is done and visible.
      omp_barrier()
      omp_flush(buffer)
    in_loop:
      # Slot of the current thread: thread index times the slot spacing.
      buffer[omp_get_thread_num() * padding] += xi + yi

  # Only the first element of each slot carries data; the rest is padding.
  for idx in countup(0, buffer.len - 1, padding):
    result += buffer[idx]
proc toTensor[T](s: seq[T]): Tensor[T] =
  ## Builds a one-dimensional tensor of shape `[s.len]` holding a copy of
  ## the elements of `s`.
  ##
  ## An empty `s` yields a tensor with shape `[0]` and nothing copied into it.
  var size: int
  # Fills in the tensor metadata for the shape and reports the element count
  # through `size`, which is then used to size the storage.
  initTensorMetadata(result, size, [s.len])
  allocCpuStorage(result.storage, size)
  # `s[0]` does not exist for an empty seq, so taking its address would be an
  # out-of-range index; there is nothing to copy in that case anyway.
  if s.len > 0:
    result.copyFromRaw(s[0].unsafeAddr, s.len)
# Two inputs of 10001 elements each: 1..10001 sums to 50015001 and
# -10000..0 sums to -50005000, so every reduction is expected to print 10001.
let a = toTensor(toSeq(1..10001))
let b = toTensor(toSeq(-10000 .. 0))

# Run each reduction strategy on the same inputs, one result per line.
echo(reduction_localsum_critical(a, b))
echo(reduction_localsum_omp_atomic(a, b))
echo(reduction_localsum_system_atomic(a, b))
echo(reduction_padding(a, b))