-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvector_fft_floats_multi_local_writeback.cl
61 lines (50 loc) · 1.96 KB
/
vector_fft_floats_multi_local_writeback.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include "cplx.c"
#define N_LOCAL_BUTTERFLIES replace_N_LOCAL_BUTTERFLIES // must be a power of 2
inline void butterfly_with_writeback(int const idx,
__global struct cplx *g, // we will write back the results to this
const __local struct cplx * const v,
int const i,
const __global struct cplx * const twiddle) {
struct cplx const t = cplxMult(v[idx+i], twiddle[idx]);
struct cplx const v0 = v[idx];
g[idx] = cplxAdd(v0, t);
g[idx+i] = cplxSub(v0, t);
}
__kernel void kernel_func(__local struct cplx* output, __global const float *input, __global const struct cplx *twiddle, __global struct cplx *global_output) {
int const k = get_global_id(0);
int const base_idx = k * N_LOCAL_BUTTERFLIES;
for(int j=0; j<2*N_LOCAL_BUTTERFLIES; ++j) {
int const m = 2*base_idx + j;
output[m] = complexFromReal(input[m]);
}
int const n_global_butterflies = get_global_size(0) * N_LOCAL_BUTTERFLIES;
for(int i=1; i<n_global_butterflies; i <<= 1)
{
// For the first iterations, there is no need for a memory barrier
// because we only use memory locations where our thread has written to.
if(i>N_LOCAL_BUTTERFLIES) {
barrier(CLK_LOCAL_MEM_FENCE);
}
for(int j=0; j<N_LOCAL_BUTTERFLIES; ++j)
{
int const m = base_idx + j;
int const tmp = i*(m/i);
int const idx = tmp + m;
//assert(idx+i < Sz);
int const ri = m - tmp;
int const tIdx = ri*(n_global_butterflies/i);
butterfly(output+idx,
i,
twiddle[tIdx]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int j=0; j<N_LOCAL_BUTTERFLIES; ++j)
{
butterfly_with_writeback(base_idx + j,
global_output,
output,
n_global_butterflies,
twiddle);
}
}