-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain_fft_8_floats_stockham.cpp
117 lines (98 loc) · 4.57 KB
/
main_fft_8_floats_stockham.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
constexpr auto kernel_file = "vector_fft_floats_stockham.cl";
int main(void) {
// Create the input vector
std::vector<float> input{2.5f, 9.f, -3.f, 5.f, 10.f, 4.f, 1.f, 7.f};
const unsigned int Sz = input.size(); // is assumed to be a power of 2
auto twiddle = imajuscule::compute_roots_of_unity<float>(Sz);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
CHECK_CL_ERROR(ret);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
CHECK_CL_ERROR(ret);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
CHECK_CL_ERROR(ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
CHECK_CL_ERROR(ret);
auto bufSz = input.size() * sizeof(decltype(input[0]));
// Create memory buffers on the device for each vector
cl_mem input_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
bufSz, NULL, &ret);
cl_mem twiddle_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
bufSz*2, NULL, &ret);
CHECK_CL_ERROR(ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
bufSz*2 /* because these are complex<float>*/, NULL, &ret);
CHECK_CL_ERROR(ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, input_mem_obj, CL_TRUE, 0,
bufSz, input.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, twiddle_mem_obj, CL_TRUE, 0,
twiddle.size()*sizeof(decltype(twiddle[0])), twiddle.data(), 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// Create a program from the kernel source
auto kernel_src = read_kernel(kernel_file);
auto kernel_c_src = kernel_src.c_str();
auto source_size = kernel_src.size();
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&kernel_c_src, (const size_t *)&source_size, &ret);
CHECK_CL_ERROR(ret);
// Build the program
std::string options = std::string("-I ") + src_root() +
" -cl-denorms-are-zero -cl-strict-aliasing -cl-fast-relaxed-math";
ret = clBuildProgram(program, 1, &device_id, options.c_str(), NULL, NULL);
CHECK_CL_ERROR(ret);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "kernel_func", &ret);
CHECK_CL_ERROR(ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&twiddle_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 3, 2*sizeof(float) * 2*input.size(), NULL); // pingpong buffer
CHECK_CL_ERROR(ret);
// Execute the OpenCL kernel on the list
size_t global_item_size = input.size()/2; // the number of butterfly operations per fft level
size_t local_item_size = global_item_size;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// Read the memory buffer output_mem_obj on the device to the local variable output
std::vector<std::complex<float>> output;
output.resize(input.size());
ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
bufSz*2, output.data(), 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// The output produced by the gpu is the same as the output produced by the cpu:
verifyVectorsAreEqual(output, makeRefForwardFft(input));
verifyVectorsAreEqual(cpu_fft_norecursion_stockham(input),makeRefForwardFft(input));
// Clean up
ret = clFlush(command_queue);
CHECK_CL_ERROR(ret);
ret = clFinish(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseKernel(kernel);
CHECK_CL_ERROR(ret);
ret = clReleaseProgram(program);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(twiddle_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(output_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseCommandQueue(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseContext(context);
CHECK_CL_ERROR(ret);
return 0;
}