-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_add_cuda.cu
94 lines (78 loc) · 2.95 KB
/
vector_add_cuda.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <exception>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
// Element-wise integer addition: result[i] = a[i] + b[i] for every i in [0, count).
//
// a, b    input arrays, each readable for at least `count` elements
// result  output array, writable for at least `count` elements
// count   number of elements to process; nothing is written when count <= 0
//
// Indexing uses only the x dimension of the grid and the block. Each thread
// starts at its global x index and advances by the total number of launched
// threads, so every element is covered whatever grid size the caller picks.
// Index arithmetic is done in 64 bits so that a large grid cannot wrap the
// index into a negative value that would slip past the bounds check.
__global__ void add_kernel(const int* a, const int* b, int* result, int count) {
  const long long total = count;
  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
  const long long first =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (long long i = first; i < total; i += stride) {
    result[i] = a[i] + b[i];
  }
}
namespace add_detail {

// Throws std::runtime_error naming `what` when `status` is not cudaSuccess.
inline void check(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    throw std::runtime_error(std::string(what) + ": " +
                             cudaGetErrorString(status));
  }
}

// Owns one device allocation of `count` ints and releases it with cudaFree
// when it goes out of scope, including when an exception unwinds the stack.
class device_ints {
 public:
  explicit device_ints(std::size_t count) {
    check(cudaMalloc(&ptr_, count * sizeof(int)), "cudaMalloc");
  }
  ~device_ints() { (void)cudaFree(ptr_); }

  device_ints(const device_ints&) = delete;
  device_ints& operator=(const device_ints&) = delete;

  int* get() const { return ptr_; }

 private:
  int* ptr_ = nullptr;
};

}  // namespace add_detail

// Adds two host vectors element-wise on the device.
//
// Only the common index range is processed: the result has
// min(a.size(), b.size()) elements, and is empty when either input is empty.
//
// Throws std::length_error when the common size does not fit in the `int`
// element count taken by add_kernel, and std::runtime_error when any CUDA
// call or the kernel launch reports an error.
std::vector<int> add(const std::vector<int>& a, const std::vector<int>& b) {
  const std::size_t count = std::min(a.size(), b.size());
  if (count == 0) return {};

  // add_kernel takes its element count as int; refuse sizes that would be
  // silently truncated by the conversion.
  if (count > static_cast<std::size_t>(std::numeric_limits<int>::max())) {
    throw std::length_error("add: too many elements for a single kernel launch");
  }

  const std::size_t byte_size = count * sizeof(int);
  std::vector<int> result(count);

  // Device allocations (freed automatically on every exit path).
  add_detail::device_ints device_a(count);
  add_detail::device_ints device_b(count);
  add_detail::device_ints device_result(count);

  // Copy host vectors to device.
  add_detail::check(cudaMemcpy(device_a.get(), a.data(), byte_size,
                               cudaMemcpyHostToDevice),
                    "cudaMemcpy (a, host to device)");
  add_detail::check(cudaMemcpy(device_b.get(), b.data(), byte_size,
                               cudaMemcpyHostToDevice),
                    "cudaMemcpy (b, host to device)");

  // Number of threads in each thread block.
  constexpr unsigned int block_size = 256;
  // Number of thread blocks in the grid. Integer ceil-division: going through
  // float loses precision above 2^24 elements and can drop the tail.
  const auto grid_size =
      static_cast<unsigned int>((count + block_size - 1) / block_size);

  // Execute the kernel; launch-configuration errors only show up through
  // cudaGetLastError().
  add_kernel<<<grid_size, block_size>>>(device_a.get(), device_b.get(),
                                        device_result.get(),
                                        static_cast<int>(count));
  add_detail::check(cudaGetLastError(), "add_kernel launch");

  // Copy result back to host. This blocking copy also reports errors raised
  // while the kernel was executing.
  add_detail::check(cudaMemcpy(result.data(), device_result.get(), byte_size,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy (result, device to host)");

  return result;
}
// Builds a vector holding the integers 0, 1, ..., size - 1 in shuffled order.
//
// size  number of elements to generate; 0 yields an empty vector
//
// All calls share one default-seeded engine: consecutive calls produce
// different permutations, while the sequence of permutations is the same on
// every program run. Because the engine is shared mutable state, concurrent
// calls from several threads are not safe.
std::vector<int> make_dataset(std::size_t size) {
  // std::random_shuffle was removed in C++17; std::shuffle with an explicit
  // engine is its replacement.
  static std::mt19937 engine;

  std::vector<int> dataset(size);
  std::iota(dataset.begin(), dataset.end(), 0);
  std::shuffle(dataset.begin(), dataset.end(), engine);
  return dataset;
}
// Command-line driver: adds two shuffled datasets of <items count> elements
// with add() and checks the outcome against a host-side reference sum.
//
// Exit status is 0 when every element matches, and 1 on a usage error, an
// invalid count, a runtime failure, or the first mismatching element.
int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cout << "Usage: " << argv[0] << " <items count>" << std::endl;
    return 1;
  }

  // Dataset values run from 0 to count - 1, so the largest possible sum is
  // 2 * (count - 1); cap count so that this still fits in an int.
  constexpr long long max_count =
      static_cast<long long>(std::numeric_limits<int>::max()) / 2 + 1;

  long long count = 0;
  try {
    count = std::stoll(argv[1]);
  } catch (const std::exception&) {
    // std::stoll throws on non-numeric or out-of-range text.
    std::cerr << "invalid items count: " << argv[1] << std::endl;
    return 1;
  }
  if (count < 0 || count > max_count) {
    std::cerr << "items count must be between 0 and " << max_count
              << std::endl;
    return 1;
  }

  try {
    const auto size = static_cast<std::size_t>(count);
    const auto a = make_dataset(size);
    const auto b = make_dataset(size);
    const auto result = add(a, b);

    // Correctness check against a host-side reference.
    std::vector<int> expected(size);
    std::transform(std::begin(a), std::end(a), std::begin(b),
                   std::begin(expected), std::plus<int>{});

    // std::mismatch below walks `expected` for result.size() elements, so the
    // two ranges must have the same length.
    if (result.size() != expected.size()) {
      std::cerr << "result size " << result.size() << " != expected size "
                << expected.size() << std::endl;
      return 1;
    }

    const auto diff = std::mismatch(std::begin(result), std::end(result),
                                    std::begin(expected));
    const auto result_it = diff.first;
    const auto expected_it = diff.second;
    if (result_it != std::end(result)) {
      const auto diff_idx = std::distance(std::begin(result), result_it);
      std::cerr << "diff at index " << diff_idx << ": " << *result_it
                << " != " << *expected_it << std::endl;
      return 1;
    }
  } catch (const std::exception& error) {
    // Covers allocation failures and any error reported by add().
    std::cerr << "error: " << error.what() << std::endl;
    return 1;
  }

  return 0;
}