-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmm.cu
68 lines (49 loc) · 1.6 KB
/
mm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include <iostream>
#include <cuda_runtime.h>
#define N 512
__global__ void matrixMultiply(float *A, float *B, float *C) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
for (int i = 0; i < N; i++) {
sum += A[row * N + i] * B[i * N + col];
}
C[row * N + col] = sum;
}
int main() {
float *h_A, *h_B, *h_C;
float *d_A, *d_B, *d_C;
h_A = new float[N * N];
h_B = new float[N * N];
h_C = new float[N * N];
for (int i = 0; i < N * N; i++) {
h_A[i] = 1.0f;
h_B[i] = 2.0f;
}
cudaMalloc(&d_A, N * N * sizeof(float));
cudaMalloc(&d_B, N * N * sizeof(float));
cudaMalloc(&d_C, N * N * sizeof(float));
cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, N * N * sizeof(float), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / 16, N / 16);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matrixMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "Kernel ran for " << milliseconds << " milliseconds" << std::endl;
cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);
// Print or further process result matrix h_C
delete[] h_A;
delete[] h_B;
delete[] h_C;
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
return 0;
}