// simpleMul.cu
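//
// A simple CUDA matrix-multiplication example: the program fills a
// height x width matrix with random values, computes the product of its
// transpose with itself (Nd^T * Nd) on the GPU, and reports the kernel
// time measured with CUDA events.
//
// Example invocation (illustrative; the sizes are arbitrary):
//   nvcc -o simpleMul simpleMul.cu
//   ./simpleMul 1024 1024        <height> <width>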
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cuda.h>
#define TILE_WIDTH 16
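// Report the most recent CUDA runtime error (with file and line) and abort.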
#define cudaCheckError() { \
    cudaError_t e = cudaGetLastError(); \
    if (e != cudaSuccess) { \
        printf("CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(e)); \
        exit(1); \
    } \
}
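// One thread per output element: each thread of the TILE_WIDTH x TILE_WIDTH
// blocks computes a single entry of Pd = Nd^T * Nd, where Nd is a
// height x width matrix in row-major order and Pd is width x width.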
__global__ void SimpleMulKernel (float *Nd, float *Pd, int width, int height)
{
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    if (Row < width && Col < width)
    {
        float Pvalue = 0;
        // dot product of columns Row and Col of Nd: (Nd^T * Nd)[Row][Col]
        for (int k = 0; k < height; k++)
            Pvalue += Nd[k*width+Row] * Nd[k*width+Col];
        Pd[Row*width+Col] = Pvalue;
    }
}
int main(int argc, char* argv[])
{
    float *A_h, *C_h;
    float *A_d, *C_d;
    int i, width, height, size_A, size_C;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    srand(time(NULL));
    if (argc != 3)
    {
        printf("Usage: %s <height> <width>\n", argv[0]);
        return -1;
    }
    height = atoi(argv[1]);
    width = atoi(argv[2]);
    size_A = width * height * sizeof(float);
    // the kernel computes Nd^T * Nd, so the result matrix is width x width
    size_C = width * width * sizeof(float);
    // memory allocation for host matrices
    A_h = (float *)malloc(size_A);
    C_h = (float *)malloc(size_C);
    if ((A_h == NULL) || (C_h == NULL))
    {
        printf("Could not allocate memory.\n");
        return -2;
    }
    // initialize the input matrix with random values in [0, 0.99]
    for (i = 0; i < width*height; i++) {
        A_h[i] = (rand() % 100) / 100.00;
    }
    // memory allocation of device matrices
    cudaMalloc((void**) &A_d, size_A); cudaCheckError();
    cudaMalloc((void**) &C_d, size_C); cudaCheckError();
    // copy host input matrix to the device
    cudaMemcpy(A_d, A_h, size_A, cudaMemcpyHostToDevice); cudaCheckError();
    // grid and block dimensions: (n-1)/TILE_WIDTH + 1 rounds up so the
    // width x width output is fully covered even when width is not a
    // multiple of TILE_WIDTH
    dim3 dimGrid(((width-1)/TILE_WIDTH)+1, ((width-1)/TILE_WIDTH)+1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    cudaEventRecord(start);
    // launch the multiplication kernel
    SimpleMulKernel<<<dimGrid, dimBlock>>>(A_d, C_d, width, height);
    cudaCheckError();
    cudaEventRecord(stop);
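    // Note: the stop event is recorded in stream order immediately after the
    // kernel, so the elapsed time below measures only kernel execution; the
    // cudaMemcpy that follows also waits for the kernel to finish.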
    // copy device results back to the host
    cudaMemcpy(C_h, C_d, size_C, cudaMemcpyDeviceToHost); cudaCheckError();
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Milliseconds: %f\n", milliseconds);
    // free device and host memory
    cudaFree(A_d); cudaCheckError();
    cudaFree(C_d); cudaCheckError();
    free(A_h); free(C_h);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}