forked from NVIDIA-Korea/CUDA
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprofile_step1.cu
97 lines (74 loc) · 1.95 KB
/
profile_step1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* compile command
compile : nvcc -arch=sm_35 profile_step1.cu -o a.out
profile : nvprof -o log1.o ./a.out
view    : nvvp log1.o
*/
#include <cstdio>
// Fills x with descending values: x[i] = n - i (x[0] = n, ..., x[n-1] = 1).
// Expects a 1D launch with at least n total threads; excess threads exit
// via the bounds guard.
__global__ void init_data_kernel( int n, double* x){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) return;
    x[idx] = n - idx;
}
// DAXPY: y[i] = a * x[i] + y[i], elementwise over n doubles.
// 1D launch; the guard handles the grid tail when gridDim*blockDim > n.
__global__ void daxpy_kernel(int n, double a, double * x, double * y){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) return;
    y[idx] = a * x[idx] + y[idx];
}
// Device-side verification: prints a message for every element of x that
// differs from correctvalue. Device printf is serialized and slow — this is
// a debugging aid, not a production check.
__global__ void check_results_kernel( int n, double correctvalue, double * x ){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) return;
    if (x[idx] != correctvalue) {
        printf("ERROR at index = %d, expected = %f, actual: %f\n", idx, correctvalue, x[idx]);
    }
}
// Fills the host buffer x with ascending values: x[i] = i.
void init_host_data( int n, double * x ){
    int idx = 0;
    while (idx < n) {
        x[idx] = static_cast<double>(idx);
        ++idx;
    }
}
// Uploads x to x_d on a copy stream while initializing y_d with
// init_data_kernel on a separate compute stream, overlapping the H2D
// transfer with the kernel. Blocks until both streams finish.
// x must be pinned (cudaMallocHost) for cudaMemcpyAsync to truly overlap.
void init_data(int n, double* x, double* x_d, double* y_d){
    cudaStream_t copy_stream;
    cudaStream_t compute_stream;
    // Fixed: source had HTML-mangled "cudaStreamCreate(©_stream)" —
    // "&copy;" entity corruption of "&copy_stream" — which does not compile.
    cudaStreamCreate(&copy_stream);
    cudaStreamCreate(&compute_stream);
    cudaMemcpyAsync( x_d, x, n*sizeof(double), cudaMemcpyDefault, copy_stream );
    // Fixed: ceil(n/256) truncates first (integer division), under-launching
    // whenever 256 does not divide n and leaving the tail of y_d
    // uninitialized. (n + 255) / 256 is the correct ceiling divide.
    init_data_kernel<<<(n + 255) / 256, 256, 0, compute_stream>>>(n, y_d);
    cudaStreamSynchronize(copy_stream);
    cudaStreamSynchronize(compute_stream);
    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(copy_stream);
}
// Computes y_d = a * x_d + y_d on the device (default stream) and blocks
// until the kernel completes.
void daxpy(int n, double a, double* x_d, double* y_d){
    // Fixed: ceil(n/256) truncated first (integer division), skipping the
    // tail elements when 256 does not divide n. Use a proper ceiling divide.
    daxpy_kernel<<<(n + 255) / 256, 256>>>(n, a, x_d, y_d);
    cudaDeviceSynchronize();
}
// Launches the device-side check that every element of x_d equals
// correctvalue; mismatches print via device printf. The launch is
// asynchronous — output/errors surface at the caller's next sync.
void check_results( int n, double correctvalue, double* x_d ){
    // Fixed: ceil(n/256) truncated first (integer division), leaving the
    // tail elements unchecked when 256 does not divide n.
    check_results_kernel<<<(n + 255) / 256, 256>>>(n, correctvalue, x_d);
}
// End-to-end driver: allocates pinned host and device buffers of n doubles,
// fills x[i] = i on the host and y[i] = n - i on the device (overlapping
// copy with compute), runs daxpy with a = 1.0, and verifies every element
// of y equals n (since i + (n - i) = n).
void run_test(int n){
    double* x;
    double* x_d;
    double* y_d;
    cudaSetDevice(0);
    // Pinned host memory: required for the async H2D copy in init_data
    // to overlap with the init kernel.
    cudaMallocHost((void**) &x, n*sizeof(double));
    cudaMalloc((void**)&x_d, n*sizeof(double));
    cudaMalloc((void**)&y_d, n*sizeof(double));
    init_host_data(n, x);
    init_data(n, x, x_d, y_d);
    daxpy(n, 1.0, x_d, y_d);
    check_results(n, n, y_d);
    // Fixed: synchronize BEFORE freeing. The original freed y_d while the
    // asynchronous check_results_kernel could still be reading it, relying
    // on cudaFree's implicit synchronization — fragile ordering.
    cudaDeviceSynchronize();
    cudaFree(y_d);
    cudaFree(x_d);
    cudaFreeHost(x);
}
// Entry point: run the daxpy pipeline on 2^22 elements (32 MiB per buffer).
int main(){
    const int n = 1 << 22;
    run_test(n);
    return 0;
}