forked from etotheipi/CUDA-Image-Processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcudaImageHost.h
271 lines (229 loc) · 7.37 KB
/
cudaImageHost.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#ifndef _CUDA_IMAGE_HOST_H_CU_
#define _CUDA_IMAGE_HOST_H_CU_
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
////////////////////////////////////////////////////////////////////////////////
//
// A very simple class for creating, storing and deleting images in *HOST* RAM
//
////////////////////////////////////////////////////////////////////////////////
// Owning container for a 2-D image held in host memory.
//
// Pixels live in one contiguous buffer and are addressed in row-major order:
// element (r,c) is stored at index r*numCols()+c.  None of the element
// accessors perform bounds checking.
template<typename DTYPE>
class cudaImageHost
{
public:
   // ---- construction, copying, destruction --------------------------------
   cudaImageHost();                                           // empty image
   cudaImageHost(int nRows, int nCols);                       // storage only
   cudaImageHost(DTYPE* data, int nRows, int nCols);          // copy of data
   cudaImageHost(std::string filename, int nRows, int nCols); // read from file
   cudaImageHost(cudaImageHost const & img2);                 // copy of img2
   ~cudaImageHost();

   void operator=(cudaImageHost const & img2);
   bool operator==(cudaImageHost const & img2) const;

   // Change the image dimensions to nRows x nCols
   void resize(int nRows, int nCols);

   // ---- element access (unchecked) ----------------------------------------
   // 2-D access by (row, column)
   DTYPE   operator()(int r, int c) const { return imgData_[r*imgCols_ + c]; }
   DTYPE & operator()(int r, int c)       { return imgData_[r*imgCols_ + c]; }

   // 1-D access by flat element index
   DTYPE   operator[](int e) const { return imgData_[e]; }
   DTYPE & operator[](int e)       { return imgData_[e]; }

   // ---- file and console I/O ----------------------------------------------
   void readFile(std::string filename, int nRows, int nCols);
   void writeFile(std::string filename) const;
   void printImage() const;
   void printMask(char zero='.', char one='H') const;  // 'H' for "Host"

   // ---- size and raw-buffer queries ---------------------------------------
   DTYPE* getDataPtr() const { return imgData_;  }
   int    numRows()    const { return imgRows_;  }
   int    numCols()    const { return imgCols_;  }
   int    numElts()    const { return imgElts_;  }
   int    numBytes()   const { return imgBytes_; }

   // Host-side dilation, intended only for timing comparisons: the library
   // exists so that the GPU can be used instead (the authors quote a
   // 50-200x speed-up).
   void Dilate(cudaImageHost SE, cudaImageHost & target);

private:
   DTYPE* imgData_;    // pixel buffer, NULL when the image owns no storage
   int    imgRows_;    // number of rows
   int    imgCols_;    // number of columns
   int    imgElts_;    // imgRows_ * imgCols_
   int    imgBytes_;   // imgElts_ * sizeof(DTYPE)

   // Low-level storage helpers used by the constructors and resize()
   void Allocate(int nRows, int nCols);
   void Deallocate();
   void MemcpyIn(DTYPE* dataIn);
};
////////////////////////////////////////////////////////////////////////////////
// Records the requested dimensions and acquires a matching pixel buffer.
//
//   nRows, nCols : requested image size.  If either is zero the image simply
//                  owns no storage (imgData_ is NULL, element/byte counts 0).
//
// Any buffer previously referenced by imgData_ is NOT released here; call
// Deallocate() first.  The new buffer comes from malloc(), so its contents
// are uninitialised.
//
// If a dimension is negative, if the byte count would not fit in the int
// bookkeeping fields, or if malloc() fails, an error is written to std::cerr
// and the object is left as a consistent empty (0x0) image instead of
// carrying non-zero sizes with a NULL buffer.
template<class DTYPE>
void cudaImageHost<DTYPE>::Allocate(int nRows, int nCols)
{
   imgData_  = NULL;
   imgRows_  = nRows;
   imgCols_  = nCols;
   imgElts_  = 0;
   imgBytes_ = 0;

   // A zero-sized image is legitimate and needs no storage
   if(nRows == 0 || nCols == 0)
      return;

   bool ok = (nRows > 0 && nCols > 0);
   if(ok)
   {
      // Largest element count whose byte size is still representable as int
      const size_t maxElts = (size_t)INT_MAX / sizeof(DTYPE);
      ok = ((size_t)nRows <= maxElts / (size_t)nCols);
   }

   if(ok)
   {
      imgElts_  = nRows * nCols;
      imgBytes_ = imgElts_ * (int)sizeof(DTYPE);
      imgData_  = (DTYPE*)malloc((size_t)imgBytes_);
      ok = (imgData_ != NULL);
   }

   if(!ok)
   {
      std::cerr << "***ERROR: cudaImageHost could not allocate a "
                << nRows << " x " << nCols << " image" << std::endl;
      imgRows_ = imgCols_ = imgElts_ = imgBytes_ = 0;
   }
}
// Copies imgBytes_ bytes from dataIn into this image's buffer.
//
//   dataIn : source buffer; must hold at least numBytes() bytes.
//
// The call is a no-op when there is nothing to copy (empty image or NULL
// source) or when dataIn already is this image's buffer.  memcpy() has
// undefined behaviour for NULL or overlapping arguments, so those cases are
// filtered out before it is called.
template<class DTYPE>
void cudaImageHost<DTYPE>::MemcpyIn(DTYPE* dataIn)
{
   if(imgData_ == NULL || dataIn == NULL || imgBytes_ <= 0)
      return;

   if(dataIn == imgData_)
      return;

   memcpy(imgData_, dataIn, (size_t)imgBytes_);
}
////////////////////////////////////////////////////////////////////////////////
// Releases the pixel buffer (if any) and returns the object to the empty
// state: NULL data pointer and all size fields zero.
template<class DTYPE>
void cudaImageHost<DTYPE>::Deallocate(void)
{
   // free() accepts NULL and does nothing in that case
   free(imgData_);
   imgData_ = NULL;

   imgBytes_ = 0;
   imgElts_  = 0;
   imgCols_  = 0;
   imgRows_  = 0;
}
// Gives the image the dimensions nRows x nCols.
//
// When the new shape has the same number of elements as the current one the
// existing buffer (and its contents) is kept and only the row/column counts
// change.  Otherwise the old buffer is released and a fresh one is allocated.
template<class DTYPE>
void cudaImageHost<DTYPE>::resize(int nRows, int nCols)
{
   const bool needsNewBuffer = (nRows*nCols != imgElts_);

   if(needsNewBuffer)
   {
      Deallocate();
      Allocate(nRows, nCols);
      return;
   }

   // Same element count: imgElts_ and imgBytes_ are already right
   imgRows_ = nRows;
   imgCols_ = nCols;
}
////////////////////////////////////////////////////////////////////////////////
// Destructor: hands the pixel buffer back through Deallocate().
template<class DTYPE>
cudaImageHost<DTYPE>::~cudaImageHost()
{
   this->Deallocate();
}
////////////////////////////////////////////////////////////////////////////////
// Default constructor: an empty image that owns no storage.
template<class DTYPE>
cudaImageHost<DTYPE>::cudaImageHost()
   : imgData_(NULL),
     imgRows_(0),
     imgCols_(0),
     imgElts_(0),
     imgBytes_(0)
{
}
////////////////////////////////////////////////////////////////////////////////
// Creates an nRows x nCols image.  Only storage is set up here; nothing in
// this constructor assigns pixel values.
template<class DTYPE>
cudaImageHost<DTYPE>::cudaImageHost(int nRows, int nCols)
   : imgData_(NULL),
     imgRows_(0),
     imgCols_(0),
     imgElts_(0),
     imgBytes_(0)
{
   this->Allocate(nRows, nCols);
}
////////////////////////////////////////////////////////////////////////////////
// Creates an nRows x nCols image and fills it with a copy of the caller's
// buffer.  `data` is read as nRows*nCols contiguous elements; the image
// keeps its own copy and never takes ownership of `data`.
template<class DTYPE>
cudaImageHost<DTYPE>::cudaImageHost(DTYPE* data, int nRows, int nCols)
   : imgData_(NULL),
     imgRows_(0),
     imgCols_(0),
     imgElts_(0),
     imgBytes_(0)
{
   this->Allocate(nRows, nCols);
   this->MemcpyIn(data);
}
////////////////////////////////////////////////////////////////////////////////
// Creates an nRows x nCols image and loads its pixels from the named file
// by delegating to readFile().
template<class DTYPE>
cudaImageHost<DTYPE>::cudaImageHost(string filename, int nRows, int nCols)
   : imgData_(NULL),
     imgRows_(0),
     imgCols_(0),
     imgElts_(0),
     imgBytes_(0)
{
   this->Allocate(nRows, nCols);
   this->readFile(filename, nRows, nCols);
}
////////////////////////////////////////////////////////////////////////////////
// Copy constructor: allocates a buffer with the same shape as i2 and copies
// i2's pixels into it, so the two images never share storage.
template<class DTYPE>
cudaImageHost<DTYPE>::cudaImageHost(cudaImageHost const & i2)
   : imgData_(NULL),
     imgRows_(0),
     imgCols_(0),
     imgElts_(0),
     imgBytes_(0)
{
   this->Allocate(i2.imgRows_, i2.imgCols_);
   this->MemcpyIn(i2.imgData_);
}
////////////////////////////////////////////////////////////////////////////////
// Copy assignment: gives this image the shape of i2 and copies i2's pixels.
//
// Self-assignment (a = a) returns immediately; without that guard the copy
// would hand memcpy() identical source and destination buffers.  The pixel
// copy is also skipped when either image has no buffer, so memcpy() is never
// given a NULL pointer.
template<class DTYPE>
void cudaImageHost<DTYPE>::operator=(cudaImageHost const & i2)
{
   if(this == &i2)
      return;

   resize(i2.imgRows_, i2.imgCols_);

   if(imgData_ != NULL && i2.imgData_ != NULL)
      MemcpyIn(i2.imgData_);
}
////////////////////////////////////////////////////////////////////////////////
// Element-wise equality.
//
// Returns true only when both images have the same number of rows and
// columns and every pair of corresponding pixels compares equal with
// DTYPE's operator!=.  The scan stops at the first differing pixel.
template<class DTYPE>
bool cudaImageHost<DTYPE>::operator==(cudaImageHost const & i2) const
{
   if(imgRows_ != i2.imgRows_ || imgCols_ != i2.imgCols_)
      return false;

   for(int e=0; e<imgElts_; e++)
   {
      if(imgData_[e] != i2.imgData_[e])
         return false;
   }

   return true;
}
////////////////////////////////////////////////////////////////////////////////
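// Image data is held in row-major order, and the file is consumed in that same
// order: rows in the outer loop, columns in the inner loop.
// NOTE(review): an earlier version of this comment said files are stored
// col-major and that the loop order is switched to compensate, but the loops
// below are not switched -- confirm the intended file layout.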
// Resizes the image to nRows x nCols and fills it from a text file.
//
//   filename     : path of the file to read
//   nRows, nCols : shape the image is given before reading
//
// Values are extracted with operator>> and stored consecutively, i.e. the
// first nCols values form row 0, the next nCols values row 1, and so on.
//
// If the file cannot be opened, or holds fewer readable values than the image
// needs, a message is written to std::cerr and every pixel that was not read
// is set to DTYPE(), so the image never exposes uninitialised memory.
template<class DTYPE>
void cudaImageHost<DTYPE>::readFile(string filename, int nRows, int nCols)
{
   resize(nRows, nCols);

   // Nothing to fill (empty image, or storage could not be obtained)
   if(imgData_ == NULL)
      return;

   int nRead = 0;
   ifstream is(filename.c_str(), ios::in);
   if(!is.is_open())
   {
      std::cerr << "***ERROR: could not open file for reading: "
                << filename << std::endl;
   }
   else
   {
      // Flat index nRead equals r*imgCols_ + c when walking row by row
      while(nRead < imgElts_ && (is >> imgData_[nRead]))
         nRead++;

      if(nRead < imgElts_)
      {
         std::cerr << "***WARNING: " << filename << " supplied only "
                   << nRead << " of " << imgElts_ << " values" << std::endl;
      }
   }

   // Give every pixel that was not read a defined value
   for(int e=nRead; e<imgElts_; e++)
      imgData_[e] = DTYPE();

   // The stream closes itself when it goes out of scope
}
////////////////////////////////////////////////////////////////////////////////
// Writes the image to a text file, one image row per line, with a single
// space after every value.
//
//   filename : path of the file to create or overwrite
//
// If the file cannot be opened a message is written to std::cerr and nothing
// else happens.  Rows are terminated with '\n' rather than endl so that the
// stream is not flushed once per row; the bytes written are the same.
template<class DTYPE>
void cudaImageHost<DTYPE>::writeFile(string filename) const
{
   ofstream os(filename.c_str(), ios::out);
   if(!os.is_open())
   {
      std::cerr << "***ERROR: could not open file for writing: "
                << filename << std::endl;
      return;
   }

   for(int r=0; r<imgRows_; r++)
   {
      for(int c=0; c<imgCols_; c++)
         os << imgData_[r*imgCols_+c] << " ";
      os << "\n";
   }

   // The stream flushes and closes itself when it goes out of scope
}
////////////////////////////////////////////////////////////////////////////////
// Prints the image to stdout as a two-symbol mask, one image row per line.
//
//   zero : character shown for a pixel whose value, converted to int, is 0
//   one  : character shown for every other pixel
//
// Each symbol is followed by a single space.
template<class DTYPE>
void cudaImageHost<DTYPE>::printMask(char zero, char one) const
{
   for(int row=0; row<imgRows_; ++row)
   {
      const int rowStart = row*imgCols_;
      for(int col=0; col<imgCols_; ++col)
      {
         // The conversion to int is deliberate: it is what decides "zero"
         const int pixel = imgData_[rowStart + col];
         cout << (pixel == 0 ? zero : one) << " ";
      }
      cout << endl;
   }
}
////////////////////////////////////////////////////////////////////////////////
// Prints the pixel values to stdout, one image row per line, with a single
// space after every value -- the same layout writeFile() produces.
//
// Previously a newline was emitted after every pixel, so each row was printed
// as a column of values followed by a blank line.
template<class DTYPE>
void cudaImageHost<DTYPE>::printImage(void) const
{
   for(int r=0; r<imgRows_; r++)
   {
      for(int c=0; c<imgCols_; c++)
      {
         cout << imgData_[r*imgCols_+c] << " ";
      }
      cout << endl;
   }
}
// This is the dumbest, simplest algorithm I could come up with.  There are most
// definitely more efficient ways to implement it.  I just want to get an
// order-of-magnitude timing
// Host-side dilation entry point (kept for timing comparisons).
//
//   SE     : structuring element
//   target : image that receives the result; it is resized to the shape of
//            this image
//
// resize() takes (rows, cols).  The dimensions used to be passed as
// (cols, rows), which gave `target` the transposed shape for any non-square
// image.
//
// NOTE(review): the code here only sizes `target`; no dilation is computed
// and SE is not read.  Confirm whether the body is meant to be filled in.
template<class DTYPE>
void cudaImageHost<DTYPE>::Dilate(cudaImageHost SE, cudaImageHost & target)
{
   (void)SE;   // not used yet; see the note above

   target.resize(imgRows_, imgCols_);
}
#endif