-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathimage-mat.cu
47 lines (41 loc) · 2.76 KB
/
image-mat.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include "image-mat.h"
#include <cmath>
#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
using namespace std;
#define CAFFE_CUDA_NUM_THREADS (1024)
#define CAFFE_GET_BLOCKS(n) (((n) + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS)
template <typename T> __global__ void image2MatGpuKernel(const int n, const T* dataImage, const int inputNum, const int height, const int width, const int kernelLength, const int channelNum, const int pad, const int stride, const int nextLayerHeight, const int nextLayerWidth, T* dataCol) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += blockDim.x * gridDim.x) {
int widthOffsetInOutput = index % nextLayerWidth;
int heightOffsetInOutput = (index / nextLayerWidth) % nextLayerHeight;
int imageIdx = index / nextLayerWidth / nextLayerHeight % inputNum;
int filterIdx = index / nextLayerWidth / nextLayerHeight / inputNum;
int filterOffsetInOutput = filterIdx * kernelLength * kernelLength;
int heightOffsetInInput = heightOffsetInOutput * stride - pad;
int widthOffsetInInput = widthOffsetInOutput * stride - pad;
dataCol += ((filterOffsetInOutput * inputNum + imageIdx) * nextLayerHeight + heightOffsetInOutput) * nextLayerWidth + widthOffsetInOutput;
dataImage += ((imageIdx * channelNum + filterIdx) * height + heightOffsetInInput) * width + widthOffsetInInput;
for (int i = 0; i < kernelLength; ++i) {
for (int j = 0; j < kernelLength; ++j) {
int h = heightOffsetInInput + i;
int w = widthOffsetInInput + j;
*dataCol = (0 <= h && 0 <= w && w < width && h < height) ? dataImage[i * width + j] : 0;
dataCol += nextLayerHeight * nextLayerWidth * inputNum;
}
}
}
}
template <typename T> void image2MatGpu(const T* dataImage, const int inputNum, const int channelNum, const int height, const int width, const int kernelLength, const int pad, const int stride, T* dataCol, cudaStream_t stream) {
// We are going to launch channelNum * nextLayerHeight * nextLayerWidth kernels, each kernel responsible for copying a single-channel grid.
int nextLayerHeight = (height + 2 * pad - kernelLength) / stride + 1;
int nextLayerWidth = (width + 2 * pad - kernelLength) / stride + 1;
int kernelNum = inputNum * channelNum * nextLayerHeight * nextLayerWidth;
image2MatGpuKernel<T><<<CAFFE_GET_BLOCKS(kernelNum), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(kernelNum, dataImage, inputNum, height, width, kernelLength, channelNum, pad, stride, nextLayerHeight, nextLayerWidth, dataCol);
if (cudaGetLastError() != cudaSuccess) {
throw runtime_error("Device kernel failed");
}
}
template void image2MatGpu<float>(const float*, const int, const int, const int, const int, const int, const int, const int, float*, cudaStream_t);