-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathBlender.cu
83 lines (73 loc) · 2.42 KB
/
Blender.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
//#include <device_launch_parameters.h>
#include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/saturate_cast.hpp>
#include "MyBlender.h"
using namespace std;
namespace cv {
namespace gpu {
namespace device {
// Copies the masked pixels of a 3-channel image into a destination buffer.
//
// Each thread handles one pixel (x, y) computed from its 2D block/thread
// index. Threads that fall outside width x height, or whose mask byte is
// zero, do nothing, so the grid may be larger than the image.
//
// height, width : image size in pixels.
// dst           : first destination element (callers pre-offset it to the
//                 target position); 3 elements of type T per pixel.
// src           : first source element; 3 elements of type T per pixel.
// mask          : one byte per pixel; non-zero means "copy this pixel".
// dStep, sStep, mStep : row pitches in BYTES (callers pass the matrices'
//                 `step` fields). Rows are therefore addressed through byte
//                 pointers; indexing a T* with a byte pitch would land on
//                 the wrong row whenever sizeof(T) > 1 (e.g. T = short).
template<typename T>
__global__ void kernelFeed(int height, int width, T *dst, const T *src, const uchar *mask, int dStep, int sStep, int mStep)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail before touching any memory.
    if (x >= width || y >= height)
        return;

    // Widen the row offset so y * pitch cannot overflow int on large buffers.
    const size_t row = static_cast<size_t>(y);

    if (!mask[row * mStep + x])
        return;

    // Apply the byte pitch on byte pointers, then go back to element type.
    T *dstRow = reinterpret_cast<T *>(
        reinterpret_cast<unsigned char *>(dst) + row * dStep);
    const T *srcRow = reinterpret_cast<const T *>(
        reinterpret_cast<const unsigned char *>(src) + row * sStep);

    const int c = x * 3; // index of the pixel's first channel within its row
    dstRow[c]     = srcRow[c];
    dstRow[c + 1] = srcRow[c + 1];
    dstRow[c + 2] = srcRow[c + 2];
}
// Launches kernelFeed for element type T over the whole of `image` and waits
// for it to finish, reporting launch and execution errors via cudaSafeCall.
template<typename T>
static void launchKernelFeed(const gpu::GpuMat &image, const gpu::GpuMat &mask, gpu::GpuMat &dst, int dx, int dy)
{
    const dim3 threads(16, 16); // 256 threads per block
    // Round the grid up so images whose size is not a multiple of the block
    // size still get their right/bottom edge processed; the kernel's bounds
    // check discards the surplus threads.
    const dim3 blocks((image.cols + threads.x - 1) / threads.x,
                      (image.rows + threads.y - 1) / threads.y);

    kernelFeed<T><<<blocks, threads>>>(image.rows, image.cols,
        dst.ptr<T>(dy) + dx * 3, image.ptr<T>(), mask.ptr<uchar>(),
        static_cast<int>(dst.step), static_cast<int>(image.step), static_cast<int>(mask.step));

    cudaSafeCall(cudaGetLastError());      // invalid launch configuration etc.
    cudaSafeCall(cudaDeviceSynchronize()); // faults raised while the kernel ran
}

// Copies the pixels of `image` selected by `mask` into `dst`, with the
// image's top-left corner placed at column dx, row dy of `dst`.
//
// image : device matrix of type CV_8UC3 or CV_16SC3. Any other type is
//         silently ignored.
// mask  : device matrix read as one byte per pixel; non-zero selects a pixel.
//         NOTE(review): assumed to be at least as large as `image` - not
//         checked here.
// dst   : device matrix written in place. NOTE(review): assumed to have the
//         same type as `image` and to contain the rectangle
//         (dx, dy, image.cols, image.rows) - not checked here.
//
// Blocks until the copy has completed on the device.
void cudaFeed(const gpu::GpuMat &image, const gpu::GpuMat &mask, gpu::GpuMat &dst, int dx, int dy)
{
    // Nothing to copy, and a grid with a zero dimension is an invalid launch.
    if (image.empty())
        return;

    switch (image.type())
    {
    case CV_8UC3:
        launchKernelFeed<uchar>(image, mask, dst, dx, dy);
        break;
    case CV_16SC3:
        launchKernelFeed<short>(image, mask, dst, dx, dy);
        break;
    default:
        // Unsupported element types are left untouched.
        break;
    }
}
// Host-input convenience overload: uploads `image` and `mask` to the device
// and forwards to the GpuMat overload, which writes into `dst` at (dx, dy).
void cudaFeed(const Mat &image, const Mat &mask, gpu::GpuMat &dst, int dx, int dy)
{
    // Constructing a GpuMat from a Mat performs the (blocking) upload.
    const gpu::GpuMat deviceImage(image);
    const gpu::GpuMat deviceMask(mask);

    cudaFeed(deviceImage, deviceMask, dst, dx, dy);
}
// Host-only overload: copies the pixels of `image` selected by `mask` into
// the host matrix `dst`, with the image's top-left corner at column dx,
// row dy of `dst`.
//
// Mat data lives in host memory and must not be handed to a kernel, so the
// inputs and the affected region of `dst` are staged through device
// matrices, the GpuMat overload does the copy, and the result is downloaded
// back into `dst`.
//
// Taking the destination rectangle as a Mat ROI makes OpenCV reject a
// rectangle that does not fit inside `dst`.
void cudaFeed(const Mat &image, const Mat &mask, Mat &dst, int dx, int dy)
{
    if (image.empty())
        return; // nothing to copy

    // Header onto the part of dst that the image covers (shares dst's data).
    Mat dstRoi = dst(Rect(dx, dy, image.cols, image.rows));

    gpu::GpuMat gpuImg;
    gpu::GpuMat gpuMask;
    gpu::GpuMat gpuDst;
    gpuImg.upload(image);
    gpuMask.upload(mask);
    // Upload the current contents so unmasked pixels keep their values.
    gpuDst.upload(dstRoi);

    // The staged destination is exactly the target rectangle: no offset.
    cudaFeed(gpuImg, gpuMask, gpuDst, 0, 0);

    // Same size and type as dstRoi, so this writes into dst's own storage.
    gpuDst.download(dstRoi);
}
} // namespace device
} // namespace gpu
} // namespace cv;