-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampleFp16.cu
118 lines (97 loc) · 3.03 KB
/
sampleFp16.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <cuda_fp16.h>

// clamp x to range [a, b]
// Float overload: saturates x to [a, b] and narrows the result to one byte.
// The upper bound b is applied before the lower bound a. The final
// float -> unsigned char conversion discards any fractional part.
__device__ unsigned char clamp(float x, float a, float b)
{
    const float capped  = fminf(b, x);       // never above b
    const float bounded = fmaxf(a, capped);  // never below a
    return static_cast<unsigned char>(bounded);
}
// Integer overload: limits x to [a, b]. The upper bound b is applied first and
// the lower bound a second, so a is returned when the bounds are inverted.
__device__ int clamp(int x, int a, int b)
{
    const int capped = (b < x) ? b : x;
    return (a > capped) ? a : capped;
}
// Packs a floating-point RGB triple into a single int, one byte per channel.
// Every channel is first clamped to [0, 255] by the float clamp overload.
// Bit layout of the result: blue in bits 16-23, green in bits 8-15,
// red in bits 0-7.
__device__ int rgbToInt(float r, float g, float b)
{
    const int red   = clamp(r, 0.0f, 255.0f);
    const int green = clamp(g, 0.0f, 255.0f);
    const int blue  = clamp(b, 0.0f, 255.0f);

    return (blue << 16) | (green << 8) | red;
}
// Multiplies every pixel of a 3-byte-per-pixel image by a per-pixel gain that
// is stored as a raw 16-bit half-precision bit pattern.
//
//   g_odata   : output image, 3 bytes per pixel
//   g_indata  : one 16-bit half bit pattern per pixel (the gain)
//   imageData : input image, 3 bytes per pixel (locals b, g, r are read from
//               byte offsets 0, 1, 2 of each pixel)
//   imgw      : row length in pixels; also used as the row stride
//
// Expects a 2D launch: x indexes the column, y indexes the row. Threads whose
// column is outside [0, imgw) exit without touching memory. The image height
// is not passed to the kernel, so the launch must not cover rows beyond the
// end of the buffers.
// Requires <cuda_fp16.h>; __ushort_as_half is available in CUDA 9 and later.
__global__ void
cudaProcessHalf(unsigned char *g_odata, short *g_indata, unsigned char* imageData, int imgw)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail guard: when the grid is rounded up past the row width, the
    // surplus threads would otherwise alias pixels of the following row and
    // run off the end of the buffers on the last row.
    if (x >= imgw)
        return;

    // Index in size_t so that large images cannot overflow 32-bit arithmetic.
    const size_t pixel = static_cast<size_t>(y) * imgw + x;
    const size_t base  = pixel * 3;

    // The buffer holds half *bit patterns*. Reinterpret the bits explicitly;
    // handing an integer straight to __half2float would convert its numeric
    // value instead (bits 0x3C00 would become 15360.0f rather than 1.0f).
    const unsigned short bits = static_cast<unsigned short>(g_indata[pixel]);
    const float gain = __half2float(__ushort_as_half(bits));

    // Read all three channels before writing so that in-place operation
    // (imageData == g_odata) stays well defined.
    const float b = imageData[base    ];
    const float g = imageData[base + 1];
    const float r = imageData[base + 2];

    g_odata[base    ] = clamp(b * gain, 0.0f, 255.0f);
    g_odata[base + 1] = clamp(g * gain, 0.0f, 255.0f);
    g_odata[base + 2] = clamp(r * gain, 0.0f, 255.0f);
}
// C-linkage host entry point that launches cudaProcessHalf with the supplied
// grid/block shape and sbytes of dynamic shared memory. No stream is given in
// the launch configuration, and no launch error is checked here.
extern "C" void
launchCudaProcessHalf(dim3 grid, dim3 block, int sbytes,
                      short *gain,
                      unsigned char *imageInput,
                      unsigned char *imageOutput,
                      int imgw)
{
    const size_t dynamicSharedBytes = sbytes;

    // Kernel parameter order is (output, gain, input, width).
    cudaProcessHalf<<<grid, block, dynamicSharedBytes>>>(
        imageOutput, gain, imageInput, imgw);
}
// Multiplies every pixel of a 3-byte-per-pixel image by a per-pixel
// single-precision gain.
//
//   g_odata   : output image, 3 bytes per pixel
//   g_indata  : one float gain per pixel
//   imageData : input image, 3 bytes per pixel (locals b, g, r are read from
//               byte offsets 0, 1, 2 of each pixel)
//   imgw      : row length in pixels; also used as the row stride
//
// Expects a 2D launch: x indexes the column, y indexes the row. Threads whose
// column is outside [0, imgw) exit without touching memory. The image height
// is not passed to the kernel, so the launch must not cover rows beyond the
// end of the buffers.
__global__ void
cudaProcessFloat(unsigned char *g_odata, float *g_indata, unsigned char* imageData, int imgw)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail guard: when the grid is rounded up past the row width, the
    // surplus threads would otherwise alias pixels of the following row and
    // run off the end of the buffers on the last row.
    if (x >= imgw)
        return;

    // Index in size_t so that large images cannot overflow 32-bit arithmetic.
    const size_t pixel = static_cast<size_t>(y) * imgw + x;
    const size_t base  = pixel * 3;

    const float gain = g_indata[pixel];

    // Read all three channels before writing so that in-place operation
    // (imageData == g_odata) stays well defined.
    const float b = imageData[base    ];
    const float g = imageData[base + 1];
    const float r = imageData[base + 2];

    g_odata[base    ] = clamp(b * gain, 0.0f, 255.0f);
    g_odata[base + 1] = clamp(g * gain, 0.0f, 255.0f);
    g_odata[base + 2] = clamp(r * gain, 0.0f, 255.0f);
}
// C-linkage host entry point that launches cudaProcessFloat with the supplied
// grid/block shape and sbytes of dynamic shared memory. No stream is given in
// the launch configuration, and no launch error is checked here.
extern "C" void
launchCudaProcessFloat(dim3 grid, dim3 block, int sbytes,
                       float *gain,
                       unsigned char *imageInput,
                       unsigned char *imageOutput,
                       int imgw)
{
    const size_t dynamicSharedBytes = sbytes;

    // Kernel parameter order is (output, gain, input, width).
    cudaProcessFloat<<<grid, block, dynamicSharedBytes>>>(
        imageOutput, gain, imageInput, imgw);
}
// Multiplies every pixel of a 3-byte-per-pixel image by a per-pixel gain that
// is stored as one byte and divided by 255 before use (byte 255 -> gain 1.0).
//
//   g_odata   : output image, 3 bytes per pixel
//   g_indata  : one gain byte per pixel
//   imageData : input image, 3 bytes per pixel (locals b, g, r are read from
//               byte offsets 0, 1, 2 of each pixel)
//   imgw      : row length in pixels; also used as the row stride
//
// Expects a 2D launch: x indexes the column, y indexes the row. Threads whose
// column is outside [0, imgw) exit without touching memory. The image height
// is not passed to the kernel, so the launch must not cover rows beyond the
// end of the buffers.
__global__ void
cudaProcessByte(unsigned char *g_odata, unsigned char *g_indata, unsigned char* imageData, int imgw)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail guard: when the grid is rounded up past the row width, the
    // surplus threads would otherwise alias pixels of the following row and
    // run off the end of the buffers on the last row.
    if (x >= imgw)
        return;

    // Index in size_t so that large images cannot overflow 32-bit arithmetic.
    const size_t pixel = static_cast<size_t>(y) * imgw + x;
    const size_t base  = pixel * 3;

    const float gain = (float)g_indata[pixel] / 255.0f;

    // Read all three channels before writing so that in-place operation
    // (imageData == g_odata) stays well defined.
    const float b = imageData[base    ];
    const float g = imageData[base + 1];
    const float r = imageData[base + 2];

    g_odata[base    ] = clamp(b * gain, 0.0f, 255.0f);
    g_odata[base + 1] = clamp(g * gain, 0.0f, 255.0f);
    g_odata[base + 2] = clamp(r * gain, 0.0f, 255.0f);
}
// C-linkage host entry point that launches cudaProcessByte with the supplied
// grid/block shape and sbytes of dynamic shared memory. No stream is given in
// the launch configuration, and no launch error is checked here.
extern "C" void
launchCudaProcessByte(dim3 grid, dim3 block, int sbytes,
                      unsigned char *gain,
                      unsigned char *imageInput,
                      unsigned char *imageOutput,
                      int imgw)
{
    const size_t dynamicSharedBytes = sbytes;

    // Kernel parameter order is (output, gain, input, width).
    cudaProcessByte<<<grid, block, dynamicSharedBytes>>>(
        imageOutput, gain, imageInput, imgw);
}