coalescing_stride_bench.cu
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>

#define BLOCK_SIZE 32     // threads per block
#define BENCH_STEPS 100   // timed kernel launches per stride
#define MAX_STRIDE 15     // strides tested: 2^0 .. 2^MAX_STRIDE
#define BLOCKS (84*10)    // grid size (presumably 10 blocks per SM on an 84-SM GPU)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define ASSERT(cond, msg, args...) assert((cond) || !fprintf(stderr, (msg "\n"), args))
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code != cudaSuccess)
  {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}
void clear_l2() {
  // Get actual L2 size via CUDA on first call of this function
  static int l2_clear_size = 0;
  static unsigned char* gpu_scratch_l2_clear = NULL;
  if (!gpu_scratch_l2_clear) {
    cudaDeviceGetAttribute(&l2_clear_size, cudaDevAttrL2CacheSize, 0);
    l2_clear_size *= 2; // just to be extra safe (cache is not necessarily strict LRU)
    gpuErrchk(cudaMalloc(&gpu_scratch_l2_clear, l2_clear_size));
  }
  // Clear L2 cache (this is run on every call unlike the above code)
  gpuErrchk(cudaMemset(gpu_scratch_l2_clear, 0, l2_clear_size));
}
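
// Each thread copies a single float at index (global thread id) * stride.
// With stride 1 a warp touches 32 consecutive floats, so its loads and stores
// coalesce into the minimum number of memory transactions; larger strides spread
// the same 32 accesses across more cache lines, wasting DRAM bandwidth.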
__global__ void copy(long n, float* in, float* out, int stride)
{
  // n is long because the arrays hold 2^31 elements, which overflows int
  unsigned long i = (blockIdx.x*blockDim.x + threadIdx.x)*stride;
  if (i < n)
  {
    out[i] = in[i];
  }
  else
  {
    printf("skip load \n");
  }
}
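
// Benchmark driver: for each stride 1, 2, 4, ..., 2^MAX_STRIDE, the copy kernel
// is launched BENCH_STEPS times and timed with CUDA events. The L2 cache is
// flushed before every launch so each run reads from DRAM, and the first
// iteration of each stride is a warmup. Results are printed as a Python-style list.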
int main()
{
  double timings[MAX_STRIDE+1];
  float* in_d;
  float* out_d;
  long N = std::pow<long, long>(2, 31);
  gpuErrchk(cudaMalloc((void**) &out_d, N*sizeof(float)));
  gpuErrchk(cudaMalloc((void**) &in_d, N*sizeof(float)));
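
  // s = -1 runs an extra warmup pass at stride 1 whose result is not stored in timings[]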
  for (int s = -1; s<=MAX_STRIDE; s++)
  {
    int stride = std::pow(2, std::max(0, s));
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    dim3 dimGrid(BLOCKS, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    float time = 0.f;
    double run_time = 0.0;
    for (int i = -1; i<BENCH_STEPS; i++)
    {
      cudaMemset(in_d, 1, N*sizeof(float));
      cudaMemset(out_d, 0, N*sizeof(float));
      clear_l2();
      gpuErrchk(cudaDeviceSynchronize());
      gpuErrchk(cudaEventRecord(start));
      copy<<<dimGrid, dimBlock>>>(N, in_d, out_d, stride);
      gpuErrchk(cudaEventRecord(stop));
      gpuErrchk(cudaEventSynchronize(stop));
      gpuErrchk(cudaEventElapsedTime(&time, start, stop));
      gpuErrchk(cudaPeekAtLastError());
      gpuErrchk(cudaDeviceSynchronize());
      if (i != -1) // one warmup run
      {
        run_time += time / BENCH_STEPS;
      }
    }
    std::cout<<stride<<" "<<run_time<<std::endl;
    if (s >= 0)
    {
      timings[s] = run_time;
    }
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
  }
  std::cout<<"timings"<<" = [";
  for (int i = 0; i<=MAX_STRIDE; i++)
  {
    std::cout<<std::fixed<<std::setprecision(6)<<timings[i]<<", ";
  }
  std::cout<<"]"<<std::endl;
  cudaFree(in_d);
  cudaFree(out_d);
  return 0;
}