#pragma once
#include "group_by_global_array.cuh"
// 8 bytes for the aggregate value + 1 bit (0.125 bytes) for the occurrence flag
#define PER_THREAD_ARRAY_ENTRY_SIZE 8.125
#define PER_THREAD_ENTRY_BITS 3
#define PER_THREAD_ARRAY_SHARED_MEM_SUFFICIENT(GROUP_BITS, BLOCK_DIM) \
(CUDA_SHARED_MEM_PER_BLOCK >= ((((size_t)1) << (GROUP_BITS)) * \
PER_THREAD_ARRAY_ENTRY_SIZE * BLOCK_DIM))
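// e.g. assuming 48 KiB of shared memory per block (a common value, used
// here purely for illustration): with BLOCK_DIM = 32 the check passes up
// to GROUP_BITS = 7 (128 * 8.125 * 32 = 33280 <= 49152 bytes), while with
// BLOCK_DIM = 1024 it already fails at GROUP_BITS = 3
// (8 * 8.125 * 1024 = 66560 > 49152 bytes)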
GROUP_BY_GLOBAL_ARRAY_FORWARD_REQUIREMENTS(group_by_per_thread_array);
static inline bool approach_per_thread_array_available(
int group_bits, size_t row_count, int grid_dim, int block_dim,
int stream_count)
{
if (!grid_dim || !block_dim) return false;
return PER_THREAD_ARRAY_SHARED_MEM_SUFFICIENT(group_bits, block_dim);
}
template <int MAX_GROUP_BITS>
__global__ void kernel_per_thread_array_bank_optimized(
db_table input, uint64_t* global_array, bool* global_occurance_array,
int stream_count, int stream_idx)
{
    // the ternary guards against template instantiations that would
    // cause a ptxas error during compilation by requesting
    // too much shared memory, even if these instantiations are never used
constexpr bool SANE_INSTANCE =
PER_THREAD_ARRAY_SHARED_MEM_SUFFICIENT(MAX_GROUP_BITS, CUDA_WARP_SIZE);
constexpr int GROUP_COUNT = SANE_INSTANCE ? (int)1 << MAX_GROUP_BITS : 1;
    // e.g. on the GTX 1080 Ti, the 48 KiB of shared memory per block split
    // into a 32 KiB power of two plus a 16 KiB overhang that more than
    // suffices to store all the flag bits
constexpr bool FLAGS_FIT_IN_NON_POW2_SHARED_MEM_OVERHANG =
(CUDA_SHARED_MEM_PER_BLOCK -
(((size_t)1) << CUDA_SHARED_MEM_BITS_PER_BLOCK)) >=
(GROUP_COUNT * CUDA_MAX_BLOCK_SIZE / 8);
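    // worked example (values assumed purely for illustration): with 48 KiB
    // per block and CUDA_SHARED_MEM_BITS_PER_BLOCK = 15, the overhang is
    // 49152 - 32768 = 16384 bytes; for GROUP_COUNT = 128 and
    // CUDA_MAX_BLOCK_SIZE = 1024 the flags need 128 * 1024 / 8 = 16384
    // bytes, which just fits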
constexpr int MAX_BLOCK_BITS =
SANE_INSTANCE ? CUDA_SHARED_MEM_BITS_PER_BLOCK - MAX_GROUP_BITS -
PER_THREAD_ENTRY_BITS -
!FLAGS_FIT_IN_NON_POW2_SHARED_MEM_OVERHANG
: 1;
constexpr int MAX_BLOCK_SIZE = 1 << MAX_BLOCK_BITS;
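    // continuing the example above: MAX_BLOCK_BITS = 15 - 7 - 3 - 0 = 5,
    // i.e. up to 32 threads per block can each keep a private copy of the
    // 128-entry aggregate array in shared memory
    // (128 groups * 32 threads * 8 bytes = 32768 bytes = the 32 KiB
    // power-of-two part)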
constexpr int OCC_FLAGS_PACK_SIZE = 8 * sizeof(uint32_t);
constexpr int OCC_FLAGS_ARRAY_SIZE = GROUP_COUNT > OCC_FLAGS_PACK_SIZE
? GROUP_COUNT / OCC_FLAGS_PACK_SIZE
: 1;
    // we will always have sufficient occurrence flag bits since
    // GROUP_COUNT is a power of two and therefore either smaller than
    // or divisible by OCC_FLAGS_PACK_SIZE
static_assert(OCC_FLAGS_ARRAY_SIZE * OCC_FLAGS_PACK_SIZE >= GROUP_COUNT);
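    // e.g. GROUP_COUNT = 128 packs into 128 / 32 = 4 uint32_t flag words
    // per thread, while GROUP_COUNT = 8 still occupies one word, leaving
    // the upper 24 bits unused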
const int ARR_STRIDE = blockDim.x;
int tid = threadIdx.x + blockIdx.x * blockDim.x +
stream_idx * blockDim.x * gridDim.x;
int stride = blockDim.x * gridDim.x * stream_count;
__shared__ uint32_t aggregate_low_bytes[GROUP_COUNT * MAX_BLOCK_SIZE];
__shared__ uint32_t aggregate_high_bytes[GROUP_COUNT * MAX_BLOCK_SIZE];
__shared__ uint32_t occurance_flags[OCC_FLAGS_ARRAY_SIZE * MAX_BLOCK_SIZE];
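    // layout: entry g of thread t lives at index g * blockDim.x + t, so
    // the threads of a warp access consecutive 4-byte words and thus
    // distinct shared memory banks; splitting the 64-bit aggregate into
    // separate low and high 32-bit arrays keeps every access within a
    // single bank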
for (int i = 0; i < GROUP_COUNT; i++) {
aggregate_low_bytes[i * ARR_STRIDE + threadIdx.x] = 0;
aggregate_high_bytes[i * ARR_STRIDE + threadIdx.x] = 0;
}
for (int i = 0; i < OCC_FLAGS_ARRAY_SIZE; i++) {
occurance_flags[i * ARR_STRIDE + threadIdx.x] = 0;
}
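    // in the loop below, the 64-bit running sum is reassembled from its
    // two 32-bit halves, incremented, and split again, so each half stays
    // in its own bank-friendly 32-bit array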
for (size_t i = tid; i < input.row_count; i += stride) {
uint64_t group = input.group_col[i];
uint64_t aggregate = input.aggregate_col[i];
        size_t arr_idx = group * ARR_STRIDE + threadIdx.x;
uint64_t stored_agg = aggregate_low_bytes[arr_idx];
stored_agg |= ((uint64_t)aggregate_high_bytes[arr_idx]) << 32;
stored_agg += aggregate;
aggregate_low_bytes[arr_idx] = (uint32_t)stored_agg;
aggregate_high_bytes[arr_idx] = (uint32_t)(stored_agg >> 32);
occurance_flags
[(group / OCC_FLAGS_PACK_SIZE) * ARR_STRIDE + threadIdx.x] |=
(uint32_t)1 << (group % OCC_FLAGS_PACK_SIZE);
}
for (int i = 0; i < GROUP_COUNT; i += OCC_FLAGS_PACK_SIZE) {
int end = i + OCC_FLAGS_PACK_SIZE <= GROUP_COUNT ? OCC_FLAGS_PACK_SIZE
: GROUP_COUNT - i;
uint32_t occ_pack = occurance_flags
[(i / OCC_FLAGS_PACK_SIZE) * ARR_STRIDE + threadIdx.x];
for (int j = 0; j < end; j++) {
            if ((occ_pack >> j) & 0x1) {
uint64_t group = i + j;
size_t arr_idx = group * ARR_STRIDE + threadIdx.x;
uint64_t aggregate = aggregate_low_bytes[arr_idx];
aggregate |= ((uint64_t)aggregate_high_bytes[arr_idx]) << 32;
global_array_insert<true>(
global_array, global_occurance_array, group, aggregate);
}
}
}
}
template <int MAX_GROUP_BITS>
__global__ void kernel_per_thread_array(
db_table input, uint64_t* global_array, bool* global_occurance_array,
int stream_count, int stream_idx)
{
// ternary guards against template instantiations that would
// cause ptxas error during compilations by requiring
// too much shared memory even if these instantiations are never used
constexpr int GROUP_COUNT =
PER_THREAD_ARRAY_SHARED_MEM_SUFFICIENT(MAX_GROUP_BITS, CUDA_WARP_SIZE)
? (int)1 << MAX_GROUP_BITS
: 1;
constexpr int OCC_FLAGS_PACK_SIZE = 8 * sizeof(uint32_t);
constexpr int OCC_FLAGS_ARRAY_SIZE = GROUP_COUNT > OCC_FLAGS_PACK_SIZE
? GROUP_COUNT / OCC_FLAGS_PACK_SIZE
: 1;
    // we will always have sufficient occurrence flag bits since GROUP_COUNT
    // is a power of two and therefore either smaller than or divisible by
    // OCC_FLAGS_PACK_SIZE
static_assert(OCC_FLAGS_ARRAY_SIZE * OCC_FLAGS_PACK_SIZE >= GROUP_COUNT);
uint64_t per_thread_array[GROUP_COUNT];
uint32_t per_thread_occurance_flags[OCC_FLAGS_ARRAY_SIZE];
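    // both arrays are thread-private (registers or local memory), so no
    // synchronization is needed while accumulating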
int tid = threadIdx.x + blockIdx.x * blockDim.x +
stream_idx * blockDim.x * gridDim.x;
int stride = blockDim.x * gridDim.x * stream_count;
for (int i = 0; i < GROUP_COUNT; i++) {
per_thread_array[i] = 0;
}
for (int i = 0; i < OCC_FLAGS_ARRAY_SIZE; i++) {
per_thread_occurance_flags[i] = 0;
}
for (size_t i = tid; i < input.row_count; i += stride) {
uint64_t group = input.group_col[i];
uint64_t aggregate = input.aggregate_col[i];
per_thread_array[group] += aggregate;
        per_thread_occurance_flags[group / OCC_FLAGS_PACK_SIZE] |=
            (uint32_t)1 << (group % OCC_FLAGS_PACK_SIZE);
}
for (int i = 0; i < GROUP_COUNT; i += OCC_FLAGS_PACK_SIZE) {
        int end = i + OCC_FLAGS_PACK_SIZE <= GROUP_COUNT ? OCC_FLAGS_PACK_SIZE
                                                         : GROUP_COUNT - i;
uint32_t occ_pack = per_thread_occurance_flags[i / OCC_FLAGS_PACK_SIZE];
for (int j = 0; j < end; j++) {
if ((occ_pack >> j) & 0x1) {
uint64_t group = i + j;
global_array_insert<true>(
global_array, global_occurance_array, group,
per_thread_array[group]);
}
}
}
}
template <int MAX_GROUP_BITS, bool BANK_OPTIMIZED = false>
void group_by_per_thread_array(
gpu_data* gd, int grid_dim, int block_dim, int stream_count,
cudaStream_t* streams, cudaEvent_t* events, cudaEvent_t start_event,
cudaEvent_t end_event)
{
CUDA_TRY(cudaEventRecord(start_event));
// reset number of groups found
size_t zero = 0;
    CUDA_TRY(cudaMemcpyToSymbol(
        global_array_groups_found, &zero, sizeof(zero), 0,
        cudaMemcpyHostToDevice));
    // for stream_count 0 we use the default stream,
    // but that's actually still one stream, not zero
int actual_stream_count = stream_count ? stream_count : 1;
for (int i = 0; i < actual_stream_count; i++) {
cudaStream_t stream = stream_count ? streams[i] : 0;
if (BANK_OPTIMIZED) {
kernel_per_thread_array_bank_optimized<MAX_GROUP_BITS>
<<<grid_dim, block_dim, 0, stream>>>(
gd->input, global_array, global_array_occurance_flags,
actual_stream_count, i);
}
else {
kernel_per_thread_array<MAX_GROUP_BITS>
<<<grid_dim, block_dim, 0, stream>>>(
gd->input, global_array, global_array_occurance_flags,
actual_stream_count, i);
}
        // with only one stream there is no need for wait events
        if (stream_count > 1) CUDA_TRY(cudaEventRecord(events[i], stream));
}
group_by_global_array_writeout<MAX_GROUP_BITS>(
gd, grid_dim, block_dim, stream_count, streams, events, start_event,
end_event);
}
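// Hypothetical usage sketch (grid/block values chosen purely for
// illustration; the surrounding benchmark framework supplies the real
// gpu_data, streams and events):
//
//     gpu_data* gd = ...; // input table already uploaded
//     cudaEvent_t start, end;
//     CUDA_TRY(cudaEventCreate(&start));
//     CUDA_TRY(cudaEventCreate(&end));
//     // plain per-thread arrays, 2^5 = 32 groups, default stream
//     group_by_per_thread_array<5, false>(
//         gd, 128, 32, 0, nullptr, nullptr, start, end);
//     // bank-optimized shared memory variant
//     group_by_per_thread_array<5, true>(
//         gd, 128, 32, 0, nullptr, nullptr, start, end);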