#pragma once
#include "cuda_group_by.cuh"
#include <type_traits>
#include "group_by_hashtable.cuh"
#define WCMP_MAX_GROUPS WCMP_CUDA_WARP_SIZE
// this approach assigns at most one group to each thread of a warp sized
// batch, so it supports at most WCMP_MAX_GROUPS groups and requires block
// dimensions between WCMP_MIN_BLOCK_DIM and WCMP_MAX_BLOCK_DIM
#define WCMP_MIN_BLOCK_DIM 32
#define WCMP_MAX_BLOCK_DIM 1024
GROUP_BY_HASHTABLE_FORWARD_REQUIREMENTS(group_by_warp_cmp)

static inline bool approach_warp_cmp_available(
    int group_bits, size_t row_count, int grid_dim, int block_dim,
    int stream_count)
{
    const size_t group_count = (1 << group_bits);
    if (!grid_dim || !block_dim) return false;
    if (group_count > CUDA_WARP_SIZE) return false;
    if (block_dim < CUDA_WARP_SIZE) return false;
    if (block_dim > CUDA_MAX_BLOCK_SIZE) return false;
    return true;
}
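
// Overview of the kernel below: each batch of BATCH_SIZE threads (at most
// one warp) tries to claim one distinct group value per thread. Rows are
// staged in shared memory, every thread scans its batch's slots and
// accumulates the rows matching its claimed group, and rows whose group has
// no owner yet are handed out to still unassigned threads. Once every
// thread of a batch owns a group, the batch switches to a cheaper loop
// without the handout bookkeeping. At the end each thread inserts its
// (group, aggregate) pair into the shared hashtable.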
template <int MAX_GROUP_BITS>
__global__ void kernel_warp_cmp(
    db_table input, db_table output, group_ht_entry<>* hashtable,
    int stream_count, int stream_idx)
{
    size_t base_idx = (size_t)blockIdx.x * blockDim.x +
                      (size_t)stream_idx * blockDim.x * gridDim.x;
    size_t idx = threadIdx.x + base_idx;
    size_t stride = (size_t)blockDim.x * gridDim.x * stream_count;
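    // rows are processed grid stride style: base_idx starts at this block's
    // offset within stream stream_idx and advances by the combined stride of
    // all blocks across all streams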

    constexpr int BATCH_SIZE_UNCAPPED = (1 << MAX_GROUP_BITS);
    // this number is one of {2,4,8,16,32}
    constexpr int BATCH_SIZE = (BATCH_SIZE_UNCAPPED <= CUDA_WARP_SIZE)
        ? BATCH_SIZE_UNCAPPED
        : CUDA_WARP_SIZE;
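
    // the block is split into batches of BATCH_SIZE consecutive threads;
    // each batch works on its own BATCH_SIZE shared memory slots and runs
    // the group handout bookkeeping independently of the other batches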
    int batch_idx = threadIdx.x / BATCH_SIZE;
    int batch_base = batch_idx * BATCH_SIZE;
    int next_batch_base = batch_base + BATCH_SIZE;

    bool group_assigned = false;
    uint64_t assigned_group;
    uint64_t assigned_aggregate;

    __shared__ uint64_t groups[CUDA_MAX_BLOCK_SIZE];
    __shared__ uint64_t aggregates[CUDA_MAX_BLOCK_SIZE];
    __shared__ bool row_handled[CUDA_MAX_BLOCK_SIZE];
    __shared__ int handout_ids[CUDA_MAX_BLOCK_SIZE];
    __shared__ int handout_counters[CUDA_MAX_BLOCK_SIZE / BATCH_SIZE];
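    // groups/aggregates stage one input row per thread, row_handled marks
    // rows whose group already has an owning thread, handout_ids[t] names
    // the staged row whose group was handed to thread t, and
    // handout_counters[b] is the next thread id in batch b that will
    // receive a handout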

    size_t rowcount = input.row_count;
    int prev_handout_counter = batch_base;
    if (threadIdx.x == batch_base) {
        handout_counters[batch_idx] = batch_base;
    }
    handout_ids[threadIdx.x] = -1;
    __syncwarp();

    // group acquiring phase
    // some threads in the warp have no assigned group yet and must handle
    // the group handouts
    while (base_idx < rowcount) {
        // read input into shared memory, 1 element per thread
        if (idx < rowcount) {
            groups[threadIdx.x] = input.group_col[idx];
            aggregates[threadIdx.x] = input.aggregate_col[idx];
            row_handled[threadIdx.x] = false;
        }
        else {
            row_handled[threadIdx.x] = true;
        }
        // each thread checks each staged row of its batch to see if it's
        // responsible for that row's group
        __syncwarp();
        for (int i = 0; i < BATCH_SIZE; i++) {
            if (groups[batch_base + i] == assigned_group) {
                if (group_assigned && base_idx + batch_base + i < rowcount) {
                    row_handled[batch_base + i] = true;
                    assigned_aggregate += aggregates[batch_base + i];
                }
            }
        }
        idx += stride;
        base_idx += stride;
        // if no thread was responsible for the input this thread read in,
        // hand out this group to one of the unassigned threads
        __syncwarp();
        if (!row_handled[threadIdx.x]) {
            int hand_out_to_tid = atomicAdd(&handout_counters[batch_idx], 1);
            if (hand_out_to_tid >= next_batch_base) {
                hand_out_to_tid -= BATCH_SIZE;
            }
            handout_ids[hand_out_to_tid] = threadIdx.x;
        }
        // check if we have to deal with handouts
        __syncwarp();
        int new_handout_counter = handout_counters[batch_idx];
        if (new_handout_counter == prev_handout_counter) continue;
        // if our thread got handed out a group responsibility,
        // check if nobody else before us got assigned the same group
        // if not, take on the responsibility; if yes, yield it to the
        // lower tid
        __syncwarp();
        int handout_id = handout_ids[threadIdx.x];
        bool handout_assigned = false;
        if (handout_id != -1) {
            handout_assigned = true;
            uint64_t handed_out_group = groups[handout_id];
            int i = prev_handout_counter;
            while (i != threadIdx.x) {
                int prev_handout_id = handout_ids[i];
                if (groups[prev_handout_id] == handed_out_group) {
                    atomicSub(&handout_counters[batch_idx], 1);
                    atomicAdd(
                        (cudaUInt64_t*)&aggregates[prev_handout_id],
                        aggregates[handout_id]);
                    handout_assigned = false;
                    break;
                }
                i++;
                if (i == next_batch_base) i = batch_base;
            }
        }
        // if we yielded, reset our handout id
        // we have to do this after the actual yield since
        // other threads may still index based on it during yield detection
        __syncwarp();
        if (handout_id != -1 && handout_assigned == false) {
            handout_ids[threadIdx.x] = -1;
        }
        // if we yielded our responsibility to a lower tid, but a higher tid
        // didn't yield, steal its responsibility
        // the i'th thread gaining responsibility steals from the i'th non
        // yielding one
        __syncwarp();
        int steal_from_tid = -1;
        if (handout_id != -1) {
            new_handout_counter = handout_counters[batch_idx];
            if (!handout_assigned && new_handout_counter > threadIdx.x &&
                !group_assigned) {
                handout_assigned = true;
                int handout_priority = 0;
                // since the lowest tid can't yield, start from the next
                for (int i = prev_handout_counter + 1; i < threadIdx.x; i++) {
                    if (handout_ids[i] == -1) handout_priority++;
                }
                steal_from_tid = new_handout_counter;
                while (true) {
                    if (steal_from_tid == next_batch_base) {
                        steal_from_tid = batch_base;
                    }
                    handout_id = handout_ids[steal_from_tid];
                    if (handout_id != -1) {
                        if (handout_priority == 0) break;
                        handout_priority--;
                    }
                    steal_from_tid++;
                }
            }
        }
        // inform the tid that we stole from that it's no longer responsible
        __syncwarp();
        if (steal_from_tid != -1) {
            handout_ids[steal_from_tid] = -1;
            handout_ids[threadIdx.x] = handout_id;
        }
        // finally assign the group if we weren't stolen from
        __syncwarp();
        if (handout_assigned && handout_id != -1 &&
            handout_ids[threadIdx.x] != -1) {
            handout_ids[threadIdx.x] = -1;
            group_assigned = true;
            assigned_group = groups[handout_id];
            assigned_aggregate = aggregates[handout_id];
        }
        // if every thread in the batch was assigned a group, advance
        // to the satisfied warp phase
        prev_handout_counter = handout_counters[batch_idx];
        if (prev_handout_counter == next_batch_base) {
            break;
        }
    }
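
    // note: if the input contains fewer distinct groups than BATCH_SIZE,
    // a batch never saturates; it then simply finishes the input in the
    // acquiring loop above and skips the satisfied phase below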
    // satisfied warp phase
    // every thread has been assigned a group, no need to check anymore
    while (base_idx < rowcount) {
        // read input into shared memory, 1 element per thread
        if (idx < rowcount) {
            groups[threadIdx.x] = input.group_col[idx];
            aggregates[threadIdx.x] = input.aggregate_col[idx];
        }
        __syncwarp();
        for (int i = 0; i < BATCH_SIZE; i++) {
            if (groups[batch_base + i] == assigned_group) {
                if (base_idx + batch_base + i < rowcount) {
                    assigned_aggregate += aggregates[batch_base + i];
                }
            }
        }
        idx += stride;
        base_idx += stride;
    }
    // the input is processed, proceed to writeout
    if (group_assigned) {
        group_ht_insert<MAX_GROUP_BITS>(
            hashtable, assigned_group, assigned_aggregate);
    }
}
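
// host side wrapper: records the start event, zeroes the group_ht_groups_found
// counter, launches kernel_warp_cmp once per stream and then reuses the
// hashtable writeout shared with the hashtable based approaches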
template <int MAX_GROUP_BITS>
void group_by_warp_cmp(
    gpu_data* gd, int grid_dim, int block_dim, int stream_count,
    cudaStream_t* streams, cudaEvent_t* events, cudaEvent_t start_event,
    cudaEvent_t end_event)
{
    int actual_stream_count = stream_count ? stream_count : 1;
    uint64_t zero = 0;
    CUDA_TRY(cudaEventRecord(start_event));
    cudaMemcpyToSymbol(
        group_ht_groups_found, &zero, sizeof(zero), 0, cudaMemcpyHostToDevice);
    for (int i = 0; i < actual_stream_count; i++) {
        cudaStream_t stream = stream_count ? streams[i] : 0;
        kernel_warp_cmp<MAX_GROUP_BITS><<<grid_dim, block_dim, 0, stream>>>(
            gd->input, gd->output, group_ht_entry<>::table, actual_stream_count,
            i);
        if (stream_count > 1) cudaEventRecord(events[i], stream);
    }
    group_by_hashtable_writeout<MAX_GROUP_BITS>(
        gd, grid_dim, block_dim, stream_count, streams, events, start_event,
        end_event);
}
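
// A minimal usage sketch (kept as a comment so the header stays
// self contained), assuming the surrounding benchmark harness provides an
// initialized gpu_data instance plus streams and timing events; the names
// gd, group_bits, row_count, grid_dim, block_dim, stream_count, streams,
// events, start_event and end_event are placeholders, not part of this file:
//
//     constexpr int MAX_GROUP_BITS = 5; // at most 32 groups fits one warp
//     if (approach_warp_cmp_available(
//             group_bits, row_count, grid_dim, block_dim, stream_count)) {
//         group_by_warp_cmp<MAX_GROUP_BITS>(
//             &gd, grid_dim, block_dim, stream_count, streams, events,
//             start_event, end_event);
//     }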