-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcuda_group_by.cuh
96 lines (81 loc) · 2.37 KB
/
cuda_group_by.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#pragma once
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include <cuda.h>
#include "utils.cuh"
// we need this stupid type since c++ doesn't see
// uint64_t and unsigned long long int as the same type
// (even though they are on literally any plattform)
// this causes e.g. atomicAdd to complain about parameter types
typedef unsigned long long int cudaUInt64_t;
#if INTPTR_MAX == INT64_MAX
typedef unsigned long long int cudaVoidPtr_t;
#else
typedef unsigned int cudaVoidPtr_t;
#endif
#define CUDA_WARP_SIZE 32
#define CUDA_WARP_SIZE_BITS 5
#define CUDA_MAX_BLOCK_SIZE 1024
#define CUDA_MAX_BLOCK_SIZE_BITS 10
#define CUDA_L1_CACHE_LINE_SIZE 128
#define CUDA_L2_CACHE_LINE_SIZE 32
#define CUDA_MAX_CACHE_LINE_SIZE CUDA_L1_CACHE_LINE_SIZE
// 48 Kilobytes of shared memory per block
#define CUDA_SHARED_MEM_PER_BLOCK 0xC000
// -> (rounded down) 15 bits of shared memory
#define CUDA_SHARED_MEM_BITS_PER_BLOCK 15
struct db_table {
uint64_t* group_col;
uint64_t* aggregate_col;
size_t row_count;
};
struct gpu_data {
db_table input;
db_table output;
};
static inline void alloc_db_table_gpu(db_table* t, uint64_t row_count)
{
CUDA_TRY(cudaMalloc(&t->group_col, row_count * sizeof(uint64_t)));
CUDA_TRY(cudaMalloc(&t->aggregate_col, row_count * sizeof(uint64_t)));
t->row_count = row_count;
}
static inline void free_db_table_gpu(db_table* t)
{
CUDA_TRY(cudaFree(t->group_col));
CUDA_TRY(cudaFree(t->aggregate_col));
}
static inline void
gpu_data_alloc(gpu_data* gd, size_t max_groups, size_t max_rows)
{
alloc_db_table_gpu(&gd->input, max_rows);
alloc_db_table_gpu(&gd->output, max_groups);
}
static inline void gpu_data_free(gpu_data* gd)
{
free_db_table_gpu(&gd->output);
free_db_table_gpu(&gd->input);
}
static inline size_t ceil_to_pow_two(size_t v)
{
return v == 1 ? 1 : 1 << (8 * sizeof(size_t) - __builtin_clzl(v - 1));
}
static inline size_t ceil_to_mult(size_t v, size_t mult)
{
if (v % mult != 0) v += mult - (v % mult);
return v;
}
__device__ __host__ static inline void* ptradd(void* ptr, size_t val)
{
return (void*)(((char*)ptr) + val);
}
__device__ __host__ static inline size_t ptrsub(void* greater, void* smaller)
{
return ((size_t)greater) - ((size_t)smaller);
}
static inline int log2(size_t v)
{
return 8 * sizeof(size_t) - __builtin_clzll((v)) - 1;
}