-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpu_hashtable.cu
197 lines (180 loc) · 5.96 KB
/
gpu_hashtable.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#include <iostream>
#include <limits.h>
#include <stdlib.h>
#include <ctime>
#include <sstream>
#include <string>
#include <stdint.h>
#include "gpu_hashtable.hpp"
/* INIT HASH
 * Allocates a device-resident open-addressing hash table with `size` slots
 * and marks every slot empty.
 */
GpuHashTable::GpuHashTable(int size) {
hash_size = size; // actual size of hashtable
num_entries = 0; // number of occupied slots
// NOTE(review): cudaMalloc return code is unchecked; on failure `hashtable`
// is left invalid and every later kernel launch fails silently.
cudaMalloc((void **) &hashtable, size * sizeof(entry));
// cudaMemset fills byte-wise, so this only marks slots empty if
// KEY_INVALID == 0 -- presumably true; confirm in gpu_hashtable.hpp.
cudaMemset(hashtable, KEY_INVALID, size * sizeof(entry));
}
/* DESTROY HASH
 * Releases the device allocation backing the table.
 */
GpuHashTable::~GpuHashTable() {
cudaFree(hashtable);
}
/* Hash function used by hashtable.
 * Multiplies the key by a large prime, reduces modulo a larger prime, then
 * maps into [0, limit). All arithmetic is unsigned 64-bit, which fixes two
 * portability bugs in the previous form:
 *   - abs(INT_MIN) is undefined behavior;
 *   - `long` is 32 bits on LLP64 platforms (e.g. Windows), so the product
 *     abs(data) * 105359939 overflowed.
 * Buckets are unchanged for non-negative keys; negative keys now hash via
 * their two's-complement bit pattern, which is still internally consistent
 * because every probe in this file goes through this function.
 */
__device__ uint32_t hash_func(int data, int limit) {
	return (uint32_t)(((uint64_t)(uint32_t)data * 105359939ULL)
	                  % 1685759167ULL % (uint32_t)limit);
}
/* resize function that will be run by GPU
 * Kernel: rehashes every occupied slot of `hashtable` (hash_size slots) into
 * `new_hash` (numBucketsReshape slots) using linear probing.
 * Expected launch: 1D grid covering hash_size threads; one thread per
 * source slot. `new_hash` must be pre-zeroed (all keys == KEY_INVALID) and
 * numBucketsReshape must be >= the number of occupied slots, otherwise the
 * probe loop cannot terminate.
 */
__global__ void resize(GpuHashTable::entry *hashtable, GpuHashTable::entry *new_hash,
int hash_size, int numBucketsReshape) {
/* each thread will copy one element from hashtable to new_hash */
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < hash_size) {
// empty source slot: nothing to move
if (hashtable[tid].key == KEY_INVALID)
return;
/* rehash each key */
uint32_t key = hash_func(hashtable[tid].key, numBucketsReshape);
while (true) {
/* find empty slot and add pair */
// atomicCAS claims the slot: only the thread that swaps KEY_INVALID ->
// its key (or finds its own key already there) may write the value.
// Keys in the source table are unique, so the prev == key case cannot
// race with another writer here.
uint32_t prev = atomicCAS(&new_hash[key].key, KEY_INVALID, hashtable[tid].key);
if (prev == hashtable[tid].key || prev == KEY_INVALID) {
new_hash[key].value = hashtable[tid].value;
break;
}
// slot taken by a different key: linear probe to the next bucket
key++;
key %= numBucketsReshape;
}
}
}
/* RESHAPE HASH
 * Rebuilds the table into a fresh device allocation of numBucketsReshape
 * slots, rehashing every occupied entry on the GPU, then swaps the
 * allocations. numBucketsReshape must be >= num_entries or the resize
 * kernel's probe loop cannot terminate.
 */
void GpuHashTable::reshape(int numBucketsReshape) {
	/* 256 is a multiple of the warp size (32); the previous 100 left 28
	 * idle lanes in the last warp of every block. */
	const uint32_t block_size = 256;
	uint32_t blocks_no = (hash_size + block_size - 1) / block_size; // ceil-div
	struct entry *new_hash;
	/* alloc new hash; cudaMemset is byte-wise, so this marks all slots
	 * empty provided KEY_INVALID == 0 */
	cudaMalloc((void **) &new_hash, numBucketsReshape * sizeof(entry));
	cudaMemset(new_hash, KEY_INVALID, numBucketsReshape * sizeof(entry));
	resize<<<blocks_no, block_size>>>(hashtable, new_hash, hash_size, numBucketsReshape);
	/* the old allocation must stay alive until the kernel finishes */
	cudaDeviceSynchronize();
	cudaFree(hashtable);
	hashtable = new_hash;
	hash_size = numBucketsReshape;
}
/* insert function that will be run by GPU
 * Kernel: inserts (keys[tid], values[tid]) pairs into `hashtable`
 * (hash_size slots) with linear probing; an existing key has its value
 * overwritten. Expected launch: 1D grid covering numKeys threads.
 * Precondition: the table has at least one free slot per new key, otherwise
 * the probe loop cannot terminate. Two threads inserting the same key in
 * one batch both reach the value store; last write wins.
 */
__global__ void insert(GpuHashTable::entry *hashtable, int hash_size,
int *keys, int* values, int numKeys) {
/* each thread will insert one element into hashtable */
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numKeys) {
/* compute hash for key */
uint32_t key = hash_func(keys[tid], hash_size);
while (true) {
/* find empty spot or update value if the key already exists */
// atomicCAS either claims an empty slot (prev == KEY_INVALID) or
// observes this thread's own key already present (prev == keys[tid]);
// in both cases this thread owns the value field.
uint32_t prev = atomicCAS(&hashtable[key].key, KEY_INVALID, keys[tid]);
if (prev == keys[tid] || prev == KEY_INVALID) {
hashtable[key].value = values[tid];
return;
}
// slot held by a different key: probe the next bucket
key++;
key %= hash_size;
}
}
}
/* INSERT BATCH
 * Copies numKeys (key, value) pairs to the GPU and inserts them in parallel;
 * keys already present have their values overwritten. Grows the table first
 * whenever the post-insert load factor would reach 0.9.
 * Returns true on success, false on invalid arguments.
 */
bool GpuHashTable::insertBatch(int *keys, int* values, int numKeys) {
	if (keys == NULL || values == NULL || numKeys <= 0)
		return false;
	/* Probe current contents first so num_entries only counts keys that are
	 * not already stored (existing keys are updated in place).
	 * NOTE(review): duplicate keys inside one batch are each counted as
	 * new, which can overcount num_entries slightly. */
	int *new_values = getBatch(keys, numKeys);
	for (int i = 0; i < numKeys; i++)
		if (new_values[i] == KEY_INVALID)
			num_entries++;
	free(new_values);
	/* Grow before inserting so the load factor stays below 0.9. The "+ 1"
	 * guarantees strictly more slots than entries: the previous sizing,
	 * num_entries + (int)(0.1 * num_entries), truncates to zero growth for
	 * num_entries < 10, which could leave a 100%-full table and make the
	 * probe loops in insert()/get() spin forever on an absent key. */
	if ((float)num_entries / hash_size >= 0.9f)
		reshape(num_entries + (int)(0.1f * num_entries) + 1);
	const uint32_t block_size = 256; /* multiple of warp size (32) */
	uint32_t blocks_no = (numKeys + block_size - 1) / block_size; // ceil-div
	int *dev_keys = 0;
	int *dev_values = 0;
	/* alloc memory for GPU and copy keys and values arrays into GPU memory */
	cudaMalloc((void **) &dev_keys, numKeys * sizeof(int));
	cudaMalloc((void **) &dev_values, numKeys * sizeof(int));
	cudaMemcpy(dev_keys, keys, numKeys * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_values, values, numKeys * sizeof(int), cudaMemcpyHostToDevice);
	insert<<<blocks_no, block_size>>>(hashtable, hash_size, dev_keys, dev_values, numKeys);
	cudaDeviceSynchronize();
	cudaFree(dev_keys);
	cudaFree(dev_values);
	return true;
}
/* get function that will be run by GPU
 * Kernel: looks up each keys[tid] in `hashtable` (hash_size slots) by linear
 * probing and writes the stored value -- or KEY_INVALID if the key is absent
 * -- to values[tid]. Expected launch: 1D grid covering numKeys threads.
 * Precondition: the table contains at least one empty slot, otherwise the
 * probe loop for an absent key never terminates.
 */
__global__ void get(GpuHashTable::entry *hashtable, int hash_size,
int *keys, int *values, int numKeys) {
/* each thread will add to the result array one element from hashtable
 * corresponding to one key form keys array
 */
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numKeys) {
/* compute hash for key */
uint32_t key = hash_func(keys[tid], hash_size);
while (true) {
// key found at this probe position
if (hashtable[key].key == keys[tid]) {
values[tid] = hashtable[key].value;
break;
}
// empty slot ends the probe chain: the key is not in the table
if (hashtable[key].key == KEY_INVALID) {
values[tid] = KEY_INVALID;
break;
}
// occupied by another key: continue linear probing
key++;
key %= hash_size;
}
}
}
/* GET BATCH
 * Looks up numKeys keys in parallel on the GPU. Returns a malloc'd array of
 * numKeys values where absent keys map to KEY_INVALID; the caller owns (and
 * must free) the array. Returns NULL on invalid arguments.
 */
int* GpuHashTable::getBatch(int* keys, int numKeys) {
	/* A zero-sized batch would previously launch a 0-block grid, which is
	 * an invalid launch configuration -- reject it up front. */
	if (keys == NULL || numKeys <= 0)
		return NULL;
	int *results = (int *)malloc(numKeys * sizeof(int));
	const uint32_t block_size = 256; /* multiple of warp size (32) */
	uint32_t blocks_no = (numKeys + block_size - 1) / block_size; // ceil-div
	int *dev_keys = 0;
	int *dev_values = 0;
	/* alloc memory for GPU and copy the keys array into GPU memory */
	cudaMalloc((void **) &dev_keys, numKeys * sizeof(int));
	cudaMalloc((void **) &dev_values, numKeys * sizeof(int));
	cudaMemcpy(dev_keys, keys, numKeys * sizeof(int), cudaMemcpyHostToDevice);
	/* byte-wise fill marks all results "not found", assuming KEY_INVALID == 0 */
	cudaMemset(dev_values, KEY_INVALID, numKeys * sizeof(int));
	get<<<blocks_no, block_size>>>(hashtable, hash_size, dev_keys, dev_values, numKeys);
	cudaDeviceSynchronize();
	/* copy values array from GPU memory into results array (CPU memory) */
	cudaMemcpy(results, dev_values, numKeys * sizeof(int), cudaMemcpyDeviceToHost);
	cudaFree(dev_keys);
	cudaFree(dev_values);
	return results;
}
/* GET LOAD FACTOR
 * Fraction of table slots currently occupied: num_entries / hash_size,
 * in the range [0.0f, 1.0f].
 */
float GpuHashTable::loadFactor() {
	float occupied = (float)num_entries;
	float capacity = (float)hash_size;
	return occupied / capacity;
}
/*********************************************************/
/* Glue macros consumed by test_map.cpp (included below): they bind the
 * generic test harness to this GpuHashTable implementation. The harness
 * instantiates a single table (confusingly also named `GpuHashTable`)
 * starting at 1 slot and drives it through these entry points. */
#define HASH_INIT GpuHashTable GpuHashTable(1);
#define HASH_RESERVE(size) GpuHashTable.reshape(size);
#define HASH_BATCH_INSERT(keys, values, numKeys) GpuHashTable.insertBatch(keys, values, numKeys)
#define HASH_BATCH_GET(keys, numKeys) GpuHashTable.getBatch(keys, numKeys)
#define HASH_LOAD_FACTOR GpuHashTable.loadFactor()
/* explicit destructor call: the object is also destroyed at scope exit,
 * so cudaFree runs twice -- freeing an already-freed pointer */
#define HASH_DESTROY GpuHashTable.~GpuHashTable();
#include "test_map.cpp"