#include "BufferCompaction.h"          // align_to_int64
#include "GpuInitGroups.h"
#include "GpuRtConstants.h"            // EMPTY_KEY_* sentinels
#include "Logger/Logger.h"             // CHECK_EQ
#include "QueryEngine/QueryEngine.h"   // getQueryEngineCudaStream

#define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
template <typename T>
__device__ int8_t* init_columnar_buffer(T* buffer_ptr,
                                        const T init_val,
                                        const uint32_t entry_count,
                                        const int32_t start,
                                        const int32_t step) {
  // Grid-stride loop: each thread writes every step-th entry of this column.
  for (int32_t i = start; i < entry_count; i += step) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}
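// Illustrative sketch (not from the original file): how init_columnar_buffer is
// meant to be chained, using its return value as the start of the next column.
// The kernel name, column types, and init values below are example assumptions.
#if 0
__global__ void example_init_two_columns(int64_t* buffer, const uint32_t entry_count) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  // First column: entry_count int32 slots set to -1.
  int8_t* ptr = init_columnar_buffer<int32_t>(
      reinterpret_cast<int32_t*>(buffer), -1, entry_count, start, step);
  // Round up to an 8-byte boundary before the next (int64) column, as the
  // production code does with align_to_int64.
  ptr = align_to_int64(ptr);
  // Second column: entry_count int64 slots set to 0.
  init_columnar_buffer<int64_t>(
      reinterpret_cast<int64_t*>(ptr), 0, entry_count, start, step);
}
#endif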
__device__ void init_columnar_group_by_buffer_gpu_impl(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  if (!keyless) {
    // One key column per group-by key, filled with the empty sentinel for key_size.
    for (uint32_t i = 0; i < key_count; ++i) {
      switch (key_size) {
        case 1:
          buffer_ptr = init_columnar_buffer<int8_t>(
              buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
          break;
        case 2:
          buffer_ptr = init_columnar_buffer<int16_t>(
              reinterpret_cast<int16_t*>(buffer_ptr), EMPTY_KEY_16,
              groups_buffer_entry_count, start, step);
          break;
        case 4:
          buffer_ptr = init_columnar_buffer<int32_t>(
              reinterpret_cast<int32_t*>(buffer_ptr), EMPTY_KEY_32,
              groups_buffer_entry_count, start, step);
          break;
        case 8:
          buffer_ptr = init_columnar_buffer<int64_t>(
              reinterpret_cast<int64_t*>(buffer_ptr), EMPTY_KEY_64,
              groups_buffer_entry_count, start, step);
          break;
        default:
          break;
      }
    }
    buffer_ptr = align_to_int64(buffer_ptr);
  }

  // Aggregate columns: one init value per column, sized by col_sizes[i].
  int32_t init_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      buffer_ptr = align_to_int64(buffer_ptr);
    }
    switch (col_sizes[i]) {
      case 1:
        buffer_ptr = init_columnar_buffer<int8_t>(
            buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 2:
        buffer_ptr = init_columnar_buffer<int16_t>(
            reinterpret_cast<int16_t*>(buffer_ptr), init_vals[init_idx++],
            groups_buffer_entry_count, start, step);
        break;
      case 4:
        buffer_ptr = init_columnar_buffer<int32_t>(
            reinterpret_cast<int32_t*>(buffer_ptr), init_vals[init_idx++],
            groups_buffer_entry_count, start, step);
        break;
      case 8:
        buffer_ptr = init_columnar_buffer<int64_t>(
            reinterpret_cast<int64_t*>(buffer_ptr), init_vals[init_idx++],
            groups_buffer_entry_count, start, step);
        break;
      default:
        break;
    }
  }
  __syncthreads();
}
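// Worked layout example (illustrative, not from the original source): with
// groups_buffer_entry_count = 4, key_count = 2, key_size = 8, and two aggregate
// columns with col_sizes = {8, 4}, the impl above writes, in order:
//   key column 0:  4 x EMPTY_KEY_64
//   key column 1:  4 x EMPTY_KEY_64
//   agg column 0:  4 x init_vals[0]  (64-bit slots)
//   agg column 1:  4 x init_vals[1]  (32-bit slots)
// with align_to_int64 padding applied between sections where needed.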
template <typename K>
__device__ void fill_empty_device_key(K* keys_ptr,
                                      const uint32_t key_count,
                                      const K empty_key) {
  for (uint32_t i = 0; i < key_count; ++i) {
    keys_ptr[i] = empty_key;
  }
}
__global__ void init_group_by_buffer_gpu(int64_t* groups_buffer,
                                         const int64_t* init_vals,
                                         const uint32_t groups_buffer_entry_count,
                                         const uint32_t key_count,
                                         const uint32_t key_width,
                                         const uint32_t row_size_quad,
                                         const bool keyless,
                                         const int8_t warp_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  if (keyless) {
    // Keyless layout: rows hold only aggregate slots, replicated once per warp.
    for (int32_t i = start;
         i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
         i += step) {
      groups_buffer[i] = init_vals[i % row_size_quad];
    }
    __syncthreads();
    return;
  }

  // Mark every row's key slots as empty.
  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* keys_ptr = groups_buffer + i * row_size_quad;
    switch (key_width) {
      case 4:
        fill_empty_device_key(
            reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
        break;
      case 8:
        fill_empty_device_key(
            reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
        break;
      default:
        break;
    }
  }

  // Aggregate slots start after the (int64-aligned) key section of each row.
  const uint32_t values_off_quad =
      align_to_int64(key_count * key_width) / sizeof(int64_t);
  for (uint32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
    const uint32_t val_count = row_size_quad - values_off_quad;
    for (uint32_t j = 0; j < val_count; ++j) {
      vals_ptr[j] = init_vals[j];
    }
  }
  __syncthreads();
}
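// Worked example for the row-wise layout above (illustrative): with
// key_count = 3 and key_width = 4 the keys occupy 12 bytes per row,
// align_to_int64(12) = 16, so values_off_quad = 16 / sizeof(int64_t) = 2 and
// each row's aggregate slots begin two int64 quads past the row's base pointer.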
// Kernel wrapper around the shared __device__ implementation above.
__global__ void init_columnar_group_by_buffer_gpu_wrapper(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  init_columnar_group_by_buffer_gpu_impl(
      groups_buffer, init_vals, groups_buffer_entry_count, key_count,
      agg_col_count, col_sizes, need_padding, keyless, key_size);
}
void init_group_by_buffer_on_device(int64_t* groups_buffer,
                                    const int64_t* init_vals,
                                    const uint32_t groups_buffer_entry_count,
                                    const uint32_t key_count,
                                    const uint32_t key_width,
                                    const uint32_t row_size_quad,
                                    const bool keyless,
                                    const int8_t warp_size,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  init_group_by_buffer_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
      groups_buffer, init_vals, groups_buffer_entry_count, key_count,
      key_width, row_size_quad, keyless, warp_size);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}
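// Illustrative usage sketch (not part of the original file): allocating and
// initializing a row-wise group-by buffer from the host. The sizes, the
// function name, and the launch dimensions below are example assumptions.
#if 0
void example_init_row_wise_buffer() {
  const uint32_t entry_count = 1024;  // number of group slots
  const uint32_t key_count = 2;       // two 8-byte keys per row
  const uint32_t key_width = 8;
  const uint32_t row_size_quad = 4;   // 2 key quads + 2 aggregate quads
  const int64_t init_vals_host[4] = {0, 0, 0, 0};

  int64_t* groups_buffer{nullptr};
  int64_t* init_vals{nullptr};
  checkCudaErrors(
      cudaMalloc(&groups_buffer, entry_count * row_size_quad * sizeof(int64_t)));
  checkCudaErrors(cudaMalloc(&init_vals, sizeof(init_vals_host)));
  checkCudaErrors(cudaMemcpy(
      init_vals, init_vals_host, sizeof(init_vals_host), cudaMemcpyHostToDevice));

  init_group_by_buffer_on_device(groups_buffer,
                                 init_vals,
                                 entry_count,
                                 key_count,
                                 key_width,
                                 row_size_quad,
                                 /*keyless=*/false,
                                 /*warp_size=*/1,
                                 /*block_size_x=*/256,
                                 /*grid_size_x=*/64);

  checkCudaErrors(cudaFree(init_vals));
  checkCudaErrors(cudaFree(groups_buffer));
}
#endif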
void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer,
                                             const int64_t* init_vals,
                                             const uint32_t groups_buffer_entry_count,
                                             const uint32_t key_count,
                                             const uint32_t agg_col_count,
                                             const int8_t* col_sizes,
                                             const bool need_padding,
                                             const bool keyless,
                                             const int8_t key_size,
                                             const size_t block_size_x,
                                             const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  init_columnar_group_by_buffer_gpu_wrapper<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
      groups_buffer, init_vals, groups_buffer_entry_count, key_count,
      agg_col_count, col_sizes, need_padding, keyless, key_size);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}
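// Illustrative usage sketch (not part of the original file): the columnar
// variant dereferences col_sizes inside the kernel, so the per-column sizes
// must live in device memory too. All names and sizes are example assumptions.
#if 0
void example_init_columnar_buffer() {
  const uint32_t entry_count = 1024;
  const uint32_t key_count = 1;      // one 8-byte key column
  const uint32_t agg_col_count = 2;  // an int64 and an int32 aggregate column
  const int8_t col_sizes_host[2] = {8, 4};
  const int64_t init_vals_host[2] = {0, -1};

  int64_t* groups_buffer{nullptr};
  int64_t* init_vals{nullptr};
  int8_t* col_sizes{nullptr};
  // Upper bound: every column padded to 8 bytes per entry.
  const size_t buffer_bytes =
      entry_count * (key_count + agg_col_count) * sizeof(int64_t);
  checkCudaErrors(cudaMalloc(&groups_buffer, buffer_bytes));
  checkCudaErrors(cudaMalloc(&init_vals, sizeof(init_vals_host)));
  checkCudaErrors(cudaMalloc(&col_sizes, sizeof(col_sizes_host)));
  checkCudaErrors(cudaMemcpy(
      init_vals, init_vals_host, sizeof(init_vals_host), cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(
      col_sizes, col_sizes_host, sizeof(col_sizes_host), cudaMemcpyHostToDevice));

  init_columnar_group_by_buffer_on_device(groups_buffer,
                                          init_vals,
                                          entry_count,
                                          key_count,
                                          agg_col_count,
                                          col_sizes,
                                          /*need_padding=*/true,
                                          /*keyless=*/false,
                                          /*key_size=*/8,
                                          /*block_size_x=*/256,
                                          /*grid_size_x=*/64);

  checkCudaErrors(cudaFree(col_sizes));
  checkCudaErrors(cudaFree(init_vals));
  checkCudaErrors(cudaFree(groups_buffer));
}
#endif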