OmniSciDB  a5dc49c757
GpuInitGroups.cu File Reference
#include <cuda.h>
#include "BufferCompaction.h"
#include "GpuInitGroups.h"
#include "GpuRtConstants.h"
#include "Logger/Logger.h"


Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 

Functions

CUstream getQueryEngineCudaStream ()
 
template<typename T >
__device__ int8_t * init_columnar_buffer (T *buffer_ptr, const T init_val, const uint32_t entry_count, const int32_t start, const int32_t step)
 
__device__ void init_columnar_group_by_buffer_gpu_impl (int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
 
template<typename K >
__device__ void fill_empty_device_key (K *keys_ptr, const uint32_t key_count, const K empty_key)
 
__global__ void init_group_by_buffer_gpu (int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
 
__global__ void init_columnar_group_by_buffer_gpu_wrapper (int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
 
void init_group_by_buffer_on_device (int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
 
void init_columnar_group_by_buffer_on_device (int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size, const size_t block_size_x, const size_t grid_size_x)
 

Macro Definition Documentation

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)

Definition at line 9 of file GpuInitGroups.cu.

Referenced by init_columnar_group_by_buffer_on_device(), and init_group_by_buffer_on_device().

Function Documentation

template<typename K >
__device__ void fill_empty_device_key ( K *  keys_ptr,
const uint32_t  key_count,
const K  empty_key 
)
inline

Definition at line 117 of file GpuInitGroups.cu.

Referenced by init_group_by_buffer_gpu().

template <typename K>
__device__ void fill_empty_device_key(K* keys_ptr,
                                      const uint32_t key_count,
                                      const K empty_key) {
  for (uint32_t i = 0; i < key_count; ++i) {
    keys_ptr[i] = empty_key;
  }
}


CUstream getQueryEngineCudaStream ( )

Definition at line 3 of file QueryEngine.cpp.

Referenced by approximate_distinct_tuples_on_device_range(), cuda_kernel_launch_wrapper(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), init_columnar_group_by_buffer_on_device(), and init_group_by_buffer_on_device().

CUstream getQueryEngineCudaStream() {  // NOTE: CUstream is cudaStream_t
  return QueryEngine::getInstance()->getCudaStream();
}

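Every host-side launcher in this file follows the same pattern: fetch the shared query-engine stream, launch on it, then synchronize. A minimal sketch of that pattern (my_kernel and launch_on_query_engine_stream are hypothetical names, not part of this file):

__global__ void my_kernel(int32_t* out) {
  out[threadIdx.x] = threadIdx.x;
}

void launch_on_query_engine_stream(int32_t* dev_out) {
  // Launch on the shared query-engine stream rather than the default stream.
  auto qe_cuda_stream = getQueryEngineCudaStream();
  my_kernel<<<1, 32, 0, qe_cuda_stream>>>(dev_out);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}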

template<typename T >
__device__ int8_t* init_columnar_buffer ( T *  buffer_ptr,
const T  init_val,
const uint32_t  entry_count,
const int32_t  start,
const int32_t  step 
)

Definition at line 12 of file GpuInitGroups.cu.

template <typename T>
__device__ int8_t* init_columnar_buffer(T* buffer_ptr,
                                        const T init_val,
                                        const uint32_t entry_count,
                                        const int32_t start,
                                        const int32_t step) {
  for (int32_t i = start; i < entry_count; i += step) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}
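The start/step pair implements a grid-stride loop: each thread writes every step-th entry, and the return value points just past the column so that consecutive columns can be chained. A minimal sketch of that chaining (fill_two_columns is a hypothetical kernel, not part of this file):

__global__ void fill_two_columns(int8_t* buf, const uint32_t entry_count) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  // First column: entry_count 32-bit slots, every one set to -1.
  int8_t* next = init_columnar_buffer<int32_t>(
      reinterpret_cast<int32_t*>(buf), -1, entry_count, start, step);
  // Pad to an int64_t boundary before the next column, as the columnar
  // init routine below does.
  next = align_to_int64(next);
  // Second column: entry_count 64-bit slots, zero-initialized.
  init_columnar_buffer<int64_t>(
      reinterpret_cast<int64_t*>(next), int64_t{0}, entry_count, start, step);
}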
__device__ void init_columnar_group_by_buffer_gpu_impl ( int64_t *  groups_buffer,
const int64_t *  init_vals,
const uint32_t  groups_buffer_entry_count,
const uint32_t  key_count,
const uint32_t  agg_col_count,
const int8_t *  col_sizes,
const bool  need_padding,
const bool  keyless,
const int8_t  key_size 
)

Definition at line 23 of file GpuInitGroups.cu.

References align_to_int64(), EMPTY_KEY_16, EMPTY_KEY_32, EMPTY_KEY_64, and EMPTY_KEY_8.

Referenced by init_columnar_group_by_buffer_gpu_wrapper().

__device__ void init_columnar_group_by_buffer_gpu_impl(int64_t* groups_buffer,
                                                       const int64_t* init_vals,
                                                       const uint32_t groups_buffer_entry_count,
                                                       const uint32_t key_count,
                                                       const uint32_t agg_col_count,
                                                       const int8_t* col_sizes,
                                                       const bool need_padding,
                                                       const bool keyless,
                                                       const int8_t key_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  if (!keyless) {
    for (uint32_t i = 0; i < key_count; ++i) {
      switch (key_size) {
        case 1:
          buffer_ptr = init_columnar_buffer<int8_t>(
              buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
          break;
        case 2:
          buffer_ptr =
              init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            EMPTY_KEY_16,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 4:
          buffer_ptr =
              init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            EMPTY_KEY_32,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 8:
          buffer_ptr =
              init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            EMPTY_KEY_64,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        default:
          // FIXME(miyu): CUDA linker doesn't accept assertion on GPU right now.
          break;
      }
      buffer_ptr = align_to_int64(buffer_ptr);
    }
  }
  int32_t init_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      buffer_ptr = align_to_int64(buffer_ptr);
    }
    switch (col_sizes[i]) {
      case 1:
        buffer_ptr = init_columnar_buffer<int8_t>(
            buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 2:
        buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 4:
        buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 8:
        buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 0:
        continue;
      default:
        // FIXME(miyu): CUDA linker doesn't accept assertion on GPU right now.
        break;
    }
  }
  __syncthreads();
}

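The resulting layout is column-major: each of the key_count key columns occupies a contiguous run of groups_buffer_entry_count slots, padded to an int64_t boundary, followed by the aggregate columns in col_sizes order. A hedged sketch of the matching size computation on the host (columnar_buffer_size is a hypothetical helper, not part of this file; it mirrors the kernel's pointer arithmetic):

#include <cstddef>
#include <cstdint>

size_t columnar_buffer_size(const uint32_t entry_count,
                            const uint32_t key_count,
                            const int8_t key_size,
                            const int8_t* col_sizes,
                            const uint32_t agg_col_count,
                            const bool need_padding,
                            const bool keyless) {
  auto align8 = [](size_t off) { return (off + 7) & ~size_t(7); };
  size_t off = 0;
  if (!keyless) {
    for (uint32_t i = 0; i < key_count; ++i) {
      // One key column of entry_count slots, padded to an int64_t boundary.
      off = align8(off + size_t(key_size) * entry_count);
    }
  }
  for (uint32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      off = align8(off);
    }
    // One aggregate column; size-0 columns contribute nothing.
    off += size_t(col_sizes[i]) * entry_count;
  }
  return off;
}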

__global__ void init_columnar_group_by_buffer_gpu_wrapper ( int64_t *  groups_buffer,
const int64_t *  init_vals,
const uint32_t  groups_buffer_entry_count,
const uint32_t  key_count,
const uint32_t  agg_col_count,
const int8_t *  col_sizes,
const bool  need_padding,
const bool  keyless,
const int8_t  key_size 
)

Definition at line 174 of file GpuInitGroups.cu.

References init_columnar_group_by_buffer_gpu_impl().

Referenced by init_columnar_group_by_buffer_on_device().

__global__ void init_columnar_group_by_buffer_gpu_wrapper(int64_t* groups_buffer,
                                                          const int64_t* init_vals,
                                                          const uint32_t groups_buffer_entry_count,
                                                          const uint32_t key_count,
                                                          const uint32_t agg_col_count,
                                                          const int8_t* col_sizes,
                                                          const bool need_padding,
                                                          const bool keyless,
                                                          const int8_t key_size) {
  init_columnar_group_by_buffer_gpu_impl(groups_buffer,
                                         init_vals,
                                         groups_buffer_entry_count,
                                         key_count,
                                         agg_col_count,
                                         col_sizes,
                                         need_padding,
                                         keyless,
                                         key_size);
}


void init_columnar_group_by_buffer_on_device ( int64_t *  groups_buffer,
const int64_t *  init_vals,
const uint32_t  groups_buffer_entry_count,
const uint32_t  key_count,
const uint32_t  agg_col_count,
const int8_t *  col_sizes,
const bool  need_padding,
const bool  keyless,
const int8_t  key_size,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 218 of file GpuInitGroups.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and init_columnar_group_by_buffer_gpu_wrapper().

Referenced by QueryMemoryInitializer::createAndInitializeGroupByBufferGpu().

void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer,
                                             const int64_t* init_vals,
                                             const uint32_t groups_buffer_entry_count,
                                             const uint32_t key_count,
                                             const uint32_t agg_col_count,
                                             const int8_t* col_sizes,
                                             const bool need_padding,
                                             const bool keyless,
                                             const int8_t key_size,
                                             const size_t block_size_x,
                                             const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  init_columnar_group_by_buffer_gpu_wrapper<<<grid_size_x,
                                              block_size_x,
                                              0,
                                              qe_cuda_stream>>>(groups_buffer,
                                                                init_vals,
                                                                groups_buffer_entry_count,
                                                                key_count,
                                                                agg_col_count,
                                                                col_sizes,
                                                                need_padding,
                                                                keyless,
                                                                key_size);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}

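A hedged usage sketch: two 4-byte key columns plus a 4-byte and an 8-byte aggregate column for 1024 entries. The allocation size, launch dimensions, and function name example_columnar_init are illustrative assumptions, and the QueryEngine singleton must already be initialized so that getQueryEngineCudaStream() returns a valid stream. Note that col_sizes and init_vals are read by the kernel, so both must point to device memory.

#include <cstdint>
#include <cuda_runtime.h>
#include "GpuInitGroups.h"

void example_columnar_init() {
  const uint32_t entry_count = 1024;
  const int8_t host_col_sizes[] = {4, 8};   // one 32-bit and one 64-bit aggregate
  const int64_t host_init_vals[] = {0, 0};  // one init value per aggregate column

  // Two 4-byte key columns plus 4-byte and 8-byte aggregate columns; for
  // these sizes and entry_count every column ends on an int64_t boundary.
  const size_t buf_bytes = (2 * 4 + 4 + 8) * size_t(entry_count);

  int64_t* dev_buffer;
  int8_t* dev_col_sizes;
  int64_t* dev_init_vals;
  cudaMalloc(&dev_buffer, buf_bytes);
  cudaMalloc(&dev_col_sizes, sizeof(host_col_sizes));
  cudaMalloc(&dev_init_vals, sizeof(host_init_vals));
  cudaMemcpy(dev_col_sizes, host_col_sizes, sizeof(host_col_sizes), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_init_vals, host_init_vals, sizeof(host_init_vals), cudaMemcpyHostToDevice);

  init_columnar_group_by_buffer_on_device(dev_buffer,
                                          dev_init_vals,
                                          entry_count,
                                          /*key_count=*/2,
                                          /*agg_col_count=*/2,
                                          dev_col_sizes,
                                          /*need_padding=*/true,
                                          /*keyless=*/false,
                                          /*key_size=*/4,
                                          /*block_size_x=*/256,
                                          /*grid_size_x=*/32);

  cudaFree(dev_init_vals);
  cudaFree(dev_col_sizes);
  cudaFree(dev_buffer);
}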

__global__ void init_group_by_buffer_gpu ( int64_t *  groups_buffer,
const int64_t *  init_vals,
const uint32_t  groups_buffer_entry_count,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad,
const bool  keyless,
const int8_t  warp_size 
)

Definition at line 125 of file GpuInitGroups.cu.

References align_to_int64(), EMPTY_KEY_32, EMPTY_KEY_64, and fill_empty_device_key().

__global__ void init_group_by_buffer_gpu(int64_t* groups_buffer,
                                         const int64_t* init_vals,
                                         const uint32_t groups_buffer_entry_count,
                                         const uint32_t key_count,
                                         const uint32_t key_width,
                                         const uint32_t row_size_quad,
                                         const bool keyless,
                                         const int8_t warp_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  if (keyless) {
    for (int32_t i = start;
         i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
         i += step) {
      groups_buffer[i] = init_vals[i % row_size_quad];
    }
    __syncthreads();
    return;
  }

  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* keys_ptr = groups_buffer + i * row_size_quad;
    switch (key_width) {
      case 4:
        fill_empty_device_key(
            reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
        break;
      case 8:
        fill_empty_device_key(
            reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
        break;
      default:
        break;
    }
  }

  const uint32_t values_off_quad =
      align_to_int64(key_count * key_width) / sizeof(int64_t);
  for (uint32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
    const uint32_t val_count =
        row_size_quad - values_off_quad;  // value slots are always 64-bit
    for (uint32_t j = 0; j < val_count; ++j) {
      vals_ptr[j] = init_vals[j];
    }
  }
  __syncthreads();
}

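Each entry in this row-wise layout occupies row_size_quad 64-bit slots: the keys come first, padded to an int64_t boundary, followed by the 64-bit value slots. A worked example under assumed parameters (key_count = 2, key_width = 4, three value slots):

// values_off_quad = align_to_int64(key_count * key_width) / sizeof(int64_t)
//                 = align_to_int64(2 * 4) / 8 = 8 / 8 = 1
// row_size_quad   = values_off_quad + 3 value slots = 4
//
// Entry i therefore starts at groups_buffer + i * 4:
//   slot 0:     both 32-bit keys, filled with EMPTY_KEY_32
//   slots 1-3:  value slots, filled with init_vals[0..2]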

void init_group_by_buffer_on_device ( int64_t *  groups_buffer,
const int64_t *  init_vals,
const uint32_t  groups_buffer_entry_count,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad,
const bool  keyless,
const int8_t  warp_size,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 195 of file GpuInitGroups.cu.

References checkCudaErrors, and getQueryEngineCudaStream().

Referenced by QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(), and QueryMemoryInitializer::prepareTopNHeapsDevBuffer().

void init_group_by_buffer_on_device(int64_t* groups_buffer,
                                    const int64_t* init_vals,
                                    const uint32_t groups_buffer_entry_count,
                                    const uint32_t key_count,
                                    const uint32_t key_width,
                                    const uint32_t row_size_quad,
                                    const bool keyless,
                                    const int8_t warp_size,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  init_group_by_buffer_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
      groups_buffer,
      init_vals,
      groups_buffer_entry_count,
      key_count,
      key_width,
      row_size_quad,
      keyless,
      warp_size);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}

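A hedged usage sketch matching the worked row layout above (the allocation, launch dimensions, and function name example_row_wise_init are illustrative assumptions; the QueryEngine singleton must already be initialized so that getQueryEngineCudaStream() returns a valid stream):

#include <cstdint>
#include <cuda_runtime.h>
#include "GpuInitGroups.h"

void example_row_wise_init() {
  const uint32_t entry_count = 1024;
  const uint32_t row_size_quad = 4;  // 1 key quad + 3 value slots per entry
  const int64_t host_init_vals[] = {0, 0, 0};

  int64_t* dev_buffer;
  int64_t* dev_init_vals;
  cudaMalloc(&dev_buffer, entry_count * row_size_quad * sizeof(int64_t));
  cudaMalloc(&dev_init_vals, sizeof(host_init_vals));
  cudaMemcpy(dev_init_vals, host_init_vals, sizeof(host_init_vals),
             cudaMemcpyHostToDevice);

  // Two 4-byte keys per entry; warp_size only matters for keyless buffers.
  init_group_by_buffer_on_device(dev_buffer,
                                 dev_init_vals,
                                 entry_count,
                                 /*key_count=*/2,
                                 /*key_width=*/4,
                                 row_size_quad,
                                 /*keyless=*/false,
                                 /*warp_size=*/32,
                                 /*block_size_x=*/256,
                                 /*grid_size_x=*/32);

  cudaFree(dev_init_vals);
  cudaFree(dev_buffer);
}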