OmniSciDB  a5dc49c757
HashJoinRuntimeGpu.cu
/*
 * Copyright 2022 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "HashJoinRuntime.cpp"

#include <cuda.h>

#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
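
// Generic kernel launch helper: queries an occupancy-derived grid/block size
// for `func`, launches it on the shared query-engine stream, and synchronizes
// that stream before returning so callers observe completed work.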
template <typename F, typename... ARGS>
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int grid_size = -1;
  int block_size = -1;
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
  auto qe_cuda_stream = getQueryEngineCudaStream();
  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}
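
// One-to-one (perfect hash) fill wrappers. Each kernel funnels the first
// non-zero error code from its device fill function into args.dev_err_buff via
// atomicCAS, so the host sees at most one recorded error. The trailing -1
// arguments stand in for the CPU-only thread index/count parameters.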
__global__ void fill_hash_join_buff_wrapper(
    OneToOnePerfectJoinHashTableFillFuncArgs const args) {
  // Bitwise-equality (bw_eq) joins take the fill variant that also matches NULLs.
  auto fill_hash_join_buff_func = args.type_info.uses_bw_eq
                                      ? SUFFIX(fill_hash_join_buff_bitwise_eq)
                                      : SUFFIX(fill_hash_join_buff);
  int partial_err = fill_hash_join_buff_func(args, -1, -1);
  atomicCAS(args.dev_err_buff, 0, partial_err);
}

__global__ void fill_hash_join_buff_bucketized_wrapper(
    OneToOnePerfectJoinHashTableFillFuncArgs const args) {
  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(args, -1, -1);
  atomicCAS(args.dev_err_buff, 0, partial_err);
}

void fill_hash_join_buff_on_device_bucketized(
    OneToOnePerfectJoinHashTableFillFuncArgs const args) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper, args);
}

void fill_hash_join_buff_on_device(OneToOnePerfectJoinHashTableFillFuncArgs const args) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper, args);
}

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    OneToOnePerfectJoinHashTableFillFuncArgs const args,
    ShardInfo const shard_info) {
  int partial_err =
      SUFFIX(fill_hash_join_buff_sharded_bucketized)(args.buff,
                                                     args.invalid_slot_val,
                                                     args.for_semi_join,
                                                     args.join_column,
                                                     args.type_info,
                                                     shard_info,
                                                     NULL,
                                                     NULL,
                                                     -1,
                                                     -1,
                                                     args.bucket_normalization);
  atomicCAS(args.dev_err_buff, 0, partial_err);
}

__global__ void fill_hash_join_buff_wrapper_sharded(
    OneToOnePerfectJoinHashTableFillFuncArgs const args,
    ShardInfo const shard_info) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(args.buff,
                                                        args.invalid_slot_val,
                                                        args.for_semi_join,
                                                        args.join_column,
                                                        args.type_info,
                                                        shard_info,
                                                        NULL,
                                                        NULL,
                                                        -1,
                                                        -1);
  atomicCAS(args.dev_err_buff, 0, partial_err);
}

void fill_hash_join_buff_on_device_sharded_bucketized(
    OneToOnePerfectJoinHashTableFillFuncArgs const args,
    ShardInfo const shard_info) {
  cuda_kernel_launch_wrapper(
      fill_hash_join_buff_wrapper_sharded_bucketized, args, shard_info);
}

void fill_hash_join_buff_on_device_sharded(
    OneToOnePerfectJoinHashTableFillFuncArgs const args,
    ShardInfo const shard_info) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded, args, shard_info);
}

__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}

void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}
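
// Helpers for the one-to-many builders below. set_valid_pos_flag marks each
// hash slot that has at least one match; after the match counts have been
// inclusive-scanned in place, set_valid_pos rewrites each marked slot with its
// group's start offset (the scanned count of the preceding slot).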
#define VALID_POS_FLAG 0

__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (count_buff[i]) {
      pos_buff[i] = VALID_POS_FLAG;
    }
  }
}

__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (VALID_POS_FLAG == pos_buff[i]) {
      pos_buff[i] = !i ? 0 : count_buff[i - 1];
    }
  }
}
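
// Shared driver for the one-to-many builders. The table buffer is laid out as
// [pos | count | row ids], with the pos and count sections each holding
// hash_entry_count int32 slots and the row-id section following them.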
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
void fill_one_to_many_hash_table_on_device_impl(int32_t* buff,
                                                const int64_t hash_entry_count,
                                                const JoinColumn& join_column,
                                                const JoinColumnTypeInfo& type_info,
                                                COUNT_MATCHES_FUNCTOR count_matches_func,
                                                FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  // Pass 1: zero the count section, then count matches per hash slot.
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  count_matches_func();

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  // Scan the counts so each occupied slot can be assigned its start offset.
  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);

  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  // Pass 2: zero the counts again; fill_row_ids_func reuses them as write
  // cursors while scattering row ids.
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  fill_row_ids_func();
}

void fill_one_to_many_hash_table_on_device(
    OneToManyPerfectJoinHashTableFillFuncArgs const args) {
  auto buff = args.buff;
  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
    cuda_kernel_launch_wrapper(
        SUFFIX(count_matches), count_buff, args.join_column, args.type_info);
  };
  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
                               buff,
                               hash_entry_count,
                               args.join_column,
                               args.type_info,
                               args.for_window_framing);
  };
  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             args.join_column,
                                             args.type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_bucketized(
    OneToManyPerfectJoinHashTableFillFuncArgs const args) {
  auto hash_entry_count = args.hash_entry_info.getNormalizedHashEntryCount();
  auto const buff = args.buff;
  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                               count_buff,
                               args.join_column,
                               args.type_info,
                               args.bucket_normalization);
  };
  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                               buff,
                               hash_entry_count,
                               args.join_column,
                               args.type_info,
                               args.bucket_normalization);
  };
  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             args.join_column,
                                             args.type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_sharded(
    OneToManyPerfectJoinHashTableFillFuncArgs const args,
    ShardInfo const shard_info) {
  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
  int32_t* pos_buff = args.buff;
  int32_t* count_buff = args.buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
                             count_buff,
                             args.join_column,
                             args.type_info,
                             shard_info);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                             args.buff,
                             hash_entry_count,
                             args.join_column,
                             args.type_info,
                             shard_info);
}
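
// Baseline (composite-key) one-to-many table construction: the same
// count/scan/fill scheme, but slots are resolved through the composite key
// dictionary. T is the key component width (int32_t or int64_t) and
// KEY_HANDLER supplies the per-row keys.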
template <typename T, typename KEY_HANDLER>
void fill_one_to_many_baseline_hash_table_on_device(int32_t* buff,
                                                    const T* composite_key_dict,
                                                    const int64_t hash_entry_count,
                                                    const KEY_HANDLER* key_handler,
                                                    const size_t num_elems,
                                                    const bool for_window_framing) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                             count_buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));

  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                             buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems,
                             for_window_framing);
}
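
// Baseline table initialization; T selects the key slot width. The trailing
// -1s again stand in for the CPU-only thread index/count parameters.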
template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const int64_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}

void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}

void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
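
// Baseline fill wrapper: as with the perfect-hash wrappers above, the first
// recorded non-zero error wins via atomicCAS on the device error buffer.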
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const bool for_semi_join,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* err,
                                                     const KEY_HANDLER* key_handler,
                                                     const int64_t num_elems) {
  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                            entry_count,
                                                            invalid_slot_val,
                                                            for_semi_join,
                                                            key_component_count,
                                                            with_val_slot,
                                                            key_handler,
                                                            num_elems,
                                                            -1,
                                                            -1);
  atomicCAS(err, 0, partial_err);
}

void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const bool for_semi_join,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      for_semi_join,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const bool for_semi_join,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      for_semi_join,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
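
// The bounding-box-intersect and range variants below reuse the same wrapper
// with for_semi_join hard-coded to false.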

void bbox_intersect_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const BoundingBoxIntersectKeyHandler* key_handler,
    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long,
                                           BoundingBoxIntersectKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      false,  // for_semi_join
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void range_fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* dev_err_buff,
                                                     const RangeKeyHandler* key_handler,
                                                     const size_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      false,  // for_semi_join
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const int64_t hash_entry_count,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const bool for_window_framing) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          key_handler,
                                                          num_elems,
                                                          for_window_framing);
}

void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const bool for_window_framing) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          key_handler,
                                                          num_elems,
                                                          for_window_framing);
}

void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const BoundingBoxIntersectKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
}

void range_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const size_t hash_entry_count,
    const RangeKeyHandler* key_handler,
    const size_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
}
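
// Approximate distinct-count helpers: the kernels update a HyperLogLog
// register buffer (b selects the register-index width) and, when a
// row_counts_buffer is supplied, per-row match counts that are then
// inclusive-scanned on the device.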

void approximate_distinct_tuples_on_device_bbox_intersect(
    uint8_t* hll_buffer,
    const uint32_t b,
    int32_t* row_counts_buffer,
    const BoundingBoxIntersectKeyHandler* key_handler,
    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      approximate_distinct_tuples_impl_gpu<BoundingBoxIntersectKeyHandler>,
      hll_buffer,
      row_counts_buffer,
      b,
      num_elems,
      key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}

void approximate_distinct_tuples_on_device_range(uint8_t* hll_buffer,
                                                 const uint32_t b,
                                                 int32_t* row_counts_buffer,
                                                 const RangeKeyHandler* key_handler,
                                                 const size_t num_elems,
                                                 const size_t block_size_x,
                                                 const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
      hll_buffer, row_counts_buffer, b, num_elems, key_handler);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}

void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}

void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double* bucket_sz_threshold) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold);
}
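
The one-to-many builders above all share the count/scan/fill construction. The
following self-contained sketch shows the same data flow in plain CUDA + Thrust.
It is illustrative only: every name in it is local to the sketch, and it uses
thrust::exclusive_scan directly where the file derives start offsets from an
inclusive scan plus the set_valid_pos kernels.

#include <cstdio>

#include <thrust/device_vector.h>
#include <thrust/scan.h>

__global__ void count_keys(const int* keys, const int num_rows, int* counts) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < num_rows) {
    atomicAdd(&counts[keys[i]], 1);  // pass 1: per-key match counts
  }
}

__global__ void scatter_ids(const int* keys,
                            const int num_rows,
                            int* cursors,
                            int* ids) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < num_rows) {
    ids[atomicAdd(&cursors[keys[i]], 1)] = i;  // pass 2: scatter row ids
  }
}

int main() {
  const int num_rows = 8;
  const int entry_count = 4;  // keys must fall in [0, entry_count)
  const int host_keys[num_rows] = {1, 3, 1, 0, 3, 3, 2, 1};
  thrust::device_vector<int> keys(host_keys, host_keys + num_rows);
  thrust::device_vector<int> counts(entry_count, 0);
  count_keys<<<1, num_rows>>>(thrust::raw_pointer_cast(keys.data()),
                              num_rows,
                              thrust::raw_pointer_cast(counts.data()));
  // Turn counts into start offsets; keep one pristine copy and let the
  // scatter pass advance the other, mirroring the zeroed count section the
  // fill_row_ids kernels reuse as a write cursor.
  thrust::device_vector<int> offsets(entry_count);
  thrust::exclusive_scan(counts.begin(), counts.end(), offsets.begin());
  thrust::device_vector<int> cursors = offsets;
  thrust::device_vector<int> ids(num_rows);
  scatter_ids<<<1, num_rows>>>(thrust::raw_pointer_cast(keys.data()),
                               num_rows,
                               thrust::raw_pointer_cast(cursors.data()),
                               thrust::raw_pointer_cast(ids.data()));
  cudaDeviceSynchronize();
  for (int k = 0; k < entry_count; ++k) {
    printf("key %d: %d row(s) starting at ids[%d]\n",
           k,
           static_cast<int>(counts[k]),
           static_cast<int>(offsets[k]));
  }
  return 0;
}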