22 #include <thrust/device_ptr.h>
23 #include <thrust/scan.h>
25 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
27 template <
typename F,
typename... ARGS>
31 checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
33 func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(
args)...);
42 int partial_err = fill_hash_join_buff_func(args, -1, -1);
109 const int64_t hash_entry_count,
110 const int32_t invalid_slot_val) {
115 const int64_t hash_entry_count,
116 const int32_t invalid_slot_val) {
121 #define VALID_POS_FLAG 0
124 const int32_t* count_buff,
125 const int64_t entry_count) {
126 const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
127 const int32_t step = blockDim.x * gridDim.x;
128 for (int64_t i = start; i < entry_count; i += step) {
137 const int64_t entry_count) {
138 const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
139 const int32_t step = blockDim.x * gridDim.x;
140 for (int64_t i = start; i < entry_count; i += step) {
142 pos_buff[i] = !i ? 0 : count_buff[i - 1];
147 template <
typename COUNT_MATCHES_FUNCTOR,
typename FILL_ROW_IDS_FUNCTOR>
149 const int64_t hash_entry_count,
152 COUNT_MATCHES_FUNCTOR count_matches_func,
153 FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
154 int32_t* pos_buff = buff;
155 int32_t* count_buff = buff + hash_entry_count;
158 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
160 count_matches_func();
164 auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
166 count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
170 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
177 auto buff = args.
buff;
179 auto count_matches_func = [count_buff = buff + hash_entry_count, &
args] {
183 auto fill_row_ids_func = [buff, hash_entry_count, &
args] {
202 auto const buff = args.
buff;
203 auto count_matches_func = [count_buff = buff + hash_entry_count, &
args] {
210 auto fill_row_ids_func = [buff, hash_entry_count, &
args] {
230 int32_t* pos_buff = args.
buff;
231 int32_t* count_buff = args.
buff + hash_entry_count;
234 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
244 auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
246 count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
249 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
259 template <
typename T,
typename KEY_HANDLER>
261 const T* composite_key_dict,
262 const int64_t hash_entry_count,
263 const KEY_HANDLER* key_handler,
264 const size_t num_elems,
265 const bool for_window_framing) {
266 auto pos_buff = buff;
267 auto count_buff = buff + hash_entry_count;
270 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
281 auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
283 count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
286 cudaMemsetAsync(count_buff, 0, hash_entry_count *
sizeof(int32_t), qe_cuda_stream));
298 template <
typename T>
300 const int64_t entry_count,
301 const size_t key_component_count,
302 const bool with_val_slot,
303 const int32_t invalid_slot_val) {
314 const int64_t entry_count,
315 const size_t key_component_count,
316 const bool with_val_slot,
317 const int32_t invalid_slot_val) {
327 const int64_t entry_count,
328 const size_t key_component_count,
329 const bool with_val_slot,
330 const int32_t invalid_slot_val) {
339 template <
typename T,
typename KEY_HANDLER>
341 const int64_t entry_count,
342 const int32_t invalid_slot_val,
343 const bool for_semi_join,
344 const size_t key_component_count,
345 const bool with_val_slot,
347 const KEY_HANDLER* key_handler,
348 const int64_t num_elems) {
359 atomicCAS(err, 0, partial_err);
363 const int64_t entry_count,
364 const int32_t invalid_slot_val,
365 const bool for_semi_join,
366 const size_t key_component_count,
367 const bool with_val_slot,
370 const int64_t num_elems) {
372 fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
385 const int64_t entry_count,
386 const int32_t invalid_slot_val,
387 const bool for_semi_join,
388 const size_t key_component_count,
389 const bool with_val_slot,
392 const int64_t num_elems) {
394 fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
408 const int64_t entry_count,
409 const int32_t invalid_slot_val,
410 const size_t key_component_count,
411 const bool with_val_slot,
414 const int64_t num_elems) {
430 const int64_t entry_count,
431 const int32_t invalid_slot_val,
432 const size_t key_component_count,
433 const bool with_val_slot,
436 const size_t num_elems) {
438 fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
452 const int32_t* composite_key_dict,
453 const int64_t hash_entry_count,
454 const size_t key_component_count,
456 const int64_t num_elems,
457 const bool for_window_framing) {
458 fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
468 const int64_t* composite_key_dict,
469 const int64_t hash_entry_count,
471 const int64_t num_elems,
472 const bool for_window_framing) {
473 fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
483 const int64_t* composite_key_dict,
484 const int64_t hash_entry_count,
486 const int64_t num_elems) {
487 fill_one_to_many_baseline_hash_table_on_device<int64_t>(
488 buff, composite_key_dict, hash_entry_count, key_handler, num_elems,
false);
493 const int64_t* composite_key_dict,
494 const size_t hash_entry_count,
496 const size_t num_elems) {
497 fill_one_to_many_baseline_hash_table_on_device<int64_t>(
498 buff, composite_key_dict, hash_entry_count, key_handler, num_elems,
false);
504 int32_t* row_counts_buffer,
506 const int64_t num_elems) {
508 approximate_distinct_tuples_impl_gpu<BoundingBoxIntersectKeyHandler>,
515 auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
517 row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
522 int32_t* row_counts_buffer,
524 const size_t num_elems,
525 const size_t block_size_x,
526 const size_t grid_size_x) {
528 approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
529 hll_buffer, row_counts_buffer, b, num_elems, key_handler);
532 auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
534 row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
540 const int64_t num_elems) {
552 const double* bucket_sz_threshold) {
557 bucket_sz_threshold);
const bool for_window_framing
void fill_hash_join_buff_on_device_sharded(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
const JoinColumnTypeInfo type_info
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
const int64_t bucket_normalization
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const SIZE *composite_key_dict, const size_t hash_entry_count, const size_t key_component_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)
void fill_one_to_many_hash_table_on_device(OneToManyPerfectJoinHashTableFillFuncArgs const args)
void fill_hash_join_buff_on_device(OneToOnePerfectJoinHashTableFillFuncArgs const args)
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
DEVICE int SUFFIX() fill_hash_join_buff_bitwise_eq(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
const int64_t bucket_normalization
void range_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
const JoinColumn join_column
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void fill_hash_join_buff_bucketized_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)
DEVICE int SUFFIX() fill_hash_join_buff(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void approximate_distinct_tuples_on_device_range(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
const JoinColumn join_column
const BucketizedHashEntryInfo hash_entry_info
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
CUstream getQueryEngineCudaStream()
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
void init_hash_join_buff_on_device(int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
void approximate_distinct_tuples_on_device_bbox_intersect(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void range_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
size_t getNormalizedHashEntryCount() const
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_hash_join_buff_on_device_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
void bbox_intersect_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define checkCudaErrors(err)
const int32_t invalid_slot_val
const JoinColumnTypeInfo type_info
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void fill_hash_join_buff_wrapper_sharded(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
void fill_one_to_many_hash_table_on_device_sharded(OneToManyPerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
void fill_hash_join_buff_on_device_sharded_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
size_t bucketized_hash_entry_count
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
__global__ void fill_hash_join_buff_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)
void fill_one_to_many_hash_table_on_device_bucketized(OneToManyPerfectJoinHashTableFillFuncArgs const args)