#include "HashJoinRuntime.cpp"
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

Include dependency graph for HashJoinRuntimeGpu.cu:

Macros
#define	checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)

#define	VALID_POS_FLAG 0

Functions
CUstream	getQueryEngineCudaStream ()

template<typename F , typename... ARGS>
void	cuda_kernel_launch_wrapper (F func, ARGS &&...args)

__global__ void	fill_hash_join_buff_wrapper (OneToOnePerfectJoinHashTableFillFuncArgs const args)

__global__ void	fill_hash_join_buff_bucketized_wrapper (OneToOnePerfectJoinHashTableFillFuncArgs const args)

void	fill_hash_join_buff_on_device_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args)

void	fill_hash_join_buff_on_device (OneToOnePerfectJoinHashTableFillFuncArgs const args)

__global__ void	fill_hash_join_buff_wrapper_sharded_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

__global__ void	fill_hash_join_buff_wrapper_sharded (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

void	fill_hash_join_buff_on_device_sharded_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

void	fill_hash_join_buff_on_device_sharded (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

__global__ void	init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

void	init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

__global__ void	set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)

__global__ void	set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void	fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)

void	fill_one_to_many_hash_table_on_device (OneToManyPerfectJoinHashTableFillFuncArgs const args)

void	fill_one_to_many_hash_table_on_device_bucketized (OneToManyPerfectJoinHashTableFillFuncArgs const args)

void	fill_one_to_many_hash_table_on_device_sharded (OneToManyPerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

template<typename T , typename KEY_HANDLER >
void	fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)

template<typename T >
__global__ void	init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

void	init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

void	init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

template<typename T , typename KEY_HANDLER >
__global__ void	fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)

void	fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)

void	fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)

void	bbox_intersect_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)

void	range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)

void	fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)

void	fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)

void	bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)

void	range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)

void	approximate_distinct_tuples_on_device_bbox_intersect (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)

void	approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)

void	approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)

void	compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)

Macro Definition Documentation

#define checkCudaErrors ( err ) CHECK_EQ(err, cudaSuccess)

Definition at line 25 of file HashJoinRuntimeGpu.cu.

#define VALID_POS_FLAG 0

Definition at line 121 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device	(	uint8_t *	hll_buffer,
		const uint32_t	b,
		const GenericKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 537 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

                                                                     {
   // Launches the GenericKeyHandler instantiation of the distinct-tuple kernel through
   // cuda_kernel_launch_wrapper, which also waits for the stream before returning.
   // nullptr is passed in the slot where the bbox-intersect and range variants pass
   // their row_counts_buffer.
   cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                              hll_buffer,
                              nullptr,
                              b,
                              num_elems,
                              key_handler);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_bbox_intersect	(	uint8_t *	hll_buffer,
		const uint32_t	b,
		int32_t *	row_counts_buffer,
		const BoundingBoxIntersectKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 501 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by BoundingBoxIntersectJoinHashTable::approximateTupleCount().

                              {
   // Step 1: run the BoundingBoxIntersectKeyHandler instantiation of the kernel.
   // cuda_kernel_launch_wrapper synchronizes the stream, so the kernel has finished
   // before the scan below touches row_counts_buffer.
   cuda_kernel_launch_wrapper(
       approximate_distinct_tuples_impl_gpu<BoundingBoxIntersectKeyHandler>,
       hll_buffer,
       row_counts_buffer,
       b,
       num_elems,
       key_handler);
 
   // Step 2: in-place inclusive prefix sum over the first num_elems entries of
   // row_counts_buffer (input and output ranges are the same device pointer).
   auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
   thrust::inclusive_scan(
       row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range	(	uint8_t *	hll_buffer,
		const uint32_t	b,
		int32_t *	row_counts_buffer,
		const RangeKeyHandler *	key_handler,
		const size_t	num_elems,
		const size_t	block_size_x,
		const size_t	grid_size_x
	)

Definition at line 520 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

                                                                            {
   auto qe_cuda_stream = getQueryEngineCudaStream();
   approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
       hll_buffer, row_counts_buffer, b, num_elems, key_handler);
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
 
   auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
   thrust::inclusive_scan(
       row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void bbox_intersect_fill_baseline_hash_join_buff_on_device_64	(	int8_t *	hash_buff,
		const int64_t	entry_count,
		const int32_t	invalid_slot_val,
		const size_t	key_component_count,
		const bool	with_val_slot,
		int *	dev_err_buff,
		const BoundingBoxIntersectKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 406 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_baseline_hash_join_buff_wrapper().

                              {
   // Launches the <unsigned long long, BoundingBoxIntersectKeyHandler> instantiation of
   // the baseline fill kernel. This entry point has no for_semi_join parameter, so the
   // kernel's for_semi_join argument is hard-wired to false.
   cuda_kernel_launch_wrapper(
       fill_baseline_hash_join_buff_wrapper<unsigned long long,
                                            BoundingBoxIntersectKeyHandler>,
       hash_buff,
       entry_count,
       invalid_slot_val,
       false,
       key_component_count,
       with_val_slot,
       dev_err_buff,
       key_handler,
       num_elems);
 }

Here is the call graph for this function:

void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64	(	int32_t *	buff,
		const int64_t *	composite_key_dict,
		const int64_t	hash_entry_count,
		const BoundingBoxIntersectKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 481 of file HashJoinRuntimeGpu.cu.

                              {
   // Delegates to the shared one-to-many baseline builder with 64-bit keys
   // (T = int64_t; KEY_HANDLER is deduced from key_handler). The trailing false is the
   // for_window_framing argument, which this entry point does not expose.
   fill_one_to_many_baseline_hash_table_on_device<int64_t>(
       buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
 }

void compute_bucket_sizes_on_device	(	double *	bucket_sizes_buffer,
		const JoinColumn *	join_column,
		const JoinColumnTypeInfo *	type_info,
		const double *	bucket_sz_threshold
	)

Definition at line 549 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes().

                                                                        {
   // Launches compute_bucket_sizes_impl_gpu<2> with all four pointers forwarded
   // unchanged; the wrapper waits for the stream before returning.
   // NOTE(review): the template argument 2 is presumably the number of dimensions
   // handled per element - confirm against compute_bucket_sizes_impl_gpu.
   cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                              bucket_sizes_buffer,
                              join_column,
                              type_info,
                              bucket_sz_threshold);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename F , typename... ARGS>

void cuda_kernel_launch_wrapper	(	F	func,
		ARGS &&...	args
	)

Definition at line 28 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, checkCudaErrors, and getQueryEngineCudaStream().

                                                         {
   int grid_size = -1;
   int block_size = -1;
   checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
   auto qe_cuda_stream = getQueryEngineCudaStream();
   func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_32	(	int8_t *	hash_buff,
		const int64_t	entry_count,
		const int32_t	invalid_slot_val,
		const bool	for_semi_join,
		const size_t	key_component_count,
		const bool	with_val_slot,
		int *	dev_err_buff,
		const GenericKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 362 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

                                                                         {
   // Launches the <int32_t, GenericKeyHandler> instantiation of the baseline fill
   // kernel, forwarding every argument unchanged. The wrapper waits for the stream,
   // so dev_err_buff has been written by the time this returns.
   cuda_kernel_launch_wrapper(
       fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
       hash_buff,
       entry_count,
       invalid_slot_val,
       for_semi_join,
       key_component_count,
       with_val_slot,
       dev_err_buff,
       key_handler,
       num_elems);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64	(	int8_t *	hash_buff,
		const int64_t	entry_count,
		const int32_t	invalid_slot_val,
		const bool	for_semi_join,
		const size_t	key_component_count,
		const bool	with_val_slot,
		int *	dev_err_buff,
		const GenericKeyHandler *	key_handler,
		const int64_t	num_elems
	)

Definition at line 384 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

                                                                         {
   // 64-bit counterpart of the _32 entry point: launches the
   // <unsigned long long, GenericKeyHandler> instantiation of the baseline fill kernel
   // with every argument forwarded unchanged.
   cuda_kernel_launch_wrapper(
       fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
       hash_buff,
       entry_count,
       invalid_slot_val,
       for_semi_join,
       key_component_count,
       with_val_slot,
       dev_err_buff,
       key_handler,
       num_elems);
 }

Here is the call graph for this function:

template<typename T , typename KEY_HANDLER >

__global__ void fill_baseline_hash_join_buff_wrapper	(	int8_t *	hash_buff,
		const int64_t	entry_count,
		const int32_t	invalid_slot_val,
		const bool	for_semi_join,
		const size_t	key_component_count,
		const bool	with_val_slot,
		int *	err,
		const KEY_HANDLER *	key_handler,
		const int64_t	num_elems
	)

Definition at line 340 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

Referenced by bbox_intersect_fill_baseline_hash_join_buff_on_device_64().

                                                                               {
   // Kernel entry point: each thread runs the device-side fill routine and keeps its
   // own status code in partial_err.
   // NOTE(review): the two trailing -1 arguments are presumably the CPU thread
   // index/count slots, unused on the GPU path - confirm against the callee.
   int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                             entry_count,
                                                             invalid_slot_val,
                                                             for_semi_join,
                                                             key_component_count,
                                                             with_val_slot,
                                                             key_handler,
                                                             num_elems,
                                                             -1,
                                                             -1);
   // atomicCAS only stores while *err is still 0, so once a non-zero code has been
   // recorded, later threads cannot overwrite it.
   atomicCAS(err, 0, partial_err);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

__global__ void fill_hash_join_buff_bucketized_wrapper ( OneToOnePerfectJoinHashTableFillFuncArgs const args )

Definition at line 46 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

                                                          {
   // Kernel entry point for the bucketized one-to-one fill. Each thread runs the
   // device routine (the -1, -1 arguments are defined by the callee, not shown here)
   // and publishes its status into args.dev_err_buff.
   int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(args, -1, -1);
   // Stores only while the error slot is still 0, so a recorded non-zero code sticks.
   atomicCAS(args.dev_err_buff, 0, partial_err);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( OneToOnePerfectJoinHashTableFillFuncArgs const args )

Definition at line 57 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

                                                                                         {
   // Host-side launcher: runs fill_hash_join_buff_wrapper with args passed by value and
   // waits for completion (cuda_kernel_launch_wrapper synchronizes the stream).
   cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper, args);
 }

Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const args )

Definition at line 52 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

                                                          {
   // Host-side launcher for the bucketized one-to-one fill kernel; returns after the
   // stream has been synchronized by cuda_kernel_launch_wrapper.
   cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper, args);
 }

Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded	(	OneToOnePerfectJoinHashTableFillFuncArgs const	args,
		ShardInfo const	shard_info
	)

Definition at line 102 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

                                 {
   // Host-side launcher for the sharded one-to-one fill kernel; args and shard_info
   // are both handed to the kernel by value.
   cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded, args, shard_info);
 }

Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized	(	OneToOnePerfectJoinHashTableFillFuncArgs const	args,
		ShardInfo const	shard_info
	)

Definition at line 95 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

                                 {
   // Host-side launcher for the sharded + bucketized one-to-one fill kernel; args and
   // shard_info are both handed to the kernel by value.
   cuda_kernel_launch_wrapper(
       fill_hash_join_buff_wrapper_sharded_bucketized, args, shard_info);
 }

Here is the call graph for this function:

__global__ void fill_hash_join_buff_wrapper ( OneToOnePerfectJoinHashTableFillFuncArgs const args )

Definition at line 37 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff(), fill_hash_join_buff_bitwise_eq(), SUFFIX, OneToOnePerfectJoinHashTableFillFuncArgs::type_info, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_on_device().

                                                          {
   // Kernel entry point for the one-to-one fill. Picks the bitwise-equality variant of
   // the device routine when args.type_info.uses_bw_eq is set, the regular one
   // otherwise; both are called with the same (args, -1, -1) arguments.
   auto fill_hash_join_buff_func = args.type_info.uses_bw_eq
                                       ? SUFFIX(fill_hash_join_buff_bitwise_eq)
                                       : SUFFIX(fill_hash_join_buff);
   int partial_err = fill_hash_join_buff_func(args, -1, -1);
   // Stores only while the error slot is still 0, so a recorded non-zero code sticks.
   atomicCAS(args.dev_err_buff, 0, partial_err);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded	(	OneToOnePerfectJoinHashTableFillFuncArgs const	args,
		ShardInfo const	shard_info
	)

Definition at line 79 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::buff, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_sharded(), OneToOnePerfectJoinHashTableFillFuncArgs::for_semi_join, OneToOnePerfectJoinHashTableFillFuncArgs::invalid_slot_val, OneToOnePerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToOnePerfectJoinHashTableFillFuncArgs::type_info.

Referenced by fill_hash_join_buff_on_device_sharded().

                                 {
   // Kernel entry point for the sharded one-to-one fill: unpacks the fields of args
   // that the device routine takes individually and adds shard_info.
   // NOTE(review): the two NULL and two -1 arguments fill parameters whose meaning is
   // defined by the callee and is not visible here; keep them as written unless the
   // callee's parameter types are checked.
   int partial_err = SUFFIX(fill_hash_join_buff_sharded)(args.buff,
                                                         args.invalid_slot_val,
                                                         args.for_semi_join,
                                                         args.join_column,
                                                         args.type_info,
                                                         shard_info,
                                                         NULL,
                                                         NULL,
                                                         -1,
                                                         -1);
   // Stores only while the error slot is still 0, so a recorded non-zero code sticks.
   atomicCAS(args.dev_err_buff, 0, partial_err);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized	(	OneToOnePerfectJoinHashTableFillFuncArgs const	args,
		ShardInfo const	shard_info
	)

Definition at line 61 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::bucket_normalization, OneToOnePerfectJoinHashTableFillFuncArgs::buff, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_sharded_bucketized(), OneToOnePerfectJoinHashTableFillFuncArgs::for_semi_join, OneToOnePerfectJoinHashTableFillFuncArgs::invalid_slot_val, OneToOnePerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToOnePerfectJoinHashTableFillFuncArgs::type_info.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

                                 {
   // Kernel entry point for the sharded + bucketized one-to-one fill. Same argument
   // list as the plain sharded wrapper, with args.bucket_normalization appended.
   // NOTE(review): the two NULL and two -1 arguments fill parameters whose meaning is
   // defined by the callee and is not visible here.
   int partial_err =
       SUFFIX(fill_hash_join_buff_sharded_bucketized)(args.buff,
                                                      args.invalid_slot_val,
                                                      args.for_semi_join,
                                                      args.join_column,
                                                      args.type_info,
                                                      shard_info,
                                                      NULL,
                                                      NULL,
                                                      -1,
                                                      -1,
                                                      args.bucket_normalization);
   // Stores only while the error slot is still 0, so a recorded non-zero code sticks.
   atomicCAS(args.dev_err_buff, 0, partial_err);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >

void fill_one_to_many_baseline_hash_table_on_device	(	int32_t *	buff,
		const T *	composite_key_dict,
		const int64_t	hash_entry_count,
		const KEY_HANDLER *	key_handler,
		const size_t	num_elems,
		const bool	for_window_framing
	)

Definition at line 260 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

                                                                                    {
   // Builds the one-to-many baseline table in `buff` in several device passes. Each
   // cuda_kernel_launch_wrapper call waits for the stream, so the passes run strictly
   // one after another.
   // Layout used here: the first hash_entry_count int32 slots of buff are the position
   // section, the next hash_entry_count slots are the count section.
   auto pos_buff = buff;
   auto count_buff = buff + hash_entry_count;
   auto qe_cuda_stream = getQueryEngineCudaStream();
   // Pass 0: zero the count section before the counting kernel runs.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
   // Pass 1: counting kernel writes into count_buff.
   cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                              count_buff,
                              composite_key_dict,
                              hash_entry_count,
                              key_handler,
                              num_elems);
 
   // Pass 2: set_valid_pos_flag receives both sections; it must run before the scan
   // below rewrites count_buff.
   cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
 
   // Pass 3: in-place inclusive prefix sum over the count section.
   auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
   thrust::inclusive_scan(
       count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
   // Pass 4: set_valid_pos receives the position section and the scanned counts.
   cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
   // Pass 5: zero the count section again before the row-id fill kernel runs.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
 
   // Pass 6: row-id fill kernel gets the whole buffer plus the key dictionary.
   cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                              buff,
                              composite_key_dict,
                              hash_entry_count,
                              key_handler,
                              num_elems,
                              for_window_framing);
 }

Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32	(	int32_t *	buff,
		const int32_t *	composite_key_dict,
		const int64_t	hash_entry_count,
		const size_t	key_component_count,
		const GenericKeyHandler *	key_handler,
		const int64_t	num_elems,
		const bool	for_window_framing
	)

Definition at line 450 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

                                    {
   // Delegates to the shared one-to-many baseline builder with 32-bit keys
   // (T = int32_t; KEY_HANDLER is deduced from key_handler).
   // NOTE: the key_component_count parameter of this entry point is not forwarded;
   // it is unused in this body.
   fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                           composite_key_dict,
                                                           hash_entry_count,
                                                           key_handler,
                                                           num_elems,
                                                           for_window_framing);
 }

Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64	(	int32_t *	buff,
		const int64_t *	composite_key_dict,
		const int64_t	hash_entry_count,
		const GenericKeyHandler *	key_handler,
		const int64_t	num_elems,
		const bool	for_window_framing
	)

Definition at line 466 of file HashJoinRuntimeGpu.cu.

                                    {
   // Delegates to the shared one-to-many baseline builder with 64-bit keys
   // (T = int64_t; KEY_HANDLER is deduced from key_handler). All arguments are
   // forwarded unchanged.
   fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                           composite_key_dict,
                                                           hash_entry_count,
                                                           key_handler,
                                                           num_elems,
                                                           for_window_framing);
 }

void fill_one_to_many_hash_table_on_device ( OneToManyPerfectJoinHashTableFillFuncArgs const args )

Definition at line 175 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing, OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

                                                           {
   // Builds the one-to-many perfect hash table by handing two launch callbacks to the
   // shared _impl driver, which decides when each one runs.
   auto buff = args.buff;
   auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
   // Counting callback: launches count_matches on the count section, which starts
   // hash_entry_count slots into buff. args is captured by reference, so the lambda
   // must not outlive this call (it does not: _impl invokes it synchronously).
   auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
     cuda_kernel_launch_wrapper(
         SUFFIX(count_matches), count_buff, args.join_column, args.type_info);
   };
   // Fill callback: launches fill_row_ids over the whole buffer.
   auto fill_row_ids_func = [buff, hash_entry_count, &args] {
     cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
                                buff,
                                hash_entry_count,
                                args.join_column,
                                args.type_info,
                                args.for_window_framing);
   };
   fill_one_to_many_hash_table_on_device_impl(buff,
                                              hash_entry_count,
                                              args.join_column,
                                              args.type_info,
                                              count_matches_func,
                                              fill_row_ids_func);
 }

Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( OneToManyPerfectJoinHashTableFillFuncArgs const args )

Definition at line 199 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, OneToManyPerfectJoinHashTableFillFuncArgs::bucket_normalization, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

                                                           {
   // Bucketized variant of the one-to-many perfect hash table build. The entry count
   // comes from getNormalizedHashEntryCount(), and both kernels additionally receive
   // args.bucket_normalization.
   auto hash_entry_count = args.hash_entry_info.getNormalizedHashEntryCount();
   auto const buff = args.buff;
   // Counting callback: launches count_matches_bucketized on the count section, which
   // starts hash_entry_count slots into buff. args is captured by reference and is
   // only used while this function is still on the stack.
   auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
     cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                                count_buff,
                                args.join_column,
                                args.type_info,
                                args.bucket_normalization);
   };
   // Fill callback: launches fill_row_ids_bucketized over the whole buffer.
   auto fill_row_ids_func = [buff, hash_entry_count, &args] {
     cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                                buff,
                                hash_entry_count,
                                args.join_column,
                                args.type_info,
                                args.bucket_normalization);
   };
   fill_one_to_many_hash_table_on_device_impl(buff,
                                              hash_entry_count,
                                              args.join_column,
                                              args.type_info,
                                              count_matches_func,
                                              fill_row_ids_func);
 }

Here is the call graph for this function:

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >

void fill_one_to_many_hash_table_on_device_impl	(	int32_t *	buff,
		const int64_t	hash_entry_count,
		const JoinColumn &	join_column,
		const JoinColumnTypeInfo &	type_info,
		COUNT_MATCHES_FUNCTOR	count_matches_func,
		FILL_ROW_IDS_FUNCTOR	fill_row_ids_func
	)

Definition at line 148 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

                                                                                         {
   // Shared driver for the one-to-many perfect hash table build. The caller supplies
   // the two kernel launches as callbacks; this function sequences them around the
   // flag/scan/position passes. join_column and type_info are accepted but not read
   // here - the callbacks carry their own copies.
   // Layout used here: the first hash_entry_count int32 slots of buff are the position
   // section, the next hash_entry_count slots are the count section.
   int32_t* pos_buff = buff;
   int32_t* count_buff = buff + hash_entry_count;
   auto qe_cuda_stream = getQueryEngineCudaStream();
   // Zero the count section before the counting callback runs.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
   count_matches_func();
 
   // set_valid_pos_flag receives both sections; it must run before the scan below
   // rewrites count_buff.
   cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
 
   // In-place inclusive prefix sum over the count section.
   auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
   thrust::inclusive_scan(
       count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
 
   // set_valid_pos receives the position section and the scanned counts.
   cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
   // Zero the count section again before the row-id fill callback runs.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
   fill_row_ids_func();
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device_sharded	(	OneToManyPerfectJoinHashTableFillFuncArgs const	args,
		ShardInfo const	shard_info
	)

Definition at line 226 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, inclusive_scan(), OneToManyPerfectJoinHashTableFillFuncArgs::join_column, set_valid_pos(), set_valid_pos_flag(), SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

                                 {
   // Sharded variant of the one-to-many fill sequence: zero counts, count
   // matches, flag non-empty slots, prefix-sum the counts, derive positions,
   // zero counts again, then fill row ids.
   auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
   // args.buff is treated as two consecutive int32_t regions: positions first,
   // counts starting hash_entry_count elements later.
   int32_t* pos_buff = args.buff;
   int32_t* count_buff = args.buff + hash_entry_count;
   auto qe_cuda_stream = getQueryEngineCudaStream();
   // Zero the count region and wait for completion before launching the count kernel.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
   // Presumably accumulates per-slot match counts into count_buff for this
   // shard configuration -- kernel body is not visible here, TODO confirm.
   cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
                              count_buff,
                              args.join_column,
                              args.type_info,
                              shard_info);
 
   // Writes VALID_POS_FLAG into pos_buff[i] for every i with a non-zero count.
   cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
 
   // In-place inclusive prefix sum over the count region.
   // NOTE(review): thrust is called without an explicit stream/policy -- confirm
   // the preceding launch has completed before the scan reads count_buff.
   auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
   thrust::inclusive_scan(
       count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
   // For flagged slots, stores the scanned value of the previous slot (0 for
   // slot 0) into pos_buff.
   cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
   // Reset the count region to zero again before the fill pass.
   checkCudaErrors(
       cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
   checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
   // Presumably writes row ids into the buffer using the positions computed
   // above -- kernel body is not visible here, TODO confirm.
   cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                              args.buff,
                              hash_entry_count,
                              args.join_column,
                              args.type_info,
                              shard_info);
 }

Here is the call graph for this function:

CUstream getQueryEngineCudaStream ( )

Definition at line 3 of file QueryEngine.cpp.

                                     {  // NOTE: CUstream is cudaStream_t
   // Returns the CUDA stream reported by the QueryEngine object that
   // QueryEngine::getInstance() hands back.
   return QueryEngine::getInstance()->getCudaStream();
 }

void init_baseline_hash_join_buff_on_device_32	(	int8_t *	hash_join_buff,
		const int64_t	entry_count,
		const size_t	key_component_count,
		const bool	with_val_slot,
		const int32_t	invalid_slot_val
	)

Definition at line 313 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

                                                                                {
   // Launches the int32_t instantiation of init_baseline_hash_join_buff_wrapper
   // through cuda_kernel_launch_wrapper, forwarding every argument unchanged.
   cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                              hash_join_buff,
                              entry_count,
                              key_component_count,
                              with_val_slot,
                              invalid_slot_val);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64	(	int8_t *	hash_join_buff,
		const int64_t	entry_count,
		const size_t	key_component_count,
		const bool	with_val_slot,
		const int32_t	invalid_slot_val
	)

Definition at line 326 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

                                                                                {
   // Launches the int64_t instantiation of init_baseline_hash_join_buff_wrapper
   // through cuda_kernel_launch_wrapper, forwarding every argument unchanged.
   cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                              hash_join_buff,
                              entry_count,
                              key_component_count,
                              with_val_slot,
                              invalid_slot_val);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

__global__ void init_baseline_hash_join_buff_wrapper	(	int8_t *	hash_join_buff,
		const int64_t	entry_count,
		const size_t	key_component_count,
		const bool	with_val_slot,
		const int32_t	invalid_slot_val
	)

Definition at line 299 of file HashJoinRuntimeGpu.cu.

References init_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

                                                                                      {
   // Kernel entry point: forwards its arguments to
   // SUFFIX(init_baseline_hash_join_buff)<T> and appends two trailing -1 values.
   // NOTE(review): the meaning of the two -1 arguments is not visible in this
   // chunk -- confirm against the callee's declaration.
   SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                           entry_count,
                                           key_component_count,
                                           with_val_slot,
                                           invalid_slot_val,
                                           -1,
                                           -1);
 }

Here is the call graph for this function:

void init_hash_join_buff_on_device	(	int32_t *	buff,
		const int64_t	hash_entry_count,
		const int32_t	invalid_slot_val
	)

Definition at line 114 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

                                                                    {
   // Launches init_hash_join_buff_wrapper through cuda_kernel_launch_wrapper,
   // forwarding buff, hash_entry_count and invalid_slot_val unchanged.
   cuda_kernel_launch_wrapper(
       init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

__global__ void init_hash_join_buff_wrapper	(	int32_t *	buff,
		const int64_t	hash_entry_count,
		const int32_t	invalid_slot_val
	)

Definition at line 108 of file HashJoinRuntimeGpu.cu.

References init_hash_join_buff(), and SUFFIX.

Referenced by init_hash_join_buff_on_device().

                                                                             {
   // Kernel entry point: forwards to SUFFIX(init_hash_join_buff) and appends
   // two trailing -1 values.
   // NOTE(review): the meaning of the two -1 arguments is not visible in this
   // chunk -- confirm against the callee's declaration.
   SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void range_fill_baseline_hash_join_buff_on_device_64	(	int8_t *	hash_buff,
		const int64_t	entry_count,
		const int32_t	invalid_slot_val,
		const size_t	key_component_count,
		const bool	with_val_slot,
		int *	dev_err_buff,
		const RangeKeyHandler *	key_handler,
		const size_t	num_elems
	)

Definition at line 429 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

                                                                              {
   // Launches fill_baseline_hash_join_buff_wrapper instantiated with
   // <unsigned long long, RangeKeyHandler> through cuda_kernel_launch_wrapper.
   // All function parameters are forwarded; the literal `false` is inserted as
   // the wrapper's fourth argument.
   // NOTE(review): what that flag controls is not visible in this chunk --
   // confirm against fill_baseline_hash_join_buff_wrapper's parameter list.
   cuda_kernel_launch_wrapper(
       fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
       hash_buff,
       entry_count,
       invalid_slot_val,
       false,
       key_component_count,
       with_val_slot,
       dev_err_buff,
       key_handler,
       num_elems);
 }

Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64	(	int32_t *	buff,
		const int64_t *	composite_key_dict,
		const size_t	hash_entry_count,
		const RangeKeyHandler *	key_handler,
		const size_t	num_elems
	)

Definition at line 491 of file HashJoinRuntimeGpu.cu.

                             {
   // Delegates to the int64_t instantiation of
   // fill_one_to_many_baseline_hash_table_on_device, forwarding all parameters
   // and passing `false` as the final argument.
   // NOTE(review): the meaning of the trailing flag is not visible in this
   // chunk -- confirm against the callee's declaration.
   fill_one_to_many_baseline_hash_table_on_device<int64_t>(
       buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
 }

__global__ void set_valid_pos	(	int32_t *	pos_buff,
		int32_t *	count_buff,
		const int64_t	entry_count
	)

Definition at line 135 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

                                                          {
   const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
   const int32_t step = blockDim.x * gridDim.x;
   for (int64_t i = start; i < entry_count; i += step) {
     if (VALID_POS_FLAG == pos_buff[i]) {
       pos_buff[i] = !i ? 0 : count_buff[i - 1];
     }
   }
 }

Here is the caller graph for this function:

__global__ void set_valid_pos_flag	(	int32_t *	pos_buff,
		const int32_t *	count_buff,
		const int64_t	entry_count
	)

Definition at line 123 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

                                                               {
   const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
   const int32_t step = blockDim.x * gridDim.x;
   for (int64_t i = start; i < entry_count; i += step) {
     if (count_buff[i]) {
       pos_buff[i] = VALID_POS_FLAG;
     }
   }
 }

Here is the caller graph for this function:

Macros

Functions

Macro Definition Documentation

Function Documentation