OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.h File Reference
#include <cstddef>
#include <cstdint>
#include <vector>
#include "../../../Shared/SqlTypesLayout.h"
#include "../../../Shared/sqltypes.h"
#include "../../RuntimeFunctions.h"
#include "../../../Shared/funcannotations.h"
+ Include dependency graph for HashJoinRuntime.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  BucketizedHashEntryInfo
 
struct  JoinChunk
 
struct  JoinColumn
 
struct  JoinColumnTypeInfo
 
struct  JoinBucketInfo
 
struct  ShardInfo
 
struct  OneToOnePerfectJoinHashTableFillFuncArgs
 
struct  OneToManyPerfectJoinHashTableFillFuncArgs
 

Enumerations

enum  ColumnType { SmallDate = 0, Signed = 1, Unsigned = 2, Double = 3 }
 

Functions

void init_hash_join_buff (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
ColumnType get_join_column_type_kind (const SQLTypeInfo &ti)
 
int fill_hash_join_buff_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
 
int fill_hash_join_buff (OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
 
int fill_hash_join_buff_bitwise_eq (OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
 
void fill_hash_join_buff_on_device (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
void fill_hash_join_buff_on_device_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
void fill_hash_join_buff_on_device_sharded (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
void fill_hash_join_buff_on_device_sharded_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
void fill_one_to_many_hash_table (OneToManyPerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (OneToManyPerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_count)
 
void fill_one_to_many_hash_table_on_device (OneToManyPerfectJoinHashTableFillFuncArgs const args)
 
void fill_one_to_many_hash_table_on_device_bucketized (OneToManyPerfectJoinHashTableFillFuncArgs const args)
 
void fill_one_to_many_hash_table_on_device_sharded (OneToManyPerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int bbox_intersect_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int bbox_intersect_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void bbox_intersect_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false, const bool for_window_framing=false)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false, const bool for_window_framing=false)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_bbox_intersect (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_bbox_intersect (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
 

Variables

const size_t g_maximum_conditions_to_coalesce {8}
 

Enumeration Type Documentation

enum ColumnType
Enumerator
SmallDate 
Signed 
Unsigned 
Double 

Definition at line 120 of file HashJoinRuntime.h.

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2247 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2252  {
2253  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2254  CHECK(!join_column_per_key.empty());
2255 
2256  std::vector<std::future<void>> approx_distinct_threads;
2257  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2258  approx_distinct_threads.push_back(std::async(
2260  [&join_column_per_key,
2261  &type_info_per_key,
2262  b,
2263  hll_buffer_all_cpus,
2264  padded_size_bytes,
2265  thread_idx,
2266  thread_count] {
2267  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2268 
2269  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2270  false,
2271  &join_column_per_key[0],
2272  &type_info_per_key[0],
2273  nullptr,
2274  nullptr);
2276  nullptr,
2277  b,
2278  join_column_per_key[0].num_elems,
2279  &key_handler,
2280  thread_idx,
2281  thread_count);
2282  }));
2283  }
2284  for (auto& child : approx_distinct_threads) {
2285  child.get();
2286  }
2287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_bbox_intersect ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2289 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by BoundingBoxIntersectJoinHashTable::approximateTupleCount().

2297  {
2298  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2299  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2300  CHECK(!join_column_per_key.empty());
2301 
2302  std::vector<std::future<void>> approx_distinct_threads;
2303  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2304  approx_distinct_threads.push_back(std::async(
2306  [&join_column_per_key,
2307  &join_buckets_per_key,
2308  &row_counts,
2309  b,
2310  hll_buffer_all_cpus,
2311  padded_size_bytes,
2312  thread_idx,
2313  thread_count] {
2314  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2315 
2316  const auto key_handler = BoundingBoxIntersectKeyHandler(
2317  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2318  &join_column_per_key[0],
2319  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2321  row_counts.data(),
2322  b,
2323  join_column_per_key[0].num_elems,
2324  &key_handler,
2325  thread_idx,
2326  thread_count);
2327  }));
2328  }
2329  for (auto& child : approx_distinct_threads) {
2330  child.get();
2331  }
2332 
2334  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2335 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 537 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

540  {
541  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
542  hll_buffer,
543  nullptr,
544  b,
545  num_elems,
546  key_handler);
547 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_bbox_intersect ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 501 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by BoundingBoxIntersectJoinHashTable::approximateTupleCount().

506  {
508  approximate_distinct_tuples_impl_gpu<BoundingBoxIntersectKeyHandler>,
509  hll_buffer,
510  row_counts_buffer,
511  b,
512  num_elems,
513  key_handler);
514 
515  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
517  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
518 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 520 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

526  {
527  auto qe_cuda_stream = getQueryEngineCudaStream();
528  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
529  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
530  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
531 
532  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
534  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
535 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2337 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2346  {
2347  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2348  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2349  CHECK(!join_column_per_key.empty());
2350 
2351  std::vector<std::future<void>> approx_distinct_threads;
2352  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2353  approx_distinct_threads.push_back(std::async(
2355  [&join_column_per_key,
2356  &join_buckets_per_key,
2357  &row_counts,
2358  b,
2359  hll_buffer_all_cpus,
2360  padded_size_bytes,
2361  thread_idx,
2362  is_compressed,
2363  thread_count] {
2364  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2365 
2366  const auto key_handler = RangeKeyHandler(
2367  is_compressed,
2368  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2369  &join_column_per_key[0],
2370  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2372  row_counts.data(),
2373  b,
2374  join_column_per_key[0].num_elems,
2375  &key_handler,
2376  thread_idx,
2377  thread_count);
2378  }));
2379  }
2380  for (auto& child : approx_distinct_threads) {
2381  child.get();
2382  }
2383 
2385  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2386 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int bbox_intersect_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1866 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1875  {
1876  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1877  entry_count,
1878  invalid_slot_val,
1879  false,
1880  key_component_count,
1881  with_val_slot,
1882  key_handler,
1883  num_elems,
1884  cpu_thread_idx,
1885  cpu_thread_count);
1886 }

+ Here is the caller graph for this function:

int bbox_intersect_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1931 of file HashJoinRuntime.cpp.

1940  {
1941  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1942  entry_count,
1943  invalid_slot_val,
1944  false,
1945  key_component_count,
1946  with_val_slot,
1947  key_handler,
1948  num_elems,
1949  cpu_thread_idx,
1950  cpu_thread_count);
1951 }
void bbox_intersect_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 406 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_baseline_hash_join_buff_wrapper().

414  {
416  fill_baseline_hash_join_buff_wrapper<unsigned long long,
418  hash_buff,
419  entry_count,
420  invalid_slot_val,
421  false,
422  key_component_count,
423  with_val_slot,
424  dev_err_buff,
425  key_handler,
426  num_elems);
427 }
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 481 of file HashJoinRuntimeGpu.cu.

486  {
487  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
488  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
489 }
void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2388 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes().

2392  {
2393  std::vector<std::vector<double>> bucket_sizes_for_threads;
2394  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2395  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2396  }
2397  std::vector<std::future<void>> threads;
2398  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2399  threads.push_back(std::async(std::launch::async,
2400  compute_bucket_sizes_impl<2>,
2401  bucket_sizes_for_threads[thread_idx].data(),
2402  &join_column,
2403  &type_info,
2404  bucket_size_thresholds.data(),
2405  thread_idx,
2406  thread_count));
2407  }
2408  for (auto& child : threads) {
2409  child.get();
2410  }
2411 
2412  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2413  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2414  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2415  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2416  }
2417  }
2418  }
2419 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds 
)

Definition at line 549 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes().

552  {
553  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
554  bucket_sizes_buffer,
555  join_column,
556  type_info,
557  bucket_sz_threshold);
558 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1844 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1853  {
1854  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1855  entry_count,
1856  invalid_slot_val,
1857  for_semi_join,
1858  key_component_count,
1859  with_val_slot,
1860  key_handler,
1861  num_elems,
1862  cpu_thread_idx,
1863  cpu_thread_count);
1864 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1909 of file HashJoinRuntime.cpp.

1918  {
1919  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1920  entry_count,
1921  invalid_slot_val,
1922  for_semi_join,
1923  key_component_count,
1924  with_val_slot,
1925  key_handler,
1926  num_elems,
1927  cpu_thread_idx,
1928  cpu_thread_count);
1929 }
void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 362 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

370  {
372  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
373  hash_buff,
374  entry_count,
375  invalid_slot_val,
376  for_semi_join,
377  key_component_count,
378  with_val_slot,
379  dev_err_buff,
380  key_handler,
381  num_elems);
382 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 384 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

392  {
394  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
395  hash_buff,
396  entry_count,
397  invalid_slot_val,
398  for_semi_join,
399  key_component_count,
400  with_val_slot,
401  dev_err_buff,
402  key_handler,
403  num_elems);
404 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

int fill_hash_join_buff ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_idx,
int32_t const  cpu_thread_count 
)

Definition at line 203 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

206  {
207  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
209  auto hashtable_filling_func = [&](auto elem, size_t index) {
210  auto entry_ptr = SUFFIX(get_hash_slot)(args.buff, elem, args.type_info.min_val);
211  return filling_func(index, entry_ptr, args.invalid_slot_val);
212  };
213 
215  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
216 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int fill_hash_join_buff_bitwise_eq ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_idx,
int32_t const  cpu_thread_count 
)

Definition at line 187 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_bitwise_eq(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

190  {
191  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
193  auto hashtable_filling_func = [&](auto elem, size_t index) {
194  auto entry_ptr = SUFFIX(get_hash_slot_bitwise_eq)(
195  args.buff, elem, args.type_info.min_val, args.type_info.translated_null_val);
196  return filling_func(index, entry_ptr, args.invalid_slot_val);
197  };
198 
200  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
201 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_bitwise_eq(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val)
Definition: JoinHashImpl.h:82
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int fill_hash_join_buff_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_idx,
int32_t const  cpu_thread_count 
)

Definition at line 167 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

170  {
171  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
173  auto hashtable_filling_func = [&](auto elem, size_t index) {
174  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
175  args.buff,
176  elem,
177  args.type_info.min_val / args.bucket_normalization,
178  args.type_info.translated_null_val,
179  args.bucket_normalization);
180  return filling_func(index, entry_ptr, args.invalid_slot_val);
181  };
182 
184  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
185 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 57 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

57  {
59 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void fill_hash_join_buff_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 52 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

53  {
55 }
__global__ void fill_hash_join_buff_bucketized_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 102 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

104  {
106 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void fill_hash_join_buff_wrapper_sharded(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 95 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

97  {
100 }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false,
const bool  for_window_framing = false 
)

Definition at line 2189 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2202  {
2203  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2204  composite_key_dict,
2205  hash_entry_count,
2206  key_component_count,
2207  join_column_per_key,
2208  type_info_per_key,
2209  join_bucket_info,
2210  sd_inner_to_outer_translation_maps,
2211  sd_min_inner_elems,
2212  cpu_thread_count,
2213  is_range_join,
2214  is_geo_compressed,
2215  for_window_framing);
2216 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false,
const bool  for_window_framing = false 
)

Definition at line 2218 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2231  {
2232  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2233  composite_key_dict,
2234  hash_entry_count,
2235  key_component_count,
2236  join_column_per_key,
2237  type_info_per_key,
2238  join_bucket_info,
2239  sd_inner_to_outer_translation_maps,
2240  sd_min_inner_elems,
2241  cpu_thread_count,
2242  is_range_join,
2243  is_geo_compressed,
2244  for_window_framing);
2245 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 450 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

457  {
458  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
459  composite_key_dict,
460  hash_entry_count,
461  key_handler,
462  num_elems,
463  for_window_framing);
464 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 466 of file HashJoinRuntimeGpu.cu.

472  {
473  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
474  composite_key_dict,
475  hash_entry_count,
476  key_handler,
477  num_elems,
478  for_window_framing);
479 }
void fill_one_to_many_hash_table ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_count 
)

Definition at line 1564 of file HashJoinRuntime.cpp.

References run_benchmark_import::args, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing, OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, OneToManyPerfectJoinHashTableFillFuncArgs::min_inner_elem, OneToManyPerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1565  {
1566  auto timer = DEBUG_TIMER(__func__);
1567  auto const buff = args.buff;
1568  auto const hash_entry_info = args.hash_entry_info;
1569  auto launch_count_matches = [count_buff =
1570  buff + hash_entry_info.bucketized_hash_entry_count,
1571  &args](auto cpu_thread_idx, auto cpu_thread_count) {
1573  (count_buff,
1574  args.join_column,
1575  args.type_info,
1576  args.sd_inner_to_outer_translation_map,
1577  args.min_inner_elem,
1578  cpu_thread_idx,
1579  cpu_thread_count);
1580  };
1581  auto launch_fill_row_ids =
1582  [hash_entry_count = hash_entry_info.bucketized_hash_entry_count, buff, args](
1583  auto cpu_thread_idx, auto cpu_thread_count) {
1585  (buff,
1586  hash_entry_count,
1587  args.join_column,
1588  args.type_info,
1589  args.for_window_framing,
1590  args.sd_inner_to_outer_translation_map,
1591  args.min_inner_elem,
1592  cpu_thread_idx,
1593  cpu_thread_count);
1594  };
1595 
1597  hash_entry_info.bucketized_hash_entry_count,
1598  args.join_column,
1599  args.type_info,
1600  args.sd_inner_to_outer_translation_map,
1601  args.min_inner_elem,
1602  cpu_thread_count,
1603  args.for_window_framing,
1604  launch_count_matches,
1605  launch_fill_row_ids);
1606 }
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:412
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_count 
)

Definition at line 1608 of file HashJoinRuntime.cpp.

References run_benchmark_import::args, BucketizedHashEntryInfo::bucket_normalization, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, OneToManyPerfectJoinHashTableFillFuncArgs::min_inner_elem, OneToManyPerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1610  {
1611  auto timer = DEBUG_TIMER(__func__);
1612  auto const buff = args.buff;
1613  auto const hash_entry_info = args.hash_entry_info;
1614  auto bucket_normalization = hash_entry_info.bucket_normalization;
1615  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1616  auto launch_count_matches = [bucket_normalization,
1617  count_buff = buff + hash_entry_count,
1618  &args](auto cpu_thread_idx, auto cpu_thread_count) {
1620  (count_buff,
1621  args.join_column,
1622  args.type_info,
1623  args.sd_inner_to_outer_translation_map,
1624  args.min_inner_elem,
1625  cpu_thread_idx,
1626  cpu_thread_count,
1627  bucket_normalization);
1628  };
1629  auto launch_fill_row_ids = [bucket_normalization, hash_entry_count, buff, args](
1630  auto cpu_thread_idx, auto cpu_thread_count) {
1632  (buff,
1633  hash_entry_count,
1634  args.join_column,
1635  args.type_info,
1636  args.sd_inner_to_outer_translation_map,
1637  args.min_inner_elem,
1638  cpu_thread_idx,
1639  cpu_thread_count,
1640  bucket_normalization);
1641  };
1642 
1644  hash_entry_count,
1645  args.join_column,
1646  args.type_info,
1647  args.sd_inner_to_outer_translation_map,
1648  args.min_inner_elem,
1649  cpu_thread_count,
1650  false,
1651  launch_count_matches,
1652  launch_fill_row_ids);
1653 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define DEBUG_TIMER(name)
Definition: Logger.h:412
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device ( OneToManyPerfectJoinHashTableFillFuncArgs const  args)

Definition at line 175 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing, OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

176  {
177  auto buff = args.buff;
178  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
179  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
181  SUFFIX(count_matches), count_buff, args.join_column, args.type_info);
182  };
183  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
185  buff,
186  hash_entry_count,
187  args.join_column,
188  args.type_info,
189  args.for_window_framing);
190  };
192  hash_entry_count,
193  args.join_column,
194  args.type_info,
195  count_matches_func,
196  fill_row_ids_func);
197 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( OneToManyPerfectJoinHashTableFillFuncArgs const  args)

Definition at line 199 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, OneToManyPerfectJoinHashTableFillFuncArgs::bucket_normalization, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

200  {
201  auto hash_entry_count = args.hash_entry_info.getNormalizedHashEntryCount();
202  auto const buff = args.buff;
203  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
205  count_buff,
206  args.join_column,
207  args.type_info,
208  args.bucket_normalization);
209  };
210  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
212  buff,
213  hash_entry_count,
214  args.join_column,
215  args.type_info,
216  args.bucket_normalization);
217  };
219  hash_entry_count,
220  args.join_column,
221  args.type_info,
222  count_matches_func,
223  fill_row_ids_func);
224 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 226 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, inclusive_scan(), OneToManyPerfectJoinHashTableFillFuncArgs::join_column, set_valid_pos(), set_valid_pos_flag(), SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

228  {
229  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
230  int32_t* pos_buff = args.buff;
231  int32_t* count_buff = args.buff + hash_entry_count;
232  auto qe_cuda_stream = getQueryEngineCudaStream();
234  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
235  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
237  count_buff,
238  args.join_column,
239  args.type_info,
240  shard_info);
241 
242  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
243 
244  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
246  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
247  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
249  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
250  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
252  args.buff,
253  hash_entry_count,
254  args.join_column,
255  args.type_info,
256  shard_info);
257 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

ColumnType get_join_column_type_kind ( const SQLTypeInfo ti)
inline

Definition at line 147 of file HashJoinRuntime.h.

References SQLTypeInfo::is_date_in_days(), is_unsigned_type(), Signed, SmallDate, and Unsigned.

Referenced by BoundingBoxIntersectJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTable::fetchColumnsForDevice(), BaselineJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

147  {
148  if (ti.is_date_in_days()) {
149  return SmallDate;
150  } else {
151  return is_unsigned_type(ti) ? Unsigned : Signed;
152  }
153 }
bool is_date_in_days() const
Definition: sqltypes.h:1018
bool is_unsigned_type(const SQLTypeInfo &ti)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1788 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1794  {
1795  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1796  entry_count,
1797  key_component_count,
1798  with_val_slot,
1799  invalid_slot_val,
1800  cpu_thread_idx,
1801  cpu_thread_count);
1802 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1804 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1810  {
1811  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1812  entry_count,
1813  key_component_count,
1814  with_val_slot,
1815  invalid_slot_val,
1816  cpu_thread_idx,
1817  cpu_thread_count);
1818 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 313 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

317  {
318  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
319  hash_join_buff,
320  entry_count,
321  key_component_count,
322  with_val_slot,
323  invalid_slot_val);
324 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 326 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

330  {
331  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
332  hash_join_buff,
333  entry_count,
334  key_component_count,
335  with_val_slot,
336  invalid_slot_val);
337 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_hash_join_buff ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 71 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

75  {
76 #ifdef __CUDACC__
77  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
78  int32_t step = blockDim.x * gridDim.x;
79 #else
80  int32_t start = cpu_thread_idx;
81  int32_t step = cpu_thread_count;
82 #endif
83  for (int64_t i = start; i < hash_entry_count; i += step) {
84  groups_buffer[i] = invalid_slot_val;
85  }
86 }

+ Here is the caller graph for this function:

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 114 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

116  {
118  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
119 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1888 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1896  {
1897  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1898  entry_count,
1899  invalid_slot_val,
1900  false,
1901  key_component_count,
1902  with_val_slot,
1903  key_handler,
1904  num_elems,
1905  cpu_thread_idx,
1906  cpu_thread_count);
1907 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1953 of file HashJoinRuntime.cpp.

1961  {
1962  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1963  entry_count,
1964  invalid_slot_val,
1965  false,
1966  key_component_count,
1967  with_val_slot,
1968  key_handler,
1969  num_elems,
1970  cpu_thread_idx,
1971  cpu_thread_count);
1972 }
void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 429 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

436  {
438  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
439  hash_buff,
440  entry_count,
441  invalid_slot_val,
442  false,
443  key_component_count,
444  with_val_slot,
445  dev_err_buff,
446  key_handler,
447  num_elems);
448 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 491 of file HashJoinRuntimeGpu.cu.

496  {
497  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
498  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
499 }

Variable Documentation