// QueryExecutionContext constructor: stores the query memory descriptor,
// device type, dispatch mode and row-set memory owner, then builds the
// QueryMemoryInitializer that owns the output buffers.
    const Executor* executor,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    const bool output_columnar,
    const size_t thread_idx,
    : query_mem_desc_(query_mem_desc)
    , device_type_(device_type)
    , dispatch_mode_(dispatch_mode)
    , row_set_mem_owner_(row_set_mem_owner)
    , output_columnar_(output_columnar) {
  auto data_mgr = executor->getDataMgr();
  auto render_allocator_map = render_info && render_info->isInSitu()
  query_buffers_ = std::make_unique<QueryMemoryInitializer>(ra_exe_unit,
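// QueryExecutionContext::groupBufferToDeinterleavedResults: rewrites group-by
// buffer i, whose bins are interleaved across GPU threads, into a fresh
// ResultSet with a consistent 8-byte slot layout. Each bin appears to be
// collapsed via reduceSingleRow inside the do_work lambda below.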
    const size_t i) const {
  auto deinterleaved_query_mem_desc =
  deinterleaved_query_mem_desc.setHasInterleavedBinsOnGpu(false);
  deinterleaved_query_mem_desc.useConsistentSlotWidthSize(8);
  auto deinterleaved_result_set =
      std::make_shared<ResultSet>(result_set->getTargetInfos(),
                                  std::vector<ColumnLazyFetchInfo>{},
                                  std::vector<std::vector<const int8_t*>>{},
                                  std::vector<std::vector<int64_t>>{},
                                  std::vector<int64_t>{},
                                  deinterleaved_query_mem_desc,
  auto deinterleaved_storage =
      deinterleaved_result_set->allocateStorage(executor_->plan_state_->init_agg_vals_);
  auto deinterleaved_buffer =
      reinterpret_cast<int64_t*>(deinterleaved_storage->getUnderlyingBuffer());
  const auto rows_ptr = result_set->getStorage()->getUnderlyingBuffer();
  size_t deinterleaved_buffer_idx = 0;
  auto do_work = [&](const size_t bin_base_off) {
    std::vector<int64_t> agg_vals(agg_col_count, 0);
           &executor_->plan_state_->init_agg_vals_[0],
           agg_col_count * sizeof(agg_vals[0]));
                                result_set->getTargetInfos(),
    for (size_t agg_idx = 0; agg_idx < agg_col_count;
         ++agg_idx, ++deinterleaved_buffer_idx) {
      deinterleaved_buffer[deinterleaved_buffer_idx] = agg_vals[agg_idx];
       bin_idx < result_set->entryCount();
    if (UNLIKELY((bin_idx & 0xFFFF) == 0 &&
                 executor_->checkNonKernelTimeInterrupted())) {
      throw std::runtime_error(
          "Query execution has been interrupted during result set reduction");
    do_work(bin_base_off);
       bin_idx < result_set->entryCount();
    do_work(bin_base_off);
  return deinterleaved_result_set;
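// QueryExecutionContext::getRowSet: gathers one ResultSet per output buffer,
// checks that the buffer count matches the descriptor (2 with varlen output,
// 1 otherwise), and defers to Executor::reduceMultiDeviceResults. The two
// loops over error_codes below appear to belong to the error-code aggregation
// helper that follows getRowSet.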
  std::vector<std::pair<ResultSetPtr, std::vector<size_t>>> results_per_sm;
  const auto group_by_buffers_size = query_buffers_->getNumBuffers();
  const size_t expected_num_buffers = query_mem_desc.hasVarlenOutput() ? 2 : 1;
  CHECK_EQ(expected_num_buffers, group_by_buffers_size);
  const size_t group_by_output_buffers_size =
  for (size_t i = 0; i < group_by_output_buffers_size; i += step) {
  return executor_->reduceMultiDeviceResults(

  for (const auto err : error_codes) {
  for (const auto err : error_codes) {
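// QueryExecutionContext::launchGpuCode: prepares the kernel parameter buffers,
// optionally arms the dynamic watchdog and runtime interrupter, launches the
// generated device kernel (with or without hoisted literals), then copies the
// per-thread error codes and the output buffers back from the GPU.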
    const bool hoist_literals,
    const std::vector<int8_t>& literal_buff,
    std::vector<std::vector<const int8_t*>> col_buffers,
    const std::vector<std::vector<int64_t>>& num_rows,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    const int32_t scan_limit,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const size_t shared_memory_size,
    const uint32_t num_tables,
    const bool allow_runtime_interrupt,
    const std::vector<int8_t*>& join_hash_tables,
    bool optimize_cuda_block_and_grid_sizes) {
  CHECK(compilation_context);
  if (render_allocator_map) {
  std::vector<int64_t*> out_vec;
  uint32_t num_fragments = col_buffers.size();
  std::vector<int32_t> error_codes(grid_size_x * block_size_x);
  auto prepareClock = kernel->make_clock();
  auto launchClock = kernel->make_clock();
  auto finishClock = kernel->make_clock();
  prepareClock->start();
  kernel->initializeDynamicWatchdog(
  if (allow_runtime_interrupt && !render_allocator) {
    kernel->initializeRuntimeInterrupter(device_id);
  const unsigned block_size_y = 1;
  const unsigned block_size_z = 1;
  const unsigned grid_size_y = 1;
  const unsigned grid_size_z = 1;
  const auto total_thread_count = block_size_x * grid_size_x;
  const auto err_desc = kernel_params[ERROR_CODE];
  auto gpu_group_by_buffers =
  const auto max_matched = static_cast<int32_t>(gpu_group_by_buffers.entry_count);
      kernel_params[MAX_MATCHED], &max_matched, sizeof(max_matched));
  kernel_params[GROUPBY_BUF] = gpu_group_by_buffers.ptrs;
  std::vector<void*> param_ptrs;
  for (auto& param : kernel_params) {
    param_ptrs.push_back(&param);
  auto prepareTime = prepareClock->stop();
      << ": launchGpuCode: group-by prepare: " << std::to_string(prepareTime)
  launchClock->start();
  if (hoist_literals) {
    VLOG(1) << "Launching(" << kernel->name() << ") on device_id(" << device_id << ')';
    kernel->launch(grid_size_x,
                   optimize_cuda_block_and_grid_sizes);
    param_ptrs.erase(param_ptrs.begin() + LITERALS);
    VLOG(1) << "Launching(" << kernel->name() << ") on device_id(" << device_id << ')';
    kernel->launch(grid_size_x,
                   optimize_cuda_block_and_grid_sizes);
  auto launchTime = launchClock->stop();
      << ": launchGpuCode: group-by cuLaunchKernel: "
  finishClock->start();
  gpu_allocator_->copyFromDevice(reinterpret_cast<int8_t*>(error_codes.data()),
                                 reinterpret_cast<int8_t*>(err_desc),
                                 error_codes.size() * sizeof(error_codes[0]));
  if (*error_code > 0) {
  if (!render_allocator) {
      gpu_group_by_buffers,
      gpu_group_by_buffers,
  } catch (const std::bad_alloc&) {
      gpu_group_by_buffers,
  size_t num_allocated_rows{0};
  if (*error_code < 0) {
      gpu_group_by_buffers,
  if (num_allocated_rows) {
    CHECK(ra_exe_unit.use_bump_allocator);
      gpu_group_by_buffers,
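  // Non-group-by (projection/estimator) path: one device output buffer per
  // aggregate column; when the kernel reduces into shared memory each buffer
  // is a single 8-byte slot seeded with that aggregate's init value.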
  std::vector<int8_t*> out_vec_dev_buffers;
  const size_t agg_col_count{ra_exe_unit.estimator ? size_t(1) : init_agg_vals.size()};
  const auto num_results_per_agg_col =
      shared_memory_size ? 1 : block_size_x * grid_size_x * num_fragments;
  const auto output_buffer_size_per_agg = num_results_per_agg_col * sizeof(int64_t);
  for (size_t i = 0; i < agg_col_count; ++i) {
    int8_t* out_vec_dev_buffer =
        num_fragments ? gpu_allocator_->alloc(output_buffer_size_per_agg) : nullptr;
    out_vec_dev_buffers.push_back(out_vec_dev_buffer);
    if (shared_memory_size) {
      CHECK_EQ(output_buffer_size_per_agg, size_t(8));
      gpu_allocator_->copyToDevice(reinterpret_cast<int8_t*>(out_vec_dev_buffer),
                                   reinterpret_cast<const int8_t*>(&init_agg_vals[i]),
                                   output_buffer_size_per_agg);
  auto out_vec_dev_ptr = gpu_allocator_->alloc(agg_col_count * sizeof(int8_t*));
      reinterpret_cast<int8_t*>(out_vec_dev_buffers.data()),
      agg_col_count * sizeof(int8_t*));
  std::vector<void*> param_ptrs;
  for (auto& param : kernel_params) {
    param_ptrs.push_back(&param);
  auto prepareTime = prepareClock->stop();
      << ": launchGpuCode: prepare: " << std::to_string(prepareTime) << " ms";
  launchClock->start();
  if (hoist_literals) {
    VLOG(1) << "Launching(" << kernel->name() << ") on device_id(" << device_id << ')';
    kernel->launch(grid_size_x,
                   optimize_cuda_block_and_grid_sizes);
    param_ptrs.erase(param_ptrs.begin() + LITERALS);
    VLOG(1) << "Launching(" << kernel->name() << ") on device_id(" << device_id << ')';
    kernel->launch(grid_size_x,
                   optimize_cuda_block_and_grid_sizes);
  auto launchTime = launchClock->stop();
      << ": launchGpuCode: cuLaunchKernel: " << std::to_string(launchTime)
  finishClock->start();
      &error_codes[0], err_desc, error_codes.size() * sizeof(error_codes[0]));
  if (*error_code > 0) {
  for (size_t i = 0; i < agg_col_count; ++i) {
    int64_t* host_out_vec = new int64_t[output_buffer_size_per_agg];
        host_out_vec, out_vec_dev_buffers[i], output_buffer_size_per_agg);
    out_vec.push_back(host_out_vec);
  const auto count_distinct_bitmap_device_mem =
  if (count_distinct_bitmap_device_mem) {
      reinterpret_cast<void*>(count_distinct_bitmap_device_mem),
  const auto varlen_output_gpu_buf = query_buffers_->getVarlenOutputPtr();
  if (varlen_output_gpu_buf) {
    const size_t varlen_output_buf_bytes =
        reinterpret_cast<void*>(varlen_output_gpu_buf),
        varlen_output_buf_bytes);
  if (allow_runtime_interrupt) {
    kernel->resetRuntimeInterrupter(device_id);
  auto finishTime = finishClock->stop();
      << ": launchGpuCode: finish: " << std::to_string(finishTime) << " ms";
// QueryExecutionContext::launchCpuCode: flattens the per-fragment row counts
// and offsets, sets up host-side output vectors and the join hash table
// pointer, then invokes the compiled CPU entry point (with or without hoisted
// literals), writing into the group-by buffers or into out_vec.
    const bool hoist_literals,
    const std::vector<int8_t>& literal_buff,
    std::vector<std::vector<const int8_t*>> col_buffers,
    const std::vector<std::vector<int64_t>>& num_rows,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    const int32_t scan_limit,
    const uint32_t start_rowid,
    const uint32_t num_tables,
    const std::vector<int8_t*>& join_hash_tables,
  std::vector<const int8_t**> multifrag_col_buffers;
  for (auto& col_buffer : col_buffers) {
    multifrag_col_buffers.push_back(col_buffer.empty() ? nullptr : col_buffer.data());
  const int8_t*** multifrag_cols_ptr{
      multifrag_col_buffers.empty() ? nullptr : &multifrag_col_buffers[0]};
  const uint32_t num_fragments =
      multifrag_cols_ptr ? static_cast<uint32_t>(col_buffers.size()) : uint32_t(0);
  const auto num_out_frags = multifrag_cols_ptr ? num_fragments : uint32_t(0);
  std::vector<int64_t*> out_vec;
  for (size_t i = 0; i < init_agg_vals.size(); ++i) {
    auto buff = new int64_t[num_out_frags];
    out_vec.push_back(static_cast<int64_t*>(buff));
  CHECK_EQ(num_rows.size(), col_buffers.size());
  std::vector<int64_t> flatened_num_rows;
  for (auto& nums : num_rows) {
    flatened_num_rows.insert(flatened_num_rows.end(), nums.begin(), nums.end());
  std::vector<uint64_t> flatened_frag_offsets;
  for (auto& offsets : frag_offsets) {
    flatened_frag_offsets.insert(
        flatened_frag_offsets.end(), offsets.begin(), offsets.end());
  const int64_t rowid_lookup_num_rows =
      start_rowid ? static_cast<int64_t>(start_rowid) + 1 : 0;
  int64_t const* num_rows_ptr;
  if (num_rows_to_process > 0) {
    num_rows_ptr = flatened_num_rows.data();
      rowid_lookup_num_rows ? &rowid_lookup_num_rows : flatened_num_rows.data();
  int32_t total_matched_init{0};
  std::vector<int64_t> cmpt_val_buff;
  int8_t* row_func_mgr_ptr = reinterpret_cast<int8_t*>(&mgr);
  const int64_t* join_hash_tables_ptr =
      join_hash_tables.size() == 1
          ? reinterpret_cast<const int64_t*>(join_hash_tables[0])
          : (join_hash_tables.size() > 1
                 ? reinterpret_cast<const int64_t*>(&join_hash_tables[0])
  VLOG(1) << "Calling " << native_code->name() << " hoist_literals(" << hoist_literals
  const int64_t* const init_agg_value =
      is_group_by ? cmpt_val_buff.data() : init_agg_vals.data();
  int64_t** const out =
      is_group_by ? query_buffers_->getGroupByBuffersPtr() : out_vec.data();
  if (hoist_literals) {
        flatened_frag_offsets.data(),
        join_hash_tables_ptr,
        flatened_frag_offsets.data(),
        join_hash_tables_ptr,
  if (rowid_lookup_num_rows && *error_code < 0) {
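// sizeofColBuffers / copyColBuffersToDevice: the device layout is a pointer
// table (one entry per fragment) followed by the flattened per-fragment column
// pointer arrays; the first loop fills the table, the second copies each
// fragment's column pointers.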
    std::vector<std::vector<int8_t const*>> const& col_buffers) const {
  if (size_t const num_fragments = col_buffers.size()) {
    size_t const col_bytes = col_buffers.front().size() * sizeof(int8_t const*);
    return num_fragments * sizeof(int8_t*) + num_fragments * col_bytes;

    std::vector<std::vector<int8_t const*>> const& col_buffers) const {
  if (size_t const num_fragments = col_buffers.size()) {
    size_t const col_bytes = col_buffers.front().size() * sizeof(int8_t const*);
    int8_t* col_buffer_ptr = device_ptr + num_fragments * sizeof(int8_t*);
    for (size_t i = 0; i < num_fragments; ++i) {
      gpu_allocator_->copyToDevice(device_ptr, &col_buffer_ptr, sizeof(int8_t*));
      device_ptr += sizeof(int8_t*);
      col_buffer_ptr += col_bytes;
    col_buffer_ptr = device_ptr;
    for (size_t i = 0; i < num_fragments; ++i) {
      CHECK_EQ(col_buffers.front().size(), col_buffers[i].size()) << i;
      gpu_allocator_->copyToDevice(col_buffer_ptr, col_buffers[i].data(), col_bytes);
      col_buffer_ptr += col_bytes;
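// sizeofFlattened2dVec / copyFlattened2dVecToDevice: each subvector is checked
// against the expected size and copied back-to-back into one device region.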
template <typename T>
    uint32_t const expected_subvector_size,
    std::vector<std::vector<T>> const& vec2d) const {
  return expected_subvector_size * vec2d.size() * sizeof(T);

template <typename T>
    uint32_t const expected_subvector_size,
    std::vector<std::vector<T>> const& vec2d) const {
  size_t const bytes_per_subvector = expected_subvector_size * sizeof(T);
  for (size_t i = 0; i < vec2d.size(); ++i) {
    CHECK_EQ(expected_subvector_size, vec2d[i].size()) << i << '/' << vec2d.size();
    gpu_allocator_->copyToDevice(device_ptr, vec2d[i].data(), bytes_per_subvector);
    device_ptr += bytes_per_subvector;
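// sizeofInitAggVals / copyInitAggValsToDevice: group-by queries appear to use
// the compacted init-value layout, other queries the plain init_agg_vals
// vector.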
    bool const is_group_by,
    std::vector<int64_t> const& init_agg_vals) const {
    return cmpt_sz * sizeof(int64_t);
  return init_agg_vals.size() * sizeof(int64_t);

    bool const is_group_by,
    std::vector<int64_t> const& init_agg_vals) const {
  } else if (init_agg_vals.size()) {
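// sizeofJoinHashTables / copyJoinHashTablesToDevice: with zero or one hash
// table no device copy is needed (the single pointer is passed directly);
// two or more tables are copied to the device as an array of pointers.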
    std::vector<int8_t*> const& join_hash_tables) const {
  return join_hash_tables.size() < 2u ? 0u : join_hash_tables.size() * sizeof(int8_t*);

    std::vector<int8_t*> const& join_hash_tables) const {
  switch (join_hash_tables.size()) {
      return join_hash_tables[0];
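// sizeofLiterals / copyLiteralsToDevice: when a count-distinct bitmap is in
// use, two 8-byte addresses (the host and device bitmap pointers) are written
// ahead of the hoisted literal buffer.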
    std::vector<int8_t> const& literal_buff) const {
  size_t const count_distinct_bytes =
      query_buffers_->getCountDistinctBitmapDevicePtr() ? 2 * sizeof(int64_t) : 0u;
  return count_distinct_bytes + literal_buff.size();

    std::vector<int8_t> const& literal_buff) const {
  int64_t count_distinct_addresses[2];
  size_t const count_distinct_bytes = query_buffers_->getCountDistinctBitmapDevicePtr()
      ? sizeof count_distinct_addresses
  CHECK_EQ(0u, uint64_t(device_ptr) % 8);
  if (count_distinct_bytes) {
    auto const count_distinct_bitmap_host_mem =
    CHECK(count_distinct_bitmap_host_mem);
    count_distinct_addresses[0] =
        reinterpret_cast<int64_t>(count_distinct_bitmap_host_mem);
    count_distinct_addresses[1] =
        static_cast<int64_t>(query_buffers_->getCountDistinctBitmapDevicePtr());
        device_ptr, count_distinct_addresses, count_distinct_bytes);
    device_ptr += count_distinct_bytes;
  if (!literal_buff.empty()) {
    gpu_allocator_->copyToDevice(device_ptr, literal_buff.data(), literal_buff.size());
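// copyValueToDevice / sizeofVector / copyVectorToDevice: small templated
// helpers that copy a single value or a whole std::vector<T> to the device.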
template <typename T>

template <typename T>
  return vec.size() * sizeof(T);

template <typename T>
    std::vector<T> const& vec) const {
  gpu_allocator_->copyToDevice(device_ptr, vec.data(), vec.size() * sizeof(T));
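// QueryExecutionContext::prepareKernelParams: computes an 8-byte-aligned size
// for every kernel parameter, packs them into a single device allocation, and
// returns the per-parameter device pointers as a KernelParams array.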
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<int8_t>& literal_buff,
    const std::vector<std::vector<int64_t>>& num_rows,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    const int32_t scan_limit,
    const std::vector<int64_t>& init_agg_vals,
    const std::vector<int32_t>& error_codes,
    const uint32_t num_tables,
    const std::vector<int8_t*>& join_hash_tables,
    const bool hoist_literals,
    const bool is_group_by) const {
  CHECK(literal_buff.empty() || hoist_literals) << literal_buff.size();
  CHECK_EQ(num_rows.size(), col_buffers.size());
  CHECK_EQ(frag_offsets.size(), col_buffers.size());
  param_sizes[NUM_TABLES] = align_to<8>(sizeof(num_tables));
  param_sizes[MAX_MATCHED] = align_to<8>(sizeof(scan_limit));
  auto const nbytes = std::accumulate(param_sizes.begin(), param_sizes.end(), size_t(0));
  for (size_t i = 1; i < params.size(); ++i) {
    params[i] = params[i - 1] + param_sizes[i - 1];
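// A minimal, self-contained sketch of the packing scheme used above, under the
// assumption that each kernel parameter gets an 8-byte-aligned slot inside one
// contiguous allocation; the names below are illustrative and not part of this
// file, and a plain host buffer stands in for the device allocation.
#include <array>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

constexpr size_t align_to_8(size_t n) {
  return (n + 7) & ~size_t(7);
}

int main() {
  // Pretend sizes for three parameters, each rounded up to a multiple of 8.
  std::array<size_t, 3> param_sizes{align_to_8(sizeof(uint32_t)),
                                    align_to_8(sizeof(int32_t)),
                                    align_to_8(64)};
  // One allocation of the summed size, as with nbytes above.
  const size_t nbytes =
      std::accumulate(param_sizes.begin(), param_sizes.end(), size_t(0));
  std::vector<int8_t> buffer(nbytes);
  // Each parameter's pointer is the previous pointer plus the previous size,
  // mirroring params[i] = params[i - 1] + param_sizes[i - 1] above.
  std::array<int8_t*, 3> params{};
  params[0] = buffer.data();
  for (size_t i = 1; i < params.size(); ++i) {
    params[i] = params[i - 1] + param_sizes[i - 1];
  }
  return 0;
}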