// Fragment of check_total_bitmap_memory(): estimates the total bitmap memory
// needed by the COUNT(DISTINCT) descriptors across all group-by entries.
const size_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
// ...
const size_t num_count_distinct_descs =
    query_mem_desc.getCountDistinctDescriptorsSize();
for (size_t i = 0; i < num_count_distinct_descs; i++) {
  // ...
}
// ...
int64_t total_bytes{0};
// ...
total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);

// alloc_group_by_buffer(): returns the group-by buffer pointer plus a flag
// indicating whether an existing (cached) buffer was reused.
std::pair<int64_t*, bool> alloc_group_by_buffer(
    const size_t numBytes,
    RenderAllocatorMap* render_allocator_map,
    const size_t thread_idx,
    RowSetMemoryOwner* mem_owner,
    const bool reuse_existing_buffer_for_thread) {
  if (render_allocator_map) {
    // ...
    const auto gpu_idx = 0;
    // ...
    return std::make_pair(
        reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes)), false);
  } else if (reuse_existing_buffer_for_thread) {
    // ... (reuse the thread's cached group-by buffer)
  }
  return std::make_pair(
      reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx)), false);
}

// get_consistent_frag_size(): if every fragment has the same size, return it;
// a zero size means "unbounded" and maps to the int64 maximum.
int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    // ...
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
    if (curr_size != frag_size) {
      // ...
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}

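// A minimal, self-contained sketch (illustrative only, not HeavyDB code) of
// the idea the function above implements: if every gap between consecutive
// fragment offsets is identical, that gap is the "consistent" fragment size,
// otherwise a sentinel is returned. All names below are hypothetical.
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

int64_t consistent_step_or_sentinel(const std::vector<uint64_t>& offsets) {
  if (offsets.size() < 2) {
    return -1;  // not enough fragments to infer a step
  }
  const uint64_t step = offsets[1] - offsets[0];
  for (size_t i = 2; i < offsets.size(); ++i) {
    if (offsets[i] - offsets[i - 1] != step) {
      return -1;  // fragments are not uniformly sized
    }
  }
  // A zero step degenerates to "no bound"; mirror that with the int64 maximum.
  return step == 0 ? std::numeric_limits<int64_t>::max()
                   : static_cast<int64_t>(step);
}
// e.g. {0, 100, 200, 300} -> 100; {0, 100, 250} -> -1
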
// get_consistent_frags_sizes(): per-table consistent fragment sizes, computed
// by collapsing each table's offsets with get_consistent_frag_size().
std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<std::vector<uint64_t>>& frag_offsets) {
  if (frag_offsets.empty()) {
    // ...
  }
  std::vector<int64_t> frag_sizes;
  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
    std::vector<uint64_t> tab_offs;
    for (auto& offsets : frag_offsets) {
      tab_offs.push_back(offsets[tab_idx]);
    }
    // ...
  }
  // ...
}

// Overload mapping the per-table fragment sizes onto the target expressions:
// column targets get their table's size, everything else gets -1.
std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<int64_t>& table_frag_sizes) {
  std::vector<int64_t> col_frag_sizes;
  for (auto expr : target_exprs) {
    if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
      if (col_var->get_rte_idx() < 0) {
        CHECK_EQ(-1, col_var->get_rte_idx());
        col_frag_sizes.push_back(int64_t(-1));
      } else {
        col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
      }
    } else {
      col_frag_sizes.push_back(int64_t(-1));
    }
  }
  return col_frag_sizes;
}

// get_col_frag_offsets(): per-fragment offsets, projected onto the target
// expressions (column targets get their table's offset, others get -1).
std::vector<std::vector<int64_t>> get_col_frag_offsets(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
  std::vector<std::vector<int64_t>> col_frag_offsets;
  for (auto& table_offsets : table_frag_offsets) {
    std::vector<int64_t> col_offsets;
    for (auto expr : target_exprs) {
      if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
        if (col_var->get_rte_idx() < 0) {
          CHECK_EQ(-1, col_var->get_rte_idx());
          col_offsets.push_back(int64_t(-1));
        } else {
          CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
          col_offsets.push_back(
              static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
        }
      } else {
        col_offsets.push_back(int64_t(-1));
      }
    }
    col_frag_offsets.push_back(col_offsets);
  }
  return col_frag_offsets;
}

// get_input_idx(): nest level of the outer table within the input descriptors.
int get_input_idx(RelAlgExecutionUnit const& ra_exe_unit, const shared::TableKey& outer_table_key) {
  auto match_table_key = [=](auto& desc) {
    return outer_table_key == desc.getTableKey();
  };
  auto const& input_descs = ra_exe_unit.input_descs;
  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match_table_key);
  return itr == input_descs.end() ? 0 : itr->getNestLevel();
}

// Fragment of check_count_distinct_expr_metadata(): sanity-checks each
// COUNT(DISTINCT)/APPROX_COUNT_DISTINCT target against the slot layout.
const size_t agg_col_count{query_mem_desc.getSlotCount()};
// ...
for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
     ++target_idx) {
  // ...
  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
  // ...
  CHECK(agg_info.is_agg && /* ... */);
  // ...
  CHECK(!agg_info.sql_type.is_varlen());
  // ...
  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
  // ...
  const auto& count_distinct_desc = /* ... */;
  // ...
}

// Fragment of collect_target_expr_metadata(): records which aggregate ops
// (e.g. MODE) need extra per-entry state.
if (auto const* agg_expr = dynamic_cast<Analyzer::AggExpr const*>(expr)) {
  if (agg_expr->get_aggtype() == kMODE) {
    // ...
  }
  // ...
}
// ...
return agg_op_metadata;

// QueryMemoryInitializer constructor (group-by / projection path); omitted
// stretches of the body are marked with "// ...".
QueryMemoryInitializer::QueryMemoryInitializer(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const ExecutorDeviceType device_type,
    const ExecutorDispatchMode dispatch_mode,
    const bool output_columnar,
    const bool sort_on_gpu,
    const shared::TableKey& outer_table_key,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    RenderAllocatorMap* render_allocator_map,
    RenderInfo* render_info,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    DeviceAllocator* gpu_allocator,
    const size_t thread_idx,
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    // ... (remaining member initializers)
{
  CHECK(!sort_on_gpu || output_columnar);
  executor->logSystemCPUMemoryStatus("Before Query Memory Initialization", thread_idx);
  // ...
  if (consistent_frag_sizes.empty()) {
    // ...
  }
  // ...
  size_t total_buffer_size{0};
  // ...
  if (buffer_size > 0) {
    total_buffer_size += buffer_size;
  }
  // ...
  size_t const capacity = /* ... */;
  // ...
  VLOG(2) << "row_set_mem_owner_->reserveTDigestMemory(" << thread_idx_ << ','
          << capacity << ") query_mem_desc.getEntryCount()("
          /* ... */;
  // ...
  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    // ...
  }
  // ...
      /* ... */ ? executor->blockSize() * executor->gridSize() : /* ... */;
  // ...
  size_t group_buffer_size{0};
  // ...
  group_buffer_size = num_rows * query_mem_desc.getRowSize();
  // ...
  CHECK_GE(group_buffer_size, size_t(0));
  // ...
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = reinterpret_cast<int64_t*>(/* ... */);
    // ...
  }
  // ...
      /* ... */ ? executor->blockSize() : /* ... */;
  // ...
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);
  // ...
  CHECK(varlen_buffer_elem_size_opt);
  auto const varlen_buffer_sz =
      query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value();
  auto varlen_output_buffer = /* ... */;
  // ...
  CHECK_EQ(group_buffers_count, size_t(1));
  // ...
  for (size_t i = 0; i < group_buffers_count; i += step) {
    // ... (alloc_group_by_buffer(...))
        render_allocator_map,
        // ...
    auto group_by_buffer = group_by_info.first;
    const bool was_cached = group_by_info.second;
    // ...
    if (group_by_buffer_template) {
      memcpy(group_by_buffer + index_buffer_qw,
             group_by_buffer_template,
             /* ... */);
    }
    // ...
    const bool use_target_exprs_union = /* ... */;
    // ...
    const auto column_frag_sizes = /* ... */;
    // ...
    result_sets_.resize(old_size + std::max(size_t(1), step));
    // ... (create the ResultSet for this buffer)
        executor->getColLazyFetchInfo(target_exprs),
        // ...
        executor->blockSize(),
        executor->gridSize());
    result_sets_[old_size]->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                            executor->plan_state_->init_agg_vals_,
                                            /* ... */);
    // ...
  }
}

// QueryMemoryInitializer constructor (table-function path); the leading
// parameters and most of the body are omitted in this fragment.
QueryMemoryInitializer::QueryMemoryInitializer(
    // ...
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    // ...
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    // ... (remaining member initializers)
{
  // ...
  if (consistent_frag_sizes.empty()) {
    // ...
  }
  // ...
  const size_t num_columns = /* ... */;
  // ...
  size_t total_group_by_buffer_size{0};
  for (size_t i = 0; i < num_columns; ++i) {
    auto ti = exe_unit.target_exprs[i]->get_type_info();
    if (ti.usesFlatBuffer()) {
      // ...
          static_cast<int64_t>(total_group_by_buffer_size + flatbuffer_size));
    } else {
      // ...
          static_cast<int64_t>(group_buffer_size + total_group_by_buffer_size));
    }
  }

#ifdef __SANITIZE_ADDRESS__
#define MAX_BUFFER_SIZE 0x10000000000ll
#else
#define MAX_BUFFER_SIZE 0x100000000000ll
#endif

  // ... (allocate the group-by buffer)
      row_set_mem_owner.get(),
      // ...
  group_by_buffers_.push_back(group_by_buffer);
  // ...
  const auto column_frag_offsets = /* ... */;
  const auto column_frag_sizes = /* ... */;
  // ...
  result_sets_.emplace_back(
      // ...
      executor->blockSize(),
      executor->gridSize()));
  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                       /* ... */);
}

// initGroupByBuffer(): dispatches to the columnar or row-wise initializer;
// the row-wise path also lays out the streaming top-N heaps when needed.
void QueryMemoryInitializer::initGroupByBuffer(
    int64_t* buffer,
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    TargetAggOpsMetadata& agg_expr_metadata,
    const ExecutorDeviceType device_type,
    const bool output_columnar,
    const Executor* executor) {
  if (output_columnar) {
    // ...
  } else {
    auto rows_ptr = buffer;
    // ...
    const auto thread_count =
        /* ... */ ? executor->blockSize() * executor->gridSize() : /* ... */;
    // ...
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    // ... (streaming top-N path)
    const auto node_count_size = thread_count * sizeof(int64_t);
    memset(rows_ptr, 0, node_count_size);
    // ...
    memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
    rows_ptr += rows_offset / sizeof(int64_t);
    actual_entry_count = n * thread_count;
    // ...
  }
}

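// Minimal standalone sketch (illustrative, simplified; not the code above) of
// the streaming top-N CPU buffer layout that initGroupByBuffer() sets up: the
// buffer starts with one int64 per thread holding that thread's current row
// count (zeroed), followed by the per-thread heaps, whose slots are filled
// with an "empty" marker (-1). The real code computes the heap offset via the
// streaming_top_n helpers; here the heaps are assumed to start right after
// the count header.
#include <cstdint>
#include <cstring>

void init_streaming_top_n_buffer(int64_t* buffer,
                                 const size_t thread_count,
                                 const size_t n /* heap capacity per thread */) {
  const size_t node_count_size = thread_count * sizeof(int64_t);
  std::memset(buffer, 0, node_count_size);  // per-thread row counts
  // 0xFF bytes give every int64 heap slot the value -1 ("empty").
  std::memset(buffer + thread_count, -1, thread_count * n * sizeof(int64_t));
}
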
// initRowGroups(): initializes row-wise group-by buffers, either by stamping a
// pre-initialized sample row across all entries in parallel (TBB) or by
// initializing each bin's columns individually.
void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
                                           int64_t* groups_buffer,
                                           const std::vector<int64_t>& init_vals,
                                           TargetAggOpsMetadata& agg_op_metadata,
                                           const int32_t groups_buffer_entry_count,
                                           const size_t warp_size,
                                           const Executor* executor,
                                           const RelAlgExecutionUnit& ra_exe_unit) {
  // ...
  const size_t row_size{query_mem_desc.getRowSize()};
  // ...
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  const auto query_mem_desc_fixedup = /* ... */;
  // ...
  std::vector<int8_t> sample_row(row_size - col_base_off);
  auto const num_available_cpu_threads = /* ... */;
  // ...
  tbb::task_arena initialization_arena(num_available_cpu_threads);
  // ... (initialize the sample row once, then replicate it below)
      query_mem_desc_fixedup, sample_row.data(), init_vals, agg_op_metadata);
  // ...
  CHECK(warp_size >= 1);
  CHECK(key_count == 1 || warp_size == 1);
  initialization_arena.execute([&] {
    // ...
        tbb::blocked_range<size_t>(0, groups_buffer_entry_count * warp_size),
        [&](const tbb::blocked_range<size_t>& r) {
          auto cur_row_buf = buffer_ptr + (row_size * r.begin());
          for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
            memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
          }
        });
  });
  // ...
  initialization_arena.execute([&] {
    // ...
        tbb::blocked_range<size_t>(0, groups_buffer_entry_count),
        [&](const tbb::blocked_range<size_t>& r) {
          auto cur_row_buf = buffer_ptr + (row_size * r.begin());
          for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
            memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
          }
        });
  });
  // ...
  CHECK(warp_size >= 1);
  CHECK(key_count == 1 || warp_size == 1);
  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
    for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
         ++bin, buffer_ptr += row_size) {
      // ...
          &buffer_ptr[col_base_off],
      // ...
    }
  }
  // ...
  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
       ++bin, buffer_ptr += row_size) {
    // ...
        query_mem_desc_fixedup, &buffer_ptr[col_base_off], init_vals, agg_op_metadata);
  }
}

// initColumnarBuffer<T>: fills entry_count slots with init_val and returns the
// pointer just past the column so the next slot can be initialized in turn.
template <typename T>
int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

// initColumnarGroups(): initializes a columnar group-by buffer; key columns
// first, then each aggregate slot according to its padded width.
void QueryMemoryInitializer::initColumnarGroups(
    const QueryMemoryDescriptor& query_mem_desc,
    int64_t* groups_buffer,
    const std::vector<int64_t>& init_vals,
    const Executor* executor,
    const RelAlgExecutionUnit& ra_exe_unit) {
  CHECK(groups_buffer);
  // ...
  for (const auto target_expr : ra_exe_unit.target_exprs) {
    // ...
  }
  // ...
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  // ...
  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  // ...
  for (size_t i = 0; i < key_count; ++i) {
    buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                             /* ... */,
                                             groups_buffer_entry_count);
  }
  // ...
  int32_t init_val_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    // ...
    CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
    // ... (dispatch on the slot's padded width)
    buffer_ptr = initColumnarBuffer<int8_t>(
        buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
    // ...
    initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                init_vals[init_val_idx++],
                                groups_buffer_entry_count);
    // ...
    initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                init_vals[init_val_idx++],
                                groups_buffer_entry_count);
    // ...
    initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                init_vals[init_val_idx++],
                                groups_buffer_entry_count);
    // ...
  }
}

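// Standalone sketch (illustrative only; all names below are hypothetical) of
// the pattern initColumnarGroups() follows: walk the slots and, for each one,
// write entry_count copies of its init value at the slot's width, advancing a
// shared cursor so the columns end up packed back to back. It assumes one
// init value per slot.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
int8_t* fill_column(int8_t* cursor, const T init_val, const uint32_t entry_count) {
  for (uint32_t i = 0; i < entry_count; ++i) {
    std::memcpy(cursor + i * sizeof(T), &init_val, sizeof(T));
  }
  return cursor + entry_count * sizeof(T);  // next column starts here
}

int8_t* init_columnar_slots(int8_t* cursor,
                            const std::vector<int8_t>& slot_widths,
                            const std::vector<int64_t>& init_vals,
                            const uint32_t entry_count) {
  size_t init_val_idx = 0;
  for (const auto width : slot_widths) {
    const int64_t v = init_vals[init_val_idx++];
    switch (width) {
      case 1:
        cursor = fill_column<int8_t>(cursor, static_cast<int8_t>(v), entry_count);
        break;
      case 2:
        cursor = fill_column<int16_t>(cursor, static_cast<int16_t>(v), entry_count);
        break;
      case 4:
        cursor = fill_column<int32_t>(cursor, static_cast<int32_t>(v), entry_count);
        break;
      default:
        cursor = fill_column<int64_t>(cursor, v, entry_count);
        break;
    }
  }
  return cursor;
}
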
// initColumnsPerRow(): writes the per-slot init values into one row, special
// casing aggregates that store pointers (APPROX_QUANTILE t-digests, MODE
// state) before falling back to the width-based switch.
void QueryMemoryInitializer::initColumnsPerRow(
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* row_ptr,
    const std::vector<int64_t>& init_vals,
    const TargetAggOpsMetadata& agg_op_metadata) {
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  size_t approx_quantile_descriptors_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       /* ... */) {
    // ... (APPROX_QUANTILE targets get a freshly allocated t-digest)
    auto const& desc = descs.at(approx_quantile_descriptors_idx++);
    init_val = reinterpret_cast<int64_t>(/* ... */);
    // ...
    } else if (agg_op_metadata.has_mode && /* ... */) {
      // ... (MODE targets get a pointer to their aggregation state)
    }
    // ...
    if (init_val == 0 && col_slot_width > 0) {
      CHECK_LT(init_vec_idx, init_vals.size());
      init_val = init_vals[init_vec_idx++];
    }
    switch (col_slot_width) {
      // ...
        *col_ptr = static_cast<int8_t>(init_val);
      // ...
        *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
      // ...
        *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
      // ...
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
      // ...
    }
  }
}

// Fragment of allocateCountDistinctGpuMem(): sums the padded bitmap size of
// every bitmap-based COUNT(DISTINCT) target to size the per-entry allocation.
size_t total_bytes_per_entry{0};
const size_t num_count_distinct_descs =
    query_mem_desc.getCountDistinctDescriptorsSize();
// ...
for (size_t i = 0; i < num_count_distinct_descs; i++) {
  // ...
  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
}

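// Back-of-the-envelope sketch (illustrative only, not HeavyDB code) of the
// sizing arithmetic the fragments above perform: each bitmap-based
// COUNT(DISTINCT) target needs its padded bitmap per group-by entry, so the
// total is entry_count * sum(padded bitmap bytes of each bitmap target).
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t total_count_distinct_bitmap_bytes(
    const size_t entry_count,
    const std::vector<size_t>& bitmap_padded_bytes) {
  size_t total_bytes_per_entry = 0;
  for (const auto sz : bitmap_padded_bytes) {
    total_bytes_per_entry += sz;  // one bitmap per target, per entry
  }
  return static_cast<int64_t>(total_bytes_per_entry * entry_count);
}
// e.g. 1,000,000 entries with two 128-byte bitmaps -> 256,000,000 bytes.
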
// calculateCountDistinctBufferSize(): per-slot bitmap sizes for
// COUNT(DISTINCT) targets; -1 marks targets that use a set instead of a bitmap.
const size_t agg_col_count{query_mem_desc.getSlotCount()};
std::vector<int64_t> agg_bitmap_size(agg_col_count);
for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
     ++target_idx) {
  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
  // ...
  const auto& count_distinct_desc = /* ... */;
  // ...
  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
  // ...
  agg_bitmap_size[agg_col_idx] = -1;
  // ...
}
return agg_bitmap_size;

// Fragment of allocateCountDistinctBuffers(): walks the targets and allocates
// a bitmap or a set for each COUNT(DISTINCT) aggregate.
for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
     ++target_idx) {
  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
  // ...
  const auto& count_distinct_desc = /* ... */;
  // ...
}

// Fragments of allocateCountDistinctBitmap() / allocateCountDistinctSet():
// register the bitmap (or set) with the row-set memory owner and hand its
// address back as an int64 init value.
    // ...
    ptr, bitmap_byte_sz, false);
return reinterpret_cast<int64_t>(ptr);
// ...
return reinterpret_cast<int64_t>(
    /* ... */);
// ...
return reinterpret_cast<int64_t>(count_distinct_set);

// Fragment of initializeModeIndexSet(): records the output slot index of every
// MODE aggregate target.
size_t const slot_count = query_mem_desc.getSlotCount();
// ... (per-aggregate-target lambda:)
                  size_t const target_idx) {
  // ...
  mode_index_set.emplace(agg_col_idx);
  // ...
});
// ...
return mode_index_set;

// Fragment of allocateModeBuffer(): for each MODE target, allocates the
// aggregation state and stores its address as that slot's init value.
size_t const slot_count = query_mem_desc.getSlotCount();
// ... (per-aggregate-target lambda:)
                  size_t const target_idx) {
  // ...
  init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
  // ...
});

// initializeQuantileParams(): per-slot quantile arguments for APPROX_QUANTILE
// targets (unset for all other slots).
std::vector<QueryMemoryInitializer::QuantileParam>
QueryMemoryInitializer::initializeQuantileParams(
    const QueryMemoryDescriptor& query_mem_desc,
    const RelAlgExecutionUnit& ra_exe_unit) {
  size_t const slot_count = query_mem_desc.getSlotCount();
  // ...
  std::vector<QuantileParam> quantile_params(slot_count);
  // ... (per-aggregate-target lambda:)
                    size_t const target_idx) {
    // ...
    CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
             /* ... */);
    // ...
    quantile_params[agg_col_idx] = q_expr->get_constval().doubleval;
  });
  return quantile_params;
}

// Fragment of allocateTDigestsBuffer(): allocates a t-digest per
// APPROX_QUANTILE target, sized by its ApproxQuantileDescriptor.
size_t const slot_count = query_mem_desc.getSlotCount();
// ...
size_t approx_quantile_descriptors_idx = 0u;
// ... (per-aggregate-target lambda:)
                  size_t const target_idx) {
  // ...
  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
           /* ... */);
  // ...
  auto const q = q_expr->get_constval().doubleval;
  auto const& desc = descs.at(approx_quantile_descriptors_idx++);
  // ...
});

// prepareTopNHeapsDevBuffer(): lays out the per-thread streaming top-N heaps
// on the device and initializes the heap rows from the device-side init values.
GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
    const QueryMemoryDescriptor& query_mem_desc,
    const int8_t* init_agg_vals_dev_ptr,
    const size_t n,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  // ...
  const auto thread_count = block_size_x * grid_size_x;
  const auto total_buff_size = /* ... */;
  // ...
  std::vector<int8_t*> dev_buffers(thread_count);
  // ...
  for (size_t i = 0; i < thread_count; ++i) {
    dev_buffers[i] = dev_buffer;
  }
  // ... (copy the per-thread pointers to the device)
      dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
  // ... (zero the per-thread row counts)
      thread_count * sizeof(int64_t));
  // ... (mark the heap slots as empty)
      reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
      // ...
      thread_count * n * sizeof(int64_t));
  // ... (initialize the heap rows on the device)
      reinterpret_cast<int64_t*>(/* ... */),
      // ...
      reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
      // ...
      query_mem_desc.getRowSize() / sizeof(int64_t),
      // ...
  return {dev_ptr, dev_buffer};
}

// createAndInitializeGroupByBufferGpu(): allocates the device group-by buffers
// and initializes them with the device-side init values, handling the
// streaming top-N, varlen-output, and columnar layouts.
GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int8_t* init_agg_vals_dev_ptr,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int8_t warp_size,
    const bool can_sort_on_gpu,
    const bool output_columnar,
    RenderAllocator* render_allocator) {
  // ...
  if (render_allocator) {
    // ...
  }
  // ... (streaming top-N path)
  CHECK(!output_columnar);
  // ...
      query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  // ...
  auto dev_group_by_buffers = /* ... */;
  // ...
  CHECK(dev_group_by_buffers.varlen_output_buffer);
  // ...
      reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
  // ...
  const size_t varlen_output_buf_bytes = /* ... */;
  // ...
  if (render_allocator) {
    // ...
  }
  // ...
  CHECK(!render_allocator);
  // ...
  auto group_by_dev_buffer = dev_group_by_buffers.data;
  const size_t col_count = query_mem_desc.getSlotCount();
  int8_t* col_widths_dev_ptr{nullptr};
  if (output_columnar) {
    std::vector<int8_t> compact_col_widths(col_count);
    for (size_t idx = 0; idx < col_count; ++idx) {
      // ...
    }
    // ... (copy the compact widths to the device)
        col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
  }
  // ...
  const int8_t warp_count = /* ... */;
  const auto num_group_by_buffers = /* ... */;
  // ...
  for (size_t i = 0; i < num_group_by_buffers; i += step) {
    if (output_columnar) {
      // ... (init_columnar_group_by_buffer_on_device)
          reinterpret_cast<int64_t*>(group_by_dev_buffer),
          reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
          dev_group_by_buffers.entry_count,
          // ...
    } else {
      // ... (init_group_by_buffer_on_device)
          reinterpret_cast<int64_t*>(group_by_dev_buffer),
          reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
          dev_group_by_buffers.entry_count,
          // ...
          query_mem_desc.getRowSize() / sizeof(int64_t),
          // ...
    }
    // ...
    group_by_dev_buffer += groups_buffer_size;
  }
  // ...
  return dev_group_by_buffers;
}

// setupTableFunctionGpuBuffers(): allocates one device buffer per output
// column, records each column's byte offset, and publishes the per-column
// device pointers.
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const bool zero_initialize_buffers) {
  // ...
  size_t total_group_by_buffer_size{0};
  // ...
  std::vector<size_t> col_byte_offsets;
  col_byte_offsets.reserve(num_columns);
  // ...
  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    // ...
    size_t group_buffer_size = num_rows_ * col_width;
    col_byte_offsets.emplace_back(total_group_by_buffer_size);
    total_group_by_buffer_size =
        /* ... */;
  }
  // ...
  int8_t* dev_buffers_allocation{nullptr};
  // ...
  CHECK(dev_buffers_allocation);
  if (zero_initialize_buffers) {
    // ...
  }
  // ...
  auto dev_buffers_mem = dev_buffers_allocation;
  std::vector<int8_t*> dev_buffers(num_columns);
  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
  }
  // ... (copy the per-column pointers to the device)
      dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
  // ...
  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
}

// copyFromTableFunctionGpuBuffers(): copies each output column back from the
// device, compacting from the allocated capacity down to the produced
// entry_count while keeping the columns int64-aligned.
void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t entry_count,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  // ...
  int8_t* dev_buffer = gpu_group_by_buffers.data;
  // ...
  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
  CHECK_LE(entry_count, original_entry_count);
  size_t output_device_col_offset{0};
  size_t output_host_col_offset{0};
  // ...
  auto allocator = std::make_unique<CudaAllocator>(
      /* ... */);
  // ...
  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    // ...
    const size_t output_device_col_size = original_entry_count * col_width;
    const size_t output_host_col_size = entry_count * col_width;
    allocator->copyFromDevice(host_buffer + output_host_col_offset,
                              dev_buffer + output_device_col_offset,
                              output_host_col_size);
    output_device_col_offset =
        align_to_int64(output_device_col_offset + output_device_col_size);
    output_host_col_offset =
        /* ... */;
  }
}

// ... (fragment of a GPU thread-count computation)
    /* ... */ : executor->blockSize() * /* ... */;

// compact_projection_buffer_for_cpu_columnar(): slides each projected column
// forward so that only projection_count rows per column remain, using memmove
// when the source and destination ranges can overlap.
void compact_projection_buffer_for_cpu_columnar(
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count) {
  // ...
  constexpr size_t row_index_width = sizeof(int64_t);
  size_t buffer_offset1{projection_count * row_index_width};
  // ...
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    // ...
    auto column_proj_size = /* ... */;
    // ...
    if (buffer_offset1 + column_proj_size >= buffer_offset2) {
      // Overlapping ranges: memmove is required.
      std::memmove(projection_buffer + buffer_offset1,
                   projection_buffer + buffer_offset2,
                   /* ... */);
    } else {
      std::memcpy(projection_buffer + buffer_offset1,
                  projection_buffer + buffer_offset2,
                  /* ... */);
    }
    // ...
  }
}

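// Minimal standalone sketch (illustrative only; names are hypothetical) of the
// compaction idea above: columns were laid out for `capacity` entries, but
// only `count` rows were actually produced, so each column is slid forward
// until the columns are contiguous again. memmove is used because the source
// and destination ranges can overlap.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void compact_columns(int8_t* buffer,
                     const std::vector<size_t>& col_widths,
                     const size_t capacity,
                     const size_t count) {
  size_t write_off = 0;  // compacted (destination) offset
  size_t read_off = 0;   // original (source) offset
  for (const auto width : col_widths) {
    std::memmove(buffer + write_off, buffer + read_off, count * width);
    write_off += count * width;     // next column starts right after `count` rows
    read_off += capacity * width;   // source columns were spaced `capacity` rows apart
  }
}
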
// compactProjectionBuffersCpu(): clamps the projection output to the number of
// allocated rows, compacts the columnar buffer, and updates the result set.
void QueryMemoryInitializer::compactProjectionBuffersCpu(
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t projection_count) {
  const auto num_allocated_rows = /* ... */;
  // ...
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  // ... (compact the projection buffer down to num_allocated_rows)
      num_allocated_rows);
  // ...
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

// compactProjectionBuffersGpu(): same as the CPU variant, but the compacted
// columns are first copied back from the device projection buffer.
void QueryMemoryInitializer::compactProjectionBuffersGpu(
    const QueryMemoryDescriptor& query_mem_desc,
    Data_Namespace::DataMgr* data_mgr,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const size_t projection_count,
    const int device_id) {
  // ...
  const auto num_allocated_rows = /* ... */;
  // ...
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  // ... (copy the projection columns back from the device)
      gpu_group_by_buffers,
      // ...
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

// copyGroupByBuffersFromGpu(): copies the (optionally index-prefixed) group-by
// buffers back from the device into the host-side buffers.
void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
    DeviceAllocator& device_allocator,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t entry_count,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit* ra_exe_unit,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const bool prepend_index_buffer) const {
  const auto thread_count = block_size_x * grid_size_x;
  // ...
  size_t total_buff_size{0};
  // ... (copy_group_by_buffers_from_gpu)
      gpu_group_by_buffers.data,
      // ...
      prepend_index_buffer,
      // ...
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  // ...
}

// applyStreamingTopNOffsetGpu(): pulls the top-N rows out of the per-thread
// device heaps and rewrites the host group-by buffer with them.
void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit& ra_exe_unit,
    const unsigned total_thread_count,
    const int device_id) {
  // ...
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  // ...
  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      // ...
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
      // ...
}