OmniSciDB  a5dc49c757
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 #include "Execute.h"
19 #include "GpuInitGroups.h"
20 #include "Logger/Logger.h"
23 #include "Shared/checked_alloc.h"
24 #include "StreamingTopN.h"
25 #include "Utils/FlatBuffer.h"
26 
27 // 8 GB, the limit of perfect hash group by under normal conditions
28 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
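 // This cap is enforced per query in check_total_bitmap_memory() below:
 // (sum of per-group bitmap bytes) * groups_buffer_entry_count must stay under it.
 // E.g. a 32 KB bitmap per group over 1M groups (roughly 32 GB) throws OutOfHostMemory.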
29 
30 namespace {
31 
32 struct AddNbytes {
33  size_t const entry_count;
34  size_t operator()(size_t const sum, ApproxQuantileDescriptor const aqd) const {
35  return sum +
36  entry_count * quantile::TDigest::nbytes(aqd.buffer_size, aqd.centroids_size);
37  }
38 };
39 
40 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
41  const size_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
42  checked_int64_t total_bytes_per_group = 0;
43  const size_t num_count_distinct_descs =
44  query_mem_desc.getCountDistinctDescriptorsSize();
45  for (size_t i = 0; i < num_count_distinct_descs; i++) {
46  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
47  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
48  continue;
49  }
50  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
51  }
52  int64_t total_bytes{0};
53  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
54  // caught
55  try {
56  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
57  } catch (...) {
58  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
59  // Don't bother to report the real amount, this is unlikely to ever happen.
60  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
61  }
62  if (total_bytes >= g_bitmap_memory_limit) {
63  throw OutOfHostMemory(total_bytes);
64  }
65 }
66 
67 std::pair<int64_t*, bool> alloc_group_by_buffer(
68  const size_t numBytes,
69  RenderAllocatorMap* render_allocator_map,
70  const size_t thread_idx,
71  RowSetMemoryOwner* mem_owner,
72  const bool reuse_existing_buffer_for_thread) {
73  if (render_allocator_map) {
74  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
75  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
76  // memory.
77  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
78  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
79  return std::make_pair(
80  reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes)), false);
81  } else if (reuse_existing_buffer_for_thread) {
82  return mem_owner->allocateCachedGroupByBuffer(numBytes, thread_idx);
83  }
84  return std::make_pair(
85  reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx)), false);
86 }
87 
88 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
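 // Returns the uniform stride between consecutive fragment offsets, e.g.
 // {0, 100, 200, 300} -> 100; returns -1 for fewer than two offsets or a
 // non-uniform stride, and int64_t max when the stride is zero.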
89  if (frag_offsets.size() < 2) {
90  return int64_t(-1);
91  }
92  const auto frag_size = frag_offsets[1] - frag_offsets[0];
93  for (size_t i = 2; i < frag_offsets.size(); ++i) {
94  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
95  if (curr_size != frag_size) {
96  return int64_t(-1);
97  }
98  }
99  return !frag_size ? std::numeric_limits<int64_t>::max()
100  : static_cast<int64_t>(frag_size);
101 }
102 
103 inline std::vector<int64_t> get_consistent_frags_sizes(
104  const std::vector<std::vector<uint64_t>>& frag_offsets) {
105  if (frag_offsets.empty()) {
106  return {};
107  }
108  std::vector<int64_t> frag_sizes;
109  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
110  std::vector<uint64_t> tab_offs;
111  for (auto& offsets : frag_offsets) {
112  tab_offs.push_back(offsets[tab_idx]);
113  }
114  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
115  }
116  return frag_sizes;
117 }
118 
119 inline std::vector<int64_t> get_consistent_frags_sizes(
120  const std::vector<Analyzer::Expr*>& target_exprs,
121  const std::vector<int64_t>& table_frag_sizes) {
122  std::vector<int64_t> col_frag_sizes;
123  for (auto expr : target_exprs) {
124  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
125  if (col_var->get_rte_idx() < 0) {
126  CHECK_EQ(-1, col_var->get_rte_idx());
127  col_frag_sizes.push_back(int64_t(-1));
128  } else {
129  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
130  }
131  } else {
132  col_frag_sizes.push_back(int64_t(-1));
133  }
134  }
135  return col_frag_sizes;
136 }
137 
138 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
139  const std::vector<Analyzer::Expr*>& target_exprs,
140  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
141  std::vector<std::vector<int64_t>> col_frag_offsets;
142  for (auto& table_offsets : table_frag_offsets) {
143  std::vector<int64_t> col_offsets;
144  for (auto expr : target_exprs) {
145  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
146  if (col_var->get_rte_idx() < 0) {
147  CHECK_EQ(-1, col_var->get_rte_idx());
148  col_offsets.push_back(int64_t(-1));
149  } else {
150  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
151  col_offsets.push_back(
152  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
153  }
154  } else {
155  col_offsets.push_back(int64_t(-1));
156  }
157  }
158  col_frag_offsets.push_back(col_offsets);
159  }
160  return col_frag_offsets;
161 }
162 
163 // Return the RelAlg input index of outer_table_id based on ra_exe_unit.input_descs.
164 // Used by UNION queries to get the target_exprs corresponding to the current subquery.
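// E.g. for UNION ALL, a match on outer_table_key returns that InputDescriptor's
// nest level; a non-zero result makes the constructor below read
// ra_exe_unit.target_exprs_union instead of ra_exe_unit.target_exprs.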
165 int get_input_idx(RelAlgExecutionUnit const& ra_exe_unit,
166  const shared::TableKey& outer_table_key) {
167  auto match_table_key = [=](auto& desc) {
168  return outer_table_key == desc.getTableKey();
169  };
170  auto& input_descs = ra_exe_unit.input_descs;
171  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match_table_key);
172  return itr == input_descs.end() ? 0 : itr->getNestLevel();
173 }
174 
175 void check_count_distinct_expr_metadata(const QueryMemoryDescriptor& query_mem_desc,
176  const RelAlgExecutionUnit& ra_exe_unit) {
177  const size_t agg_col_count{query_mem_desc.getSlotCount()};
178  CHECK_GE(agg_col_count, ra_exe_unit.target_exprs.size());
179  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
180  ++target_idx) {
181  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
182  const auto agg_info = get_target_info(target_expr, g_bigint_count);
183  if (is_distinct_target(agg_info)) {
184  CHECK(agg_info.is_agg &&
185  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kCOUNT_IF ||
186  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
187  CHECK(!agg_info.sql_type.is_varlen());
188  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
189  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
190  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
191  sizeof(int64_t));
192  const auto& count_distinct_desc =
193  query_mem_desc.getCountDistinctDescriptor(target_idx);
194  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
195  }
196  }
197 }
198 
199 QueryMemoryInitializer::TargetAggOpsMetadata collect_target_expr_metadata(
200  const QueryMemoryDescriptor& query_mem_desc,
201  const RelAlgExecutionUnit& ra_exe_unit) {
202  QueryMemoryInitializer::TargetAggOpsMetadata agg_op_metadata;
203  if (!query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
204  agg_op_metadata.has_count_distinct = true;
205  }
206  std::for_each(
207  ra_exe_unit.target_exprs.begin(),
208  ra_exe_unit.target_exprs.end(),
209  [&agg_op_metadata](const Analyzer::Expr* expr) {
210  if (auto const* agg_expr = dynamic_cast<Analyzer::AggExpr const*>(expr)) {
211  if (agg_expr->get_aggtype() == kMODE) {
212  agg_op_metadata.has_mode = true;
213  } else if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
214  agg_op_metadata.has_tdigest = true;
215  }
216  }
217  });
218  return agg_op_metadata;
219 }
220 
221 } // namespace
222 
223 // Row-based execution constructor
224 QueryMemoryInitializer::QueryMemoryInitializer(
225  const RelAlgExecutionUnit& ra_exe_unit,
226  const QueryMemoryDescriptor& query_mem_desc,
227  const int device_id,
228  const ExecutorDeviceType device_type,
229  const ExecutorDispatchMode dispatch_mode,
230  const bool output_columnar,
231  const bool sort_on_gpu,
232  const shared::TableKey& outer_table_key,
233  const int64_t num_rows,
234  const std::vector<std::vector<const int8_t*>>& col_buffers,
235  const std::vector<std::vector<uint64_t>>& frag_offsets,
236  RenderAllocatorMap* render_allocator_map,
237  RenderInfo* render_info,
238  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
239  DeviceAllocator* device_allocator,
240  const size_t thread_idx,
241  const Executor* executor)
242  : num_rows_(num_rows)
243  , row_set_mem_owner_(row_set_mem_owner)
244  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
245  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
252  , device_allocator_(device_allocator)
253  , thread_idx_(thread_idx) {
254  CHECK(!sort_on_gpu || output_columnar);
255  executor->logSystemCPUMemoryStatus("Before Query Memory Initialization", thread_idx);
256 
257  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
258  if (consistent_frag_sizes.empty()) {
259  // No fragments in the input, no underlying buffers will be needed.
260  return;
261  }
262 
263  TargetAggOpsMetadata agg_op_metadata =
264  collect_target_expr_metadata(query_mem_desc, ra_exe_unit);
265  if (agg_op_metadata.has_count_distinct) {
266  check_count_distinct_expr_metadata(query_mem_desc, ra_exe_unit);
267  if (!ra_exe_unit.use_bump_allocator) {
268  check_total_bitmap_memory(query_mem_desc);
269  }
270  if (device_type == ExecutorDeviceType::GPU) {
271  allocateCountDistinctGpuMem(query_mem_desc);
272  }
273  agg_op_metadata.count_distinct_buf_size =
274  calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit);
275  size_t total_buffer_size{0};
276  for (auto buffer_size : agg_op_metadata.count_distinct_buf_size) {
277  if (buffer_size > 0) {
278  total_buffer_size += buffer_size;
279  }
280  }
281  total_buffer_size *= query_mem_desc.getEntryCount();
282  row_set_mem_owner_->initCountDistinctBufferAllocator(total_buffer_size, thread_idx_);
283  }
284 
285  if (agg_op_metadata.has_tdigest) {
286  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
287  // Pre-allocate all TDigest memory for this thread.
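 // The reserved capacity accumulates, per APPROX_QUANTILE descriptor,
 // entry_count times that descriptor's per-digest byte requirement
 // (see AddNbytes above).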
288  AddNbytes const add_nbytes{query_mem_desc.getEntryCount()};
289  size_t const capacity =
290  std::accumulate(descs.begin(), descs.end(), size_t(0), add_nbytes);
291  VLOG(2) << "row_set_mem_owner_->reserveTDigestMemory(" << thread_idx_ << ','
292  << capacity << ") query_mem_desc.getEntryCount()("
293  << query_mem_desc.getEntryCount() << ')';
294  row_set_mem_owner_->reserveTDigestMemory(thread_idx_, capacity);
295  }
296 
297  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
298  if (agg_op_metadata.has_count_distinct) {
299  allocateCountDistinctBuffers(query_mem_desc, ra_exe_unit);
300  }
301  if (agg_op_metadata.has_mode) {
302  allocateModeBuffer(query_mem_desc, ra_exe_unit);
303  }
304  if (agg_op_metadata.has_tdigest) {
305  allocateTDigestsBuffer(query_mem_desc, ra_exe_unit);
306  }
307  if (render_info && render_info->useCudaBuffers()) {
308  return;
309  }
310  }
311 
312  if (query_mem_desc.isGroupBy()) {
313  if (agg_op_metadata.has_mode) {
314  agg_op_metadata.mode_index_set =
315  initializeModeIndexSet(query_mem_desc, ra_exe_unit);
316  }
317  if (agg_op_metadata.has_tdigest) {
318  agg_op_metadata.quantile_params =
319  initializeQuantileParams(query_mem_desc, ra_exe_unit);
320  }
321  }
322 
323  if (ra_exe_unit.estimator) {
324  return;
325  }
326 
327  const auto thread_count = device_type == ExecutorDeviceType::GPU
328  ? executor->blockSize() * executor->gridSize()
329  : 1;
330 
331  size_t group_buffer_size{0};
332  if (ra_exe_unit.use_bump_allocator) {
333  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
334  // the fragment
335  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
336  group_buffer_size = num_rows * query_mem_desc.getRowSize();
337  } else {
338  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
339  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
340  }
341  } else {
342  group_buffer_size =
343  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
344  }
345  CHECK_GE(group_buffer_size, size_t(0));
346 
347  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
348  int64_t* group_by_buffer_template{nullptr};
349 
350  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
351  group_by_buffer_template = reinterpret_cast<int64_t*>(
352  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
353  initGroupByBuffer(group_by_buffer_template,
354  ra_exe_unit,
355  query_mem_desc,
356  agg_op_metadata,
357  device_type,
358  output_columnar,
359  executor);
360  }
361 
362  if (query_mem_desc.interleavedBins(device_type)) {
363  CHECK(query_mem_desc.hasKeylessHash());
364  }
365 
366  const auto step = device_type == ExecutorDeviceType::GPU &&
367  query_mem_desc.threadsShareMemory() &&
368  query_mem_desc.isGroupBy()
369  ? executor->blockSize()
370  : size_t(1);
371  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
372  query_mem_desc.hasKeylessHash()
373  ? query_mem_desc.getEntryCount()
374  : size_t(0);
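 // When sorting keyless-hash results on the GPU, one extra int64 slot per entry
 // (index_buffer_qw of them) is reserved in front of the group-by buffer; later
 // initialization skips past it via group_by_buffer + index_buffer_qw.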
375  const auto actual_group_buffer_size =
376  group_buffer_size + index_buffer_qw * sizeof(int64_t);
377  CHECK_GE(actual_group_buffer_size, group_buffer_size);
378 
379  if (query_mem_desc.hasVarlenOutput()) {
380  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
381  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
382  auto const varlen_buffer_sz =
383  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value();
384  auto varlen_output_buffer =
385  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(varlen_buffer_sz));
386  num_buffers_ += 1;
387  group_by_buffers_.push_back(varlen_output_buffer);
388  }
389 
390  if (query_mem_desc.threadsCanReuseGroupByBuffers()) {
391  // Sanity checks, intra-thread buffer reuse should only
392  // occur on CPU for group-by queries, which also means
393  // that only one group-by buffer should be allocated
394  // (multiple-buffer allocation only occurs for GPU)
395  CHECK(device_type == ExecutorDeviceType::CPU);
396  CHECK(query_mem_desc.isGroupBy());
397  CHECK_EQ(group_buffers_count, size_t(1));
398  }
399 
400  // Group-by buffer reuse assumes 1 group-by-buffer per query step
401  // Multiple group-by-buffers should only be used on GPU,
402  // whereas buffer reuse only is done on CPU
403  CHECK(group_buffers_count <= 1 || !query_mem_desc.threadsCanReuseGroupByBuffers());
404  for (size_t i = 0; i < group_buffers_count; i += step) {
405  auto group_by_info =
406  alloc_group_by_buffer(actual_group_buffer_size,
407  render_allocator_map,
408  thread_idx_,
409  row_set_mem_owner_.get(),
410  query_mem_desc.threadsCanReuseGroupByBuffers());
411 
412  auto group_by_buffer = group_by_info.first;
413  const bool was_cached = group_by_info.second;
414  if (!was_cached) {
415  if (!query_mem_desc.lazyInitGroups(device_type)) {
416  if (group_by_buffer_template) {
417  memcpy(group_by_buffer + index_buffer_qw,
418  group_by_buffer_template,
419  group_buffer_size);
420  } else {
421  initGroupByBuffer(group_by_buffer + index_buffer_qw,
422  ra_exe_unit,
423  query_mem_desc,
424  agg_op_metadata,
425  device_type,
426  output_columnar,
427  executor);
428  }
429  }
430  }
431 
432  size_t old_size = group_by_buffers_.size();
433  group_by_buffers_.resize(old_size + std::max(size_t(1), step), nullptr);
434  group_by_buffers_[old_size] = group_by_buffer;
435 
436  const bool use_target_exprs_union =
437  ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_key);
438  const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
439  : ra_exe_unit.target_exprs;
440  const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
441  const auto column_frag_sizes =
442  get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);
443 
444  old_size = result_sets_.size();
445  result_sets_.resize(old_size + std::max(size_t(1), step));
446  result_sets_[old_size] =
447  std::make_unique<ResultSet>(target_exprs_to_infos(target_exprs, query_mem_desc),
448  executor->getColLazyFetchInfo(target_exprs),
449  col_buffers,
450  column_frag_offsets,
451  column_frag_sizes,
452  device_type,
453  device_id,
454  thread_idx,
457  executor->blockSize(),
458  executor->gridSize());
459  result_sets_[old_size]->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
460  executor->plan_state_->init_agg_vals_,
462  }
463 }
464 
465 // Table functions execution constructor
466 QueryMemoryInitializer::QueryMemoryInitializer(
467  const TableFunctionExecutionUnit& exe_unit,
468  const QueryMemoryDescriptor& query_mem_desc,
469  const int device_id,
470  const ExecutorDeviceType device_type,
471  const int64_t num_rows,
472  const std::vector<std::vector<const int8_t*>>& col_buffers,
473  const std::vector<std::vector<uint64_t>>& frag_offsets,
474  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
475  DeviceAllocator* device_allocator,
476  const Executor* executor)
477  : num_rows_(num_rows)
478  , row_set_mem_owner_(row_set_mem_owner)
479  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
480  , num_buffers_(1)
487  , device_allocator_(device_allocator)
488  , thread_idx_(0) {
489  // Table functions output columnar, basically treat this as a projection
490  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
491  if (consistent_frag_sizes.empty()) {
492  // No fragments in the input, no underlying buffers will be needed.
493  return;
494  }
495 
496  const size_t num_columns =
497  query_mem_desc.getBufferColSlotCount(); // shouldn't we use getColCount() ???
498  size_t total_group_by_buffer_size{0};
499  for (size_t i = 0; i < num_columns; ++i) {
500  auto ti = exe_unit.target_exprs[i]->get_type_info();
501  if (ti.usesFlatBuffer()) {
502  // See TableFunctionManager.h for info regarding flatbuffer
503  // memory management.
504  auto slot_idx = query_mem_desc.getSlotIndexForSingleSlotCol(i);
505  CHECK(query_mem_desc.checkSlotUsesFlatBufferFormat(slot_idx));
506  checked_int64_t flatbuffer_size = query_mem_desc.getFlatBufferSize(slot_idx);
507  try {
508  total_group_by_buffer_size = align_to_int64(
509  static_cast<int64_t>(total_group_by_buffer_size + flatbuffer_size));
510  } catch (...) {
511  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
512  }
513  } else {
514  const checked_int64_t col_width = ti.get_size();
515  try {
516  const checked_int64_t group_buffer_size = col_width * num_rows_;
517  total_group_by_buffer_size = align_to_int64(
518  static_cast<int64_t>(group_buffer_size + total_group_by_buffer_size));
519  } catch (...) {
520  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
521  }
522  }
523  }
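 // At this point total_group_by_buffer_size is the int64-aligned sum of the
 // per-column requirements: getFlatBufferSize(slot_idx) for FlatBuffer columns,
 // otherwise ti.get_size() * num_rows_ bytes.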
524 
525 #ifdef __SANITIZE_ADDRESS__
526  // AddressSanitizer will reject allocation sizes above 1 TiB
527 #define MAX_BUFFER_SIZE 0x10000000000ll
528 #else
529  // otherwise, we'll set the limit to 16 TiB, feel free to increase
530  // the limit if needed
531 #define MAX_BUFFER_SIZE 0x100000000000ll
532 #endif
533 
534  if (total_group_by_buffer_size >= MAX_BUFFER_SIZE) {
535  throw OutOfHostMemory(total_group_by_buffer_size);
536  }
537 
538  CHECK_EQ(num_buffers_, size_t(1));
539  auto group_by_buffer = alloc_group_by_buffer(total_group_by_buffer_size,
540  nullptr,
541  thread_idx_,
542  row_set_mem_owner.get(),
543  false)
544  .first;
545  group_by_buffers_.push_back(group_by_buffer);
546 
547  const auto column_frag_offsets =
548  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
549  const auto column_frag_sizes =
550  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
551  result_sets_.emplace_back(
552  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
553  /*col_lazy_fetch_info=*/{},
554  col_buffers,
555  column_frag_offsets,
556  column_frag_sizes,
557  device_type,
558  device_id,
559  -1, /*thread_idx*/
561  row_set_mem_owner_,
562  executor->blockSize(),
563  executor->gridSize()));
564  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
565  init_agg_vals_);
566 }
567 
568 void QueryMemoryInitializer::initGroupByBuffer(
569  int64_t* buffer,
570  const RelAlgExecutionUnit& ra_exe_unit,
571  const QueryMemoryDescriptor& query_mem_desc,
572  TargetAggOpsMetadata& agg_op_metadata,
573  const ExecutorDeviceType device_type,
574  const bool output_columnar,
575  const Executor* executor) {
576  if (output_columnar) {
577  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor, ra_exe_unit);
578  } else {
579  auto rows_ptr = buffer;
580  auto actual_entry_count = query_mem_desc.getEntryCount();
581  const auto thread_count = device_type == ExecutorDeviceType::GPU
582  ? executor->blockSize() * executor->gridSize()
583  : 1;
584  auto warp_size =
585  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
586  if (query_mem_desc.useStreamingTopN()) {
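 // Streaming top-N buffer layout: thread_count int64 node counts (zeroed), then
 // the per-thread heaps (initialized to -1) up to get_rows_offset_of_heaps(n,
 // thread_count), then storage for n rows per thread initialized as row groups.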
587  const auto node_count_size = thread_count * sizeof(int64_t);
588  memset(rows_ptr, 0, node_count_size);
589  const auto n =
590  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
591  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
592  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
593  rows_ptr += rows_offset / sizeof(int64_t);
594  actual_entry_count = n * thread_count;
595  warp_size = 1;
596  }
597  initRowGroups(query_mem_desc,
598  rows_ptr,
599  init_agg_vals_,
600  agg_op_metadata,
601  actual_entry_count,
602  warp_size,
603  executor,
604  ra_exe_unit);
605  }
606 }
607 
608 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
609  int64_t* groups_buffer,
610  const std::vector<int64_t>& init_vals,
611  TargetAggOpsMetadata& agg_op_metadata,
612  const int32_t groups_buffer_entry_count,
613  const size_t warp_size,
614  const Executor* executor,
615  const RelAlgExecutionUnit& ra_exe_unit) {
616  const size_t key_count{query_mem_desc.getGroupbyColCount()};
617  const size_t row_size{query_mem_desc.getRowSize()};
618  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
619 
620  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
621  const auto query_mem_desc_fixedup =
622  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
623  auto const key_sz = query_mem_desc.getEffectiveKeyWidth();
624  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE
625  // we use the default implementation in those agg ops
626  if (!(agg_op_metadata.has_count_distinct || agg_op_metadata.has_mode ||
627  agg_op_metadata.has_tdigest) &&
629  std::vector<int8_t> sample_row(row_size - col_base_off);
630  auto const num_available_cpu_threads =
631  std::min(query_mem_desc.getAvailableCpuThreads(),
632  static_cast<size_t>(std::max(cpu_threads(), 1)));
633  tbb::task_arena initialization_arena(num_available_cpu_threads);
634 
635  initColumnsPerRow(
636  query_mem_desc_fixedup, sample_row.data(), init_vals, agg_op_metadata);
637 
638  if (query_mem_desc.hasKeylessHash()) {
639  CHECK(warp_size >= 1);
640  CHECK(key_count == 1 || warp_size == 1);
641  initialization_arena.execute([&] {
642  tbb::parallel_for(
643  tbb::blocked_range<size_t>(0, groups_buffer_entry_count * warp_size),
644  [&](const tbb::blocked_range<size_t>& r) {
645  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
646  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
647  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
648  }
649  });
650  });
651  return;
652  }
653  initialization_arena.execute([&] {
654  tbb::parallel_for(
655  tbb::blocked_range<size_t>(0, groups_buffer_entry_count),
656  [&](const tbb::blocked_range<size_t>& r) {
657  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
658  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
659  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
660  result_set::fill_empty_key(cur_row_buf, key_count, key_sz);
661  }
662  });
663  });
664  } else {
665  // todo(yoonmin): allow parallelization of `initColumnsPerRow`
666  if (query_mem_desc.hasKeylessHash()) {
667  CHECK(warp_size >= 1);
668  CHECK(key_count == 1 || warp_size == 1);
669  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
670  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
671  ++bin, buffer_ptr += row_size) {
672  initColumnsPerRow(query_mem_desc_fixedup,
673  &buffer_ptr[col_base_off],
674  init_vals,
675  agg_op_metadata);
676  }
677  }
678  return;
679  }
680 
681  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
682  ++bin, buffer_ptr += row_size) {
683  result_set::fill_empty_key(
684  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
685  initColumnsPerRow(
686  query_mem_desc_fixedup, &buffer_ptr[col_base_off], init_vals, agg_op_metadata);
687  }
688  }
689 }
690 
691 namespace {
692 
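// Fills entry_count slots of type T with init_val and returns the address just
// past the last slot, so callers can chain per-column initialization (see
// initColumnarGroups below).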
693 template <typename T>
694 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
695  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
696  for (uint32_t i = 0; i < entry_count; ++i) {
697  buffer_ptr[i] = init_val;
698  }
699  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
700 }
701 
702 } // namespace
703 
704 void QueryMemoryInitializer::initColumnarGroups(
705  const QueryMemoryDescriptor& query_mem_desc,
706  int64_t* groups_buffer,
707  const std::vector<int64_t>& init_vals,
708  const Executor* executor,
709  const RelAlgExecutionUnit& ra_exe_unit) {
710  CHECK(groups_buffer);
711 
712  for (const auto target_expr : ra_exe_unit.target_exprs) {
713  const auto agg_info = get_target_info(target_expr, g_bigint_count);
714  CHECK(!is_distinct_target(agg_info));
715  }
716  const int32_t agg_col_count = query_mem_desc.getSlotCount();
717  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
718 
719  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
720  if (!query_mem_desc.hasKeylessHash()) {
721  const size_t key_count{query_mem_desc.getGroupbyColCount()};
722  for (size_t i = 0; i < key_count; ++i) {
723  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
724  EMPTY_KEY_64,
725  groups_buffer_entry_count);
726  }
727  }
728 
729  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
730  // initializing all aggregate columns:
731  int32_t init_val_idx = 0;
732  for (int32_t i = 0; i < agg_col_count; ++i) {
733  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
734  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
735  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
736  case 1:
737  buffer_ptr = initColumnarBuffer<int8_t>(
738  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
739  break;
740  case 2:
741  buffer_ptr =
742  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
743  init_vals[init_val_idx++],
744  groups_buffer_entry_count);
745  break;
746  case 4:
747  buffer_ptr =
748  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
749  init_vals[init_val_idx++],
750  groups_buffer_entry_count);
751  break;
752  case 8:
753  buffer_ptr =
754  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
755  init_vals[init_val_idx++],
756  groups_buffer_entry_count);
757  break;
758  case 0:
759  break;
760  default:
761  CHECK(false);
762  }
763 
764  buffer_ptr = align_to_int64(buffer_ptr);
765  }
766  }
767  }
768 }
769 
770 void QueryMemoryInitializer::initColumnsPerRow(
771  const QueryMemoryDescriptor& query_mem_desc,
772  int8_t* row_ptr,
773  const std::vector<int64_t>& init_vals,
774  const TargetAggOpsMetadata& agg_op_metadata) {
775  int8_t* col_ptr = row_ptr;
776  size_t init_vec_idx = 0;
777  size_t approx_quantile_descriptors_idx = 0;
778  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
779  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
780  int64_t init_val{0};
781  if (query_mem_desc.isGroupBy()) {
782  if (agg_op_metadata.has_count_distinct &&
783  agg_op_metadata.count_distinct_buf_size[col_idx]) {
784  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
785  // create a data structure for count_distinct operator per entries
786  const int64_t bm_sz{agg_op_metadata.count_distinct_buf_size[col_idx]};
787  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
788  sizeof(int64_t));
789  init_val =
790  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
791  CHECK_NE(init_val, 0);
792  ++init_vec_idx;
793  } else if (agg_op_metadata.has_tdigest &&
794  agg_op_metadata.quantile_params[col_idx]) {
795  auto const q = *agg_op_metadata.quantile_params[col_idx];
796  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
797  auto const& desc = descs.at(approx_quantile_descriptors_idx++);
798  init_val = reinterpret_cast<int64_t>(
799  row_set_mem_owner_->initTDigest(thread_idx_, desc, q));
800  CHECK_NE(init_val, 0);
801  ++init_vec_idx;
802  } else if (agg_op_metadata.has_mode &&
803  agg_op_metadata.mode_index_set.count(col_idx)) {
804  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->allocateMode());
805  CHECK_NE(init_val, 0);
806  ++init_vec_idx;
807  }
808  }
809  auto const col_slot_width = query_mem_desc.getPaddedSlotWidthBytes(col_idx);
810  if (init_val == 0 && col_slot_width > 0) {
811  CHECK_LT(init_vec_idx, init_vals.size());
812  init_val = init_vals[init_vec_idx++];
813  }
814  switch (col_slot_width) {
815  case 1:
816  *col_ptr = static_cast<int8_t>(init_val);
817  break;
818  case 2:
819  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
820  break;
821  case 4:
822  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
823  break;
824  case 8:
825  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
826  break;
827  case 0:
828  continue;
829  default:
830  CHECK(false);
831  }
832  }
833 }
834 
835 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
836  const QueryMemoryDescriptor& query_mem_desc) {
837  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
838  return;
839  }
841 
842  size_t total_bytes_per_entry{0};
843  const size_t num_count_distinct_descs =
844  query_mem_desc.getCountDistinctDescriptorsSize();
845  for (size_t i = 0; i < num_count_distinct_descs; i++) {
846  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
847  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
848  continue;
849  }
850  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
851  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
852  }
853 
854  count_distinct_bitmap_mem_size_ =
855  total_bytes_per_entry * query_mem_desc.getEntryCount();
856  count_distinct_bitmap_device_mem_ptr_ = reinterpret_cast<CUdeviceptr>(
857  device_allocator_->alloc(count_distinct_bitmap_mem_size_));
858  device_allocator_->zeroDeviceMem(
859  reinterpret_cast<int8_t*>(count_distinct_bitmap_device_mem_ptr_),
860  count_distinct_bitmap_mem_size_);
863 }
864 
865 std::vector<int64_t> QueryMemoryInitializer::calculateCountDistinctBufferSize(
866  const QueryMemoryDescriptor& query_mem_desc,
867  const RelAlgExecutionUnit& ra_exe_unit) const {
868  const size_t agg_col_count{query_mem_desc.getSlotCount()};
869  std::vector<int64_t> agg_bitmap_size(agg_col_count);
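 // Per-slot encoding: the padded bitmap size in bytes for bitmap-based
 // COUNT DISTINCT / APPROX_COUNT_DISTINCT, -1 for UnorderedSet-based COUNT DISTINCT,
 // and 0 for slots without a distinct aggregate.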
870  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
871  ++target_idx) {
872  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
873  const auto agg_info = get_target_info(target_expr, g_bigint_count);
874  if (is_distinct_target(agg_info)) {
875  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
876  const auto& count_distinct_desc =
877  query_mem_desc.getCountDistinctDescriptor(target_idx);
878  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
879  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
880  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
881  } else {
882  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
883  agg_bitmap_size[agg_col_idx] = -1;
884  }
885  }
886  }
887  return agg_bitmap_size;
888 }
889 
890 void QueryMemoryInitializer::allocateCountDistinctBuffers(
891  const QueryMemoryDescriptor& query_mem_desc,
892  const RelAlgExecutionUnit& ra_exe_unit) {
893  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
894  ++target_idx) {
895  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
896  const auto agg_info = get_target_info(target_expr, g_bigint_count);
897  if (is_distinct_target(agg_info)) {
898  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
899  const auto& count_distinct_desc =
900  query_mem_desc.getCountDistinctDescriptor(target_idx);
901  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
902  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
903  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
904  } else {
905  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
906  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
907  }
908  }
909  }
910 }
911 
912 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
913  if (count_distinct_bitmap_host_mem_ptr_) {
914  CHECK(count_distinct_bitmap_host_crt_ptr_);
915  auto ptr = count_distinct_bitmap_host_crt_ptr_;
916  count_distinct_bitmap_host_crt_ptr_ += bitmap_byte_sz;
917  row_set_mem_owner_->addCountDistinctBuffer(
918  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
919  return reinterpret_cast<int64_t>(ptr);
920  }
921  return reinterpret_cast<int64_t>(
922  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
923 }
924 
925 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
926  auto count_distinct_set = new CountDistinctSet();
927  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
928  return reinterpret_cast<int64_t>(count_distinct_set);
929 }
930 
931 QueryMemoryInitializer::ModeIndexSet QueryMemoryInitializer::initializeModeIndexSet(
932  const QueryMemoryDescriptor& query_mem_desc,
933  const RelAlgExecutionUnit& ra_exe_unit) {
934  size_t const slot_count = query_mem_desc.getSlotCount();
935  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
936  ModeIndexSet mode_index_set;
937  ra_exe_unit.eachAggTarget<kMODE>([&](Analyzer::AggExpr const*,
938  size_t const target_idx) {
939  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
940  CHECK_LT(agg_col_idx, slot_count);
941  mode_index_set.emplace(agg_col_idx);
942  });
943  return mode_index_set;
944 }
945 
946 void QueryMemoryInitializer::allocateModeBuffer(
947  const QueryMemoryDescriptor& query_mem_desc,
948  const RelAlgExecutionUnit& ra_exe_unit) {
949  size_t const slot_count = query_mem_desc.getSlotCount();
950  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
951  ra_exe_unit.eachAggTarget<kMODE>([&](Analyzer::AggExpr const*,
952  size_t const target_idx) {
953  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
954  CHECK_LT(agg_col_idx, slot_count);
955  AggMode* agg_mode = row_set_mem_owner_->allocateMode();
956  init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
957  });
958 }
959 
960 std::vector<QueryMemoryInitializer::QuantileParam>
961 QueryMemoryInitializer::initializeQuantileParams(
962  const QueryMemoryDescriptor& query_mem_desc,
963  const RelAlgExecutionUnit& ra_exe_unit) {
964  size_t const slot_count = query_mem_desc.getSlotCount();
965  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
966  std::vector<QuantileParam> quantile_params(slot_count);
967  ra_exe_unit.eachAggTarget<kAPPROX_QUANTILE>([&](Analyzer::AggExpr const* const agg_expr,
968  size_t const target_idx) {
969  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
970  CHECK_LT(agg_col_idx, slot_count);
971  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
972  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
973  auto const q_expr =
974  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
975  CHECK(q_expr);
976  quantile_params[agg_col_idx] = q_expr->get_constval().doubleval;
977  });
978  return quantile_params;
979 }
980 
981 void QueryMemoryInitializer::allocateTDigestsBuffer(
982  const QueryMemoryDescriptor& query_mem_desc,
983  const RelAlgExecutionUnit& ra_exe_unit) {
984  size_t const slot_count = query_mem_desc.getSlotCount();
985  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
986 
987  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
988  size_t approx_quantile_descriptors_idx = 0u;
989  ra_exe_unit.eachAggTarget<kAPPROX_QUANTILE>([&](Analyzer::AggExpr const* const agg_expr,
990  size_t const target_idx) {
991  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
992  CHECK_LT(agg_col_idx, slot_count);
993  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
994  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
995  auto const q_expr =
996  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
997  CHECK(q_expr);
998  auto const q = q_expr->get_constval().doubleval;
999  auto const& desc = descs.at(approx_quantile_descriptors_idx++);
1000  init_agg_vals_[agg_col_idx] =
1001  reinterpret_cast<int64_t>(row_set_mem_owner_->initTDigest(thread_idx_, desc, q));
1002  });
1003 }
1004 
1005 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
1006  const QueryMemoryDescriptor& query_mem_desc,
1007  const int8_t* init_agg_vals_dev_ptr,
1008  const size_t n,
1009  const int device_id,
1010  const unsigned block_size_x,
1011  const unsigned grid_size_x) {
1012 #ifdef HAVE_CUDA
1014  const auto thread_count = block_size_x * grid_size_x;
1015  const auto total_buff_size =
1016  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1017  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
1018 
1019  std::vector<int8_t*> dev_buffers(thread_count);
1020 
1021  for (size_t i = 0; i < thread_count; ++i) {
1022  dev_buffers[i] = dev_buffer;
1023  }
1024 
1025  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
1026  device_allocator_->copyToDevice(
1027  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
1028 
1029  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
1030 
1031  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
1032  thread_count * sizeof(int64_t));
1033 
1033 
1034  device_allocator_->setDeviceMem(
1035  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
1036  (unsigned char)-1,
1037  thread_count * n * sizeof(int64_t));
1038 
1039  init_group_by_buffer_on_device(
1040  reinterpret_cast<int64_t*>(
1041  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
1042  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1043  n * thread_count,
1044  query_mem_desc.getGroupbyColCount(),
1045  query_mem_desc.getEffectiveKeyWidth(),
1046  query_mem_desc.getRowSize() / sizeof(int64_t),
1047  query_mem_desc.hasKeylessHash(),
1048  1,
1049  block_size_x,
1050  grid_size_x);
1051 
1052  return {dev_ptr, dev_buffer};
1053 #else
1054  UNREACHABLE();
1055  return {};
1056 #endif
1057 }
1058 
1059 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
1060  const RelAlgExecutionUnit& ra_exe_unit,
1061  const QueryMemoryDescriptor& query_mem_desc,
1062  const int8_t* init_agg_vals_dev_ptr,
1063  const int device_id,
1064  const ExecutorDispatchMode dispatch_mode,
1065  const unsigned block_size_x,
1066  const unsigned grid_size_x,
1067  const int8_t warp_size,
1068  const bool can_sort_on_gpu,
1069  const bool output_columnar,
1070  RenderAllocator* render_allocator) {
1071 #ifdef HAVE_CUDA
1072  if (query_mem_desc.useStreamingTopN()) {
1073  if (render_allocator) {
1075  }
1076  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
1077  CHECK(!output_columnar);
1078 
1079  return prepareTopNHeapsDevBuffer(
1080  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
1081  }
1082 
1083  auto dev_group_by_buffers =
1084  create_dev_group_by_buffers(device_allocator_,
1085  group_by_buffers_,
1086  query_mem_desc,
1087  block_size_x,
1088  grid_size_x,
1089  device_id,
1090  dispatch_mode,
1091  num_rows_,
1092  can_sort_on_gpu,
1093  false,
1094  ra_exe_unit.use_bump_allocator,
1095  query_mem_desc.hasVarlenOutput(),
1096  render_allocator);
1097  if (query_mem_desc.hasVarlenOutput()) {
1098  CHECK(dev_group_by_buffers.varlen_output_buffer);
1099  varlen_output_buffer_ =
1100  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
1101  CHECK(query_mem_desc.varlenOutputBufferElemSize());
1102  const size_t varlen_output_buf_bytes =
1103  query_mem_desc.getEntryCount() *
1104  query_mem_desc.varlenOutputBufferElemSize().value();
1105  varlen_output_buffer_host_ptr_ =
1106  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
1108  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
1110  }
1111  if (render_allocator) {
1112  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
1113  }
1114  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
1115  CHECK(!render_allocator);
1116 
1117  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
1118  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
1119  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
1120  auto group_by_dev_buffer = dev_group_by_buffers.data;
1121  const size_t col_count = query_mem_desc.getSlotCount();
1122  int8_t* col_widths_dev_ptr{nullptr};
1123  if (output_columnar) {
1124  std::vector<int8_t> compact_col_widths(col_count);
1125  for (size_t idx = 0; idx < col_count; ++idx) {
1126  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
1127  }
1128  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
1129  device_allocator_->copyToDevice(
1130  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
1131  }
1132  const int8_t warp_count =
1133  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
1134  const auto num_group_by_buffers =
1135  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
1136  for (size_t i = 0; i < num_group_by_buffers; i += step) {
1137  if (output_columnar) {
1138  init_columnar_group_by_buffer_on_device(
1139  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1140  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1141  dev_group_by_buffers.entry_count,
1142  query_mem_desc.getGroupbyColCount(),
1143  col_count,
1144  col_widths_dev_ptr,
1145  /*need_padding = */ true,
1146  query_mem_desc.hasKeylessHash(),
1147  sizeof(int64_t),
1148  block_size_x,
1149  grid_size_x);
1150  } else {
1151  init_group_by_buffer_on_device(
1152  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1153  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1154  dev_group_by_buffers.entry_count,
1155  query_mem_desc.getGroupbyColCount(),
1156  query_mem_desc.getEffectiveKeyWidth(),
1157  query_mem_desc.getRowSize() / sizeof(int64_t),
1158  query_mem_desc.hasKeylessHash(),
1159  warp_count,
1160  block_size_x,
1161  grid_size_x);
1162  }
1163  group_by_dev_buffer += groups_buffer_size;
1164  }
1165  }
1166  return dev_group_by_buffers;
1167 #else
1168  UNREACHABLE();
1169  return {};
1170 #endif
1171 }
1172 
1173 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
1174  const QueryMemoryDescriptor& query_mem_desc,
1175  const int device_id,
1176  const unsigned block_size_x,
1177  const unsigned grid_size_x,
1178  const bool zero_initialize_buffers) {
1179  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1180  CHECK_GT(num_columns, size_t(0));
1181  size_t total_group_by_buffer_size{0};
1182  const auto col_slot_context = query_mem_desc.getColSlotContext();
1183 
1184  std::vector<size_t> col_byte_offsets;
1185  col_byte_offsets.reserve(num_columns);
1186 
1187  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1188  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1189  size_t group_buffer_size = num_rows_ * col_width;
1190  col_byte_offsets.emplace_back(total_group_by_buffer_size);
1191  total_group_by_buffer_size =
1192  align_to_int64(total_group_by_buffer_size + group_buffer_size);
1193  }
1194 
1195  int8_t* dev_buffers_allocation{nullptr};
1196  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
1197  CHECK(dev_buffers_allocation);
1198  if (zero_initialize_buffers) {
1199  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
1200  }
1201 
1202  auto dev_buffers_mem = dev_buffers_allocation;
1203  std::vector<int8_t*> dev_buffers(num_columns);
1204  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1205  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
1206  }
1207  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
1208  device_allocator_->copyToDevice(
1209  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
1210 
1211  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
1212 }
1213 
1214 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
1215  Data_Namespace::DataMgr* data_mgr,
1216  const QueryMemoryDescriptor& query_mem_desc,
1217  const size_t entry_count,
1218  const GpuGroupByBuffers& gpu_group_by_buffers,
1219  const int device_id,
1220  const unsigned block_size_x,
1221  const unsigned grid_size_x) {
1222  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1223 
1224  int8_t* dev_buffer = gpu_group_by_buffers.data;
1225  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
1226 
1227  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
1228  CHECK_LE(entry_count, original_entry_count);
1229  size_t output_device_col_offset{0};
1230  size_t output_host_col_offset{0};
1231 
1232  const auto col_slot_context = query_mem_desc.getColSlotContext();
1233 
1234  auto allocator = std::make_unique<CudaAllocator>(
1235  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
1236 
1237  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1238  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1239  const size_t output_device_col_size = original_entry_count * col_width;
1240  const size_t output_host_col_size = entry_count * col_width;
1241  allocator->copyFromDevice(host_buffer + output_host_col_offset,
1242  dev_buffer + output_device_col_offset,
1243  output_host_col_size);
1244  output_device_col_offset =
1245  align_to_int64(output_device_col_offset + output_device_col_size);
1246  output_host_col_offset =
1247  align_to_int64(output_host_col_offset + output_host_col_size);
1248  }
1249 }
1250 
1251 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1252  const QueryMemoryDescriptor& query_mem_desc,
1253  const ExecutorDeviceType device_type,
1254  const Executor* executor) const {
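 // CPU execution uses a single buffer. On GPU this yields blockSize() buffers
 // when blocks share memory, otherwise blockSize() * gridSize(); e.g. block 128
 // and grid 16 give 128 vs. 2048 buffers.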
1255  return device_type == ExecutorDeviceType::CPU
1256  ? 1
1257  : executor->blockSize() *
1258  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1259 }
1260 
1261 namespace {
1262 
1263 // in-place compaction of output buffer
1264 void compact_projection_buffer_for_cpu_columnar(
1265  const QueryMemoryDescriptor& query_mem_desc,
1266  int8_t* projection_buffer,
1267  const size_t projection_count) {
1268  // the first column (row indices) remains unchanged.
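 // E.g. with an entry count of 1000 but only 10 projected rows, each column's
 // first 10 values are moved up so the columns sit contiguously after the
 // compacted row-index column, using memmove when source and destination overlap.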
1269  CHECK(projection_count <= query_mem_desc.getEntryCount());
1270  constexpr size_t row_index_width = sizeof(int64_t);
1271  size_t buffer_offset1{projection_count * row_index_width};
1272  // other columns are actual non-lazy columns for the projection:
1273  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1274  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1275  auto column_proj_size =
1276  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1277  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1278  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1279  // overlapping
1280  std::memmove(projection_buffer + buffer_offset1,
1281  projection_buffer + buffer_offset2,
1282  column_proj_size);
1283  } else {
1284  std::memcpy(projection_buffer + buffer_offset1,
1285  projection_buffer + buffer_offset2,
1286  column_proj_size);
1287  }
1288  buffer_offset1 += align_to_int64(column_proj_size);
1289  }
1290  }
1291 }
1292 
1293 } // namespace
1294 
1295 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1296  const QueryMemoryDescriptor& query_mem_desc,
1297  const size_t projection_count) {
1298  const auto num_allocated_rows =
1299  std::min(projection_count, query_mem_desc.getEntryCount());
1300  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1301 
1302  // copy the results from the main buffer into projection_buffer
1303  compact_projection_buffer_for_cpu_columnar(
1304  query_mem_desc,
1305  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1306  num_allocated_rows);
1307 
1308  // update the entry count for the result set, and its underlying storage
1309  CHECK(!result_sets_.empty());
1310  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1311 }
1312 
1313 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1314  const QueryMemoryDescriptor& query_mem_desc,
1315  Data_Namespace::DataMgr* data_mgr,
1316  const GpuGroupByBuffers& gpu_group_by_buffers,
1317  const size_t projection_count,
1318  const int device_id) {
1319  // store total number of allocated rows:
1320  const auto num_allocated_rows =
1321  std::min(projection_count, query_mem_desc.getEntryCount());
1322 
1323  // copy the results from the main buffer into projection_buffer
1324  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1325  copy_projection_buffer_from_gpu_columnar(
1326  data_mgr,
1327  gpu_group_by_buffers,
1328  query_mem_desc,
1329  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1330  num_allocated_rows,
1331  device_id);
1332 
1333  // update the entry count for the result set, and its underlying storage
1334  CHECK(!result_sets_.empty());
1335  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1336 }
1337 
1338 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1339  DeviceAllocator& device_allocator,
1340  const QueryMemoryDescriptor& query_mem_desc,
1341  const size_t entry_count,
1342  const GpuGroupByBuffers& gpu_group_by_buffers,
1343  const RelAlgExecutionUnit* ra_exe_unit,
1344  const unsigned block_size_x,
1345  const unsigned grid_size_x,
1346  const int device_id,
1347  const bool prepend_index_buffer) const {
1348  const auto thread_count = block_size_x * grid_size_x;
1349 
1350  size_t total_buff_size{0};
1351  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1352  const size_t n =
1353  ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit.value_or(0);
1354  total_buff_size =
1355  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1356  } else {
1357  total_buff_size =
1358  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1359  }
1360  copy_group_by_buffers_from_gpu(device_allocator,
1361  group_by_buffers_,
1362  total_buff_size,
1363  gpu_group_by_buffers.data,
1364  query_mem_desc,
1365  block_size_x,
1366  grid_size_x,
1367  device_id,
1368  prepend_index_buffer,
1369  query_mem_desc.hasVarlenOutput());
1370 }
1371 
1372 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1373  const QueryMemoryDescriptor& query_mem_desc,
1374  const RelAlgExecutionUnit& ra_exe_unit) {
1375  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1376  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1377 
1378  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1379  group_by_buffers_[buffer_start_idx],
1380  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1381  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0),
1382  1);
1383  CHECK_EQ(rows_copy.size(),
1384  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1385  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1386 }
1387 
1388 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1389  Data_Namespace::DataMgr* data_mgr,
1390  const QueryMemoryDescriptor& query_mem_desc,
1391  const GpuGroupByBuffers& gpu_group_by_buffers,
1392  const RelAlgExecutionUnit& ra_exe_unit,
1393  const unsigned total_thread_count,
1394  const int device_id) {
1395 #ifdef HAVE_CUDA
1397  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1398 
1399  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1400  data_mgr,
1401  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1402  ra_exe_unit,
1403  query_mem_desc,
1404  total_thread_count,
1405  device_id);
1406  CHECK_EQ(
1407  rows_copy.size(),
1408  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1409  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1410 #else
1411  UNREACHABLE();
1412 #endif
1413 }
1414 
1415 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1416  if (varlen_output_info_) {
1417  return varlen_output_info_;
1418  }
1419 
1420  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1421  // and update it as needed
1422  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1423  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1424  return varlen_output_info_;
1425 }
GpuGroupByBuffers setupTableFunctionGpuBuffers(const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x, const bool zero_initialize_buffers)
ModeIndexSet initializeModeIndexSet(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
std::vector< Analyzer::Expr * > target_exprs
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
GpuGroupByBuffers create_dev_group_by_buffers(DeviceAllocator *device_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, const bool has_varlen_output, Allocator *insitu_allocator)
Definition: GpuMemUtils.cpp:70
RenderAllocator * getRenderAllocator(size_t device_id)
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:35
bool countDistinctDescriptorsLogicallyEmpty() const
bool useCudaBuffers() const
Definition: RenderInfo.cpp:54
CUdeviceptr count_distinct_bitmap_device_mem_ptr_
#define EMPTY_KEY_64
int8_t logical_size
GpuGroupByBuffers prepareTopNHeapsDevBuffer(const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
QueryMemoryInitializer::TargetAggOpsMetadata collect_target_expr_metadata(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
GpuGroupByBuffers createAndInitializeGroupByBufferGpu(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t
void compact_projection_buffer_for_cpu_columnar(const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count)
DeviceAllocator * device_allocator_
size_t getAvailableCpuThreads() const
count_distinct_bitmap_host_mem_ptr_(nullptr)
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
const std::optional< bool > union_all
Streaming Top N algorithm.
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
void allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
QueryMemoryInitializer(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const shared::TableKey &outer_table_key, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
unsigned long long CUdeviceptr
Definition: nocuda.h:28
std::vector< InputDescriptor > input_descs
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
void init_columnar_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size, const size_t block_size_x, const size_t grid_size_x)
varlen_output_buffer_(0)
count_distinct_bitmap_device_mem_ptr_(0)
void check_total_bitmap_memory(const QueryMemoryDescriptor &query_mem_desc)
virtual int8_t * alloc(const size_t num_bytes)=0
Projection
Definition: enums.h:58
const ApproxQuantileDescriptors & getApproxQuantileDescriptors() const
size_t getEffectiveKeyWidth() const
num_buffers_(1)
#define CHECK_GT(x, y)
Definition: Logger.h:305
int8_t * initColumnarBuffer(T *buffer_ptr, const T init_val, const uint32_t entry_count)
count_distinct_bitmap_mem_size_(0)
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
ExecutorDeviceType
size_t computeNumberOfBuffers(const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
std::vector< QuantileParam > initializeQuantileParams(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
std::pair< int64_t *, bool > allocateCachedGroupByBuffer(const size_t num_bytes, const size_t thread_idx)
void check_count_distinct_expr_metadata(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
varlen_output_buffer_host_ptr_(nullptr)
void init_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
const SlotSize & getSlotInfo(const size_t slot_idx) const
std::vector< Analyzer::Expr * > target_exprs_union
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
void eachAggTarget(std::function< void(Analyzer::AggExpr const *, size_t target_idx)> lambda) const
ExecutorDispatchMode
int8_t * allocate(const size_t num_bytes) override
void compactProjectionBuffersGpu(const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
std::pair< int64_t *, bool > alloc_group_by_buffer(const size_t numBytes, RenderAllocatorMap *render_allocator_map, const size_t thread_idx, RowSetMemoryOwner *mem_owner, const bool reuse_existing_buffer_for_thread)
virtual void copyToDevice(void *device_dst, const void *host_src, const size_t num_bytes) const =0
std::vector< int64_t > calculateCountDistinctBufferSize(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit) const
static IndexType nbytes(IndexType buf_allocate, IndexType centroids_allocate)
Definition: quantile.h:261
std::vector< int64_t > init_agg_vals_
size_t getGroupbyColCount() const
#define CHECK_NE(x, y)
Definition: Logger.h:302
void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
void fill_empty_key(void *key_ptr, const size_t key_count, const size_t key_width)
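fill_empty_key marks every key slot of a group-by entry as unused so probes can distinguish an empty bucket from a real group. A sketch of that idea follows; the sentinel values and the 4/8-byte switch are assumptions for illustration, not the runtime's EMPTY_KEY_* constants:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <limits>

// Sketch: write an "empty" sentinel into key_count key slots of key_width bytes each.
void fill_empty_key_sketch(void* key_ptr, const size_t key_count, const size_t key_width) {
  switch (key_width) {
    case 4: {
      auto* keys = reinterpret_cast<int32_t*>(key_ptr);
      std::fill(keys, keys + key_count, std::numeric_limits<int32_t>::max());
      break;
    }
    case 8: {
      auto* keys = reinterpret_cast<int64_t*>(key_ptr);
      std::fill(keys, keys + key_count, std::numeric_limits<int64_t>::max());
      break;
    }
    default:
      std::abort();  // only 4- and 8-byte keys are handled in this sketch
  }
}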
virtual void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes) const =0
bool lazyInitGroups(const ExecutorDeviceType) const
bool threadsCanReuseGroupByBuffers() const
std::optional< size_t > limit
count_distinct_bitmap_host_crt_ptr_(nullptr)
bool g_bigint_count
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
int64_t g_bitmap_memory_limit
size_t g_max_memory_allocation_size
Definition: Execute.cpp:128
size_t operator()(size_t const sum, ApproxQuantileDescriptor const aqd) const
size_t getAllocatedSize() const
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
const std::shared_ptr< Analyzer::Estimator > estimator
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
size_t getCountDistinctDescriptorsSize() const
QueryDescriptionType getQueryDescriptionType() const
void compactProjectionBuffersCpu(const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
std::vector< int64_t * > group_by_buffers_
void initColumnsPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const TargetAggOpsMetadata &agg_op_metadata)
const CountDistinctDescriptor & getCountDistinctDescriptor(const size_t idx) const
void copyGroupByBuffersFromGpu(DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
std::optional< size_t > varlenOutputBufferElemSize() const
#define CHECK_LT(x, y)
Definition: Logger.h:303
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo()
#define CHECK_LE(x, y)
Definition: Logger.h:304
void initGroupByBuffer(int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, TargetAggOpsMetadata &agg_expr_metadata, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
std::vector< int8_t > get_rows_copy_from_heaps(const int64_t *heaps, const size_t heaps_size, const size_t n, const size_t thread_count)
size_t getNextColOffInBytesRowOnly(const int8_t *col_ptr, const size_t col_idx) const
void allocateTDigestsBuffer(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
void allocateModeBuffer(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)

robin_hood::unordered_set< size_t > ModeIndexSet
std::vector< int64_t > get_consistent_frags_sizes(const std::vector< std::vector< uint64_t >> &frag_offsets)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:766
#define MAX_BUFFER_SIZE
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
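get_rows_offset_of_heaps and get_heap_size size the buffer that streaming top-N works in: one fixed-size heap of candidate rows per thread, followed by the row storage itself. The exact layout lives in StreamingTopN; the formulas below are only one plausible arrangement, written out to make the per-thread bookkeeping concrete:

#include <cstddef>
#include <cstdint>

// Illustrative layout: per thread, n int64 heap slots come first, then n rows
// of row_size bytes each. These formulas are a sketch, not the engine's definitions.
size_t sketch_rows_offset_of_heaps(const size_t n, const size_t thread_count) {
  return n * thread_count * sizeof(int64_t);  // heap index region precedes the rows
}

size_t sketch_heap_size(const size_t row_size, const size_t n, const size_t thread_count) {
  return sketch_rows_offset_of_heaps(n, thread_count) + n * thread_count * row_size;
}

Under this sketch, n = 10, row_size = 32 and thread_count = 4 would reserve 320 bytes of heap slots followed by 1280 bytes of row storage.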
device_allocator_(device_allocator)
bool interleavedBins(const ExecutorDeviceType) const
const ColSlotContext & getColSlotContext() const
#define CHECK(condition)
Definition: Logger.h:291
void copyFromTableFunctionGpuBuffers(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
void applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
const auto getGroupByBuffersSize() const
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
void copy_projection_buffer_from_gpu_columnar(Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
bool g_optimize_row_initialization
Definition: Execute.cpp:108
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
int64_t get_consistent_frag_size(const std::vector< uint64_t > &frag_offsets)
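get_consistent_frag_size and get_consistent_frags_sizes (listed above) collapse per-fragment row offsets into a single stride when every fragment is the same size, so later arithmetic can treat the input as uniform. A sketch of that reduction, with the -1 "not consistent" convention being an assumption for illustration:

#include <cstddef>
#include <cstdint>
#include <vector>

// Returns the common fragment size if consecutive offsets are evenly spaced,
// otherwise -1 to signal that no single stride describes the input (sketch only).
int64_t sketch_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return -1;
  }
  const int64_t stride = static_cast<int64_t>(frag_offsets[1] - frag_offsets[0]);
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    if (static_cast<int64_t>(frag_offsets[i] - frag_offsets[i - 1]) != stride) {
      return -1;
    }
  }
  return stride;
}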
int cpu_threads()
Definition: thread_count.h:25
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
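init_agg_val_vec builds the per-slot starting values that the group-by buffers are seeded with. The toy illustration below shows the usual convention (0 for COUNT/SUM, extreme sentinels for MIN/MAX); the enum and the selection logic are assumptions for illustration, not the engine's target handling:

#include <cstdint>
#include <limits>
#include <vector>

enum class ToyAgg { kCount, kSum, kMin, kMax };  // illustrative only

// Pick a neutral starting value per aggregate so folding rows into the buffer
// needs no special case for the first row of a group.
std::vector<int64_t> toy_init_agg_vals(const std::vector<ToyAgg>& targets) {
  std::vector<int64_t> init_vals;
  init_vals.reserve(targets.size());
  for (const auto agg : targets) {
    switch (agg) {
      case ToyAgg::kCount:
      case ToyAgg::kSum:
        init_vals.push_back(0);
        break;
      case ToyAgg::kMin:
        init_vals.push_back(std::numeric_limits<int64_t>::max());
        break;
      case ToyAgg::kMax:
        init_vals.push_back(std::numeric_limits<int64_t>::min());
        break;
    }
  }
  return init_vals;
}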
const int8_t getSlotIndexForSingleSlotCol(const size_t col_idx) const
const int8_t getLogicalSlotWidthBytes(const size_t slot_idx) const
size_t getColOffInBytes(const size_t col_idx) const
void copy_group_by_buffers_from_gpu(DeviceAllocator &device_allocator, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const int8_t *group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer, const bool has_varlen_output)
std::vector< std::unique_ptr< ResultSet > > result_sets_
void initRowGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, TargetAggOpsMetadata &agg_expr_metadata, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
virtual void setDeviceMem(int8_t *device_ptr, unsigned char uc, const size_t num_bytes) const =0
void allocateCountDistinctGpuMem(const QueryMemoryDescriptor &query_mem_desc)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
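align_to_int64 rounds a size or address up to the next 8-byte boundary so aggregate slots stay int64-aligned. The one-liner below shows the usual bit trick, as a sketch rather than the codebase's exact definition:

#include <cstdint>

// Round addr up to a multiple of sizeof(int64_t); assumes a non-negative
// integral input (sketch, not the real helper).
template <typename T>
T sketch_align_to_int64(T addr) {
  constexpr T mask = static_cast<T>(sizeof(int64_t) - 1);
  return (addr + mask) & ~mask;
}

For example, sketch_align_to_int64<uint64_t>(13) yields 16 and leaves 16 unchanged.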
#define VLOG(n)
Definition: Logger.h:388
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
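allocateCountDistinctBitmap hands out the per-group bitmaps whose aggregate footprint check_total_bitmap_memory guards against. The sketch below shows what such a bitmap is for: each value maps to one bit and the distinct count is the number of set bits. The mapping-by-subtraction and the sizing here are illustrative assumptions, not the engine's hashing:

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <vector>

// Toy count-distinct-by-bitmap: set one bit per observed value, then count bits.
// Exact when the value range fits the bitmap; approximate once values collide.
size_t sketch_bitmap_count_distinct(const std::vector<int64_t>& values,
                                    const int64_t min_val,
                                    const size_t bitmap_sz_bits) {
  std::vector<uint8_t> bitmap((bitmap_sz_bits + 7) / 8, 0);
  for (const int64_t v : values) {
    const uint64_t bit = static_cast<uint64_t>(v - min_val) % bitmap_sz_bits;
    bitmap[bit >> 3] |= static_cast<uint8_t>(1u << (bit & 7));
  }
  size_t distinct = 0;
  for (const uint8_t byte : bitmap) {
    distinct += std::bitset<8>(byte).count();
  }
  return distinct;
}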
int get_input_idx(RelAlgExecutionUnit const &ra_exe_unit, const shared::TableKey &outer_table_key)
std::vector< std::vector< int64_t > > get_col_frag_offsets(const std::vector< Analyzer::Expr * > &target_exprs, const std::vector< std::vector< uint64_t >> &table_frag_offsets)