OmniSciDB  a5dc49c757
QueryMemoryDescriptor.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryDescriptor.h"
18 
19 #include "../Execute.h"
20 #include "../ExpressionRewrite.h"
21 #include "../GroupByAndAggregate.h"
22 #include "../StreamingTopN.h"
23 #include "../UsedColumnsVisitor.h"
24 #include "ColSlotContext.h"
25 
26 #include <boost/algorithm/cxx11/any_of.hpp>
27 
29 extern bool g_enable_columnar_output;
30 extern size_t g_streaming_topn_max;
31 
32 namespace {
33 
34 bool is_int_and_no_bigger_than(const SQLTypeInfo& ti, const size_t byte_width) {
35  if (!ti.is_integer()) {
36  return false;
37  }
38  return get_bit_width(ti) <= (byte_width * 8);
39 }
40 
41 bool is_valid_int32_range(const ExpressionRange& range) {
42  return range.getIntMin() > INT32_MIN && range.getIntMax() < EMPTY_KEY_32 - 1;
43 }
44 
45 std::vector<int64_t> target_expr_group_by_indices(
46  const std::list<std::shared_ptr<Analyzer::Expr>>& groupby_exprs,
47  const std::vector<Analyzer::Expr*>& target_exprs) {
48  std::vector<int64_t> indices(target_exprs.size(), -1);
49  for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) {
50  const auto target_expr = target_exprs[target_idx];
51  if (dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
52  continue;
53  }
54  const auto var_expr = dynamic_cast<const Analyzer::Var*>(target_expr);
55  if (var_expr && var_expr->get_which_row() == Analyzer::Var::kGROUPBY) {
56  indices[target_idx] = var_expr->get_varno() - 1;
57  continue;
58  }
59  }
60  return indices;
61 }
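// Illustrative note: for a query such as SELECT x, COUNT(*) FROM t GROUP BY x, the
// non-aggregate target x arrives as an Analyzer::Var over group-by slot 1, so the
// returned vector is {0, -1}: x is read back from the group key itself, while the
// aggregate keeps -1 and gets its own output slot.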
62 
63 std::vector<int64_t> target_expr_proj_indices(const RelAlgExecutionUnit& ra_exe_unit) {
64  if (ra_exe_unit.input_descs.size() > 1 ||
65  !ra_exe_unit.sort_info.order_entries.empty()) {
66  return {};
67  }
68  std::vector<int64_t> target_indices(ra_exe_unit.target_exprs.size(), -1);
69  UsedColumnsVisitor columns_visitor;
70  std::unordered_set<shared::ColumnKey> used_columns;
71  for (const auto& simple_qual : ra_exe_unit.simple_quals) {
72  const auto crt_used_columns = columns_visitor.visit(simple_qual.get());
73  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
74  }
75  for (const auto& qual : ra_exe_unit.quals) {
76  const auto crt_used_columns = columns_visitor.visit(qual.get());
77  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
78  }
79  for (const auto& target : ra_exe_unit.target_exprs) {
80  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target);
81  if (col_var) {
82  const auto cd = get_column_descriptor_maybe(col_var->getColumnKey());
83  if (!cd || !cd->isVirtualCol) {
84  continue;
85  }
86  }
87  const auto crt_used_columns = columns_visitor.visit(target);
88  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
89  }
90  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
91  ++target_idx) {
92  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
93  CHECK(target_expr);
94  const auto& ti = target_expr->get_type_info();
95  // TODO: add proper lazy fetch for varlen types in result set
96  if (ti.is_varlen()) {
97  continue;
98  }
99  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
100  if (!col_var) {
101  continue;
102  }
103  if (!ti.is_varlen() &&
104  used_columns.find(col_var->getColumnKey()) == used_columns.end()) {
105  // setting target index to be zero so that later it can be decoded properly (in lazy
106  // fetch, the zeroth target index indicates the corresponding rowid column for the
107  // projected entry)
108  target_indices[target_idx] = 0;
109  }
110  }
111  return target_indices;
112 }
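// Illustrative note: in a plain projection such as SELECT a, b FROM t WHERE c > 0,
// targets a and b are not referenced by any qualifier, so their indices are set to 0
// (lazy fetch via the rowid column); a target that also appears in a filter, or any
// varlen target, keeps -1 and is materialized in the output buffer.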
113 
114 int8_t pick_baseline_key_component_width(const ExpressionRange& range,
115  const size_t group_col_width) {
116  if (range.getType() == ExpressionRangeType::Invalid) {
117  return sizeof(int64_t);
118  }
119  switch (range.getType()) {
120  case ExpressionRangeType::Integer:
121  if (group_col_width == sizeof(int64_t) && range.hasNulls()) {
122  return sizeof(int64_t);
123  }
124  return is_valid_int32_range(range) ? sizeof(int32_t) : sizeof(int64_t);
125  case ExpressionRangeType::Float:
126  case ExpressionRangeType::Double:
127  return sizeof(int64_t); // No compaction for floating point yet.
128  default:
129  UNREACHABLE();
130  }
131  return sizeof(int64_t);
132 }
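// Illustrative note: a group key whose integer range fits strictly inside
// (INT32_MIN, EMPTY_KEY_32 - 1) is stored in 4 bytes; a nullable 64-bit column, a
// floating point key, or an invalid range falls back to the full 8-byte component.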
133 
134 // TODO(miyu): make sure following setting of compact width is correct in all cases.
135 int8_t pick_baseline_key_width(const RelAlgExecutionUnit& ra_exe_unit,
136  const std::vector<InputTableInfo>& query_infos,
137  const Executor* executor) {
138  int8_t compact_width{4};
139  for (const auto& groupby_expr : ra_exe_unit.groupby_exprs) {
140  const auto expr_range = getExpressionRange(groupby_expr.get(), query_infos, executor);
141  compact_width = std::max(compact_width,
142  pick_baseline_key_component_width(
143  expr_range, groupby_expr->get_type_info().get_size()));
144  }
145  return compact_width;
146 }
147 
148 bool use_streaming_top_n(const RelAlgExecutionUnit& ra_exe_unit,
149  const bool output_columnar) {
150  if (g_cluster) {
151  return false; // TODO(miyu)
152  }
153 
154  for (const auto target_expr : ra_exe_unit.target_exprs) {
155  if (dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
156  return false;
157  }
158  if (dynamic_cast<const Analyzer::WindowFunction*>(target_expr)) {
159  return false;
160  }
161  }
162 
163  // TODO: Allow streaming top n for columnar output
164  auto limit_value = ra_exe_unit.sort_info.limit.value_or(0);
165  if (!output_columnar && ra_exe_unit.sort_info.order_entries.size() == 1 &&
166  limit_value > 0 &&
167  ra_exe_unit.sort_info.algorithm == SortAlgorithm::StreamingTopN) {
168  const auto only_order_entry = ra_exe_unit.sort_info.order_entries.front();
169  CHECK_GT(only_order_entry.tle_no, int(0));
170  CHECK_LE(static_cast<size_t>(only_order_entry.tle_no),
171  ra_exe_unit.target_exprs.size());
172  const auto order_entry_expr = ra_exe_unit.target_exprs[only_order_entry.tle_no - 1];
173  const auto n = ra_exe_unit.sort_info.offset + limit_value;
174  if ((order_entry_expr->get_type_info().is_number() ||
175  order_entry_expr->get_type_info().is_time()) &&
176  n <= g_streaming_topn_max) {
177  return true;
178  }
179  }
180 
181  return false;
182 }
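// Illustrative note: a query shaped like SELECT x FROM t ORDER BY x DESC LIMIT 100
// (no aggregates or window functions, a single numeric or time order key, row-wise
// output, and offset + limit <= g_streaming_topn_max) takes the streaming top-n path;
// columnar output or distributed (g_cluster) execution disables it.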
183 
184 template <class T>
185 inline std::vector<int8_t> get_col_byte_widths(const T& col_expr_list) {
186  std::vector<int8_t> col_widths;
187  size_t col_expr_idx = 0;
188  for (const auto& col_expr : col_expr_list) {
189  if (!col_expr) {
190  // row index
191  col_widths.push_back(sizeof(int64_t));
192  } else {
193  bool is_varlen_projection{false};
194  if constexpr (std::is_same<T, std::list<std::shared_ptr<Analyzer::Expr>>>::value) {
195  is_varlen_projection =
196  !(std::dynamic_pointer_cast<const Analyzer::GeoExpr>(col_expr) == nullptr);
197  } else {
198  is_varlen_projection =
199  !(dynamic_cast<const Analyzer::GeoExpr*>(col_expr) == nullptr);
200  }
201 
202  if (is_varlen_projection) {
203  col_widths.push_back(sizeof(int64_t));
204  ++col_expr_idx;
205  continue;
206  }
207  const auto agg_info = get_target_info(col_expr, g_bigint_count);
208  const auto chosen_type = get_compact_type(agg_info);
209  if ((chosen_type.is_string() && chosen_type.get_compression() == kENCODING_NONE) ||
210  chosen_type.is_array()) {
211  col_widths.push_back(sizeof(int64_t));
212  col_widths.push_back(sizeof(int64_t));
213  ++col_expr_idx;
214  continue;
215  }
216  if (chosen_type.is_geometry()) {
217  for (auto i = 0; i < chosen_type.get_physical_coord_cols(); ++i) {
218  col_widths.push_back(sizeof(int64_t));
219  col_widths.push_back(sizeof(int64_t));
220  }
221  ++col_expr_idx;
222  continue;
223  }
224  const auto col_expr_bitwidth = get_bit_width(chosen_type);
225  CHECK_EQ(size_t(0), col_expr_bitwidth % 8);
226  col_widths.push_back(static_cast<int8_t>(col_expr_bitwidth >> 3));
227  // for average, we'll need to keep the count as well
228  if (agg_info.agg_kind == kAVG) {
229  CHECK(agg_info.is_agg);
230  col_widths.push_back(sizeof(int64_t));
231  }
232  }
233  ++col_expr_idx;
234  }
235  return col_widths;
236 }
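// Illustrative note: each target normally contributes one slot of its compact byte
// width; AVG adds a second 8-byte slot for the count, none-encoded strings and arrays
// take a pointer/length pair of 8-byte slots, and geometry takes one such pair per
// physical coordinate column.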
237 
238 } // namespace
239 
240 std::unique_ptr<QueryMemoryDescriptor> QueryMemoryDescriptor::init(
241  const Executor* executor,
242  const RelAlgExecutionUnit& ra_exe_unit,
243  const std::vector<InputTableInfo>& query_infos,
244  const ColRangeInfo& col_range_info,
245  const KeylessInfo& keyless_info,
246  const bool allow_multifrag,
247  const ExecutorDeviceType device_type,
248  const int8_t crt_min_byte_width,
249  const bool sort_on_gpu_hint,
250  const size_t shard_count,
251  const size_t max_groups_buffer_entry_count,
252  RenderInfo* render_info,
253  const ApproxQuantileDescriptors& approx_quantile_descriptors,
254  const CountDistinctDescriptors count_distinct_descriptors,
255  const bool must_use_baseline_sort,
256  const bool output_columnar_hint,
257  const bool streaming_top_n_hint,
258  const bool threads_can_reuse_group_by_buffers) {
259  auto group_col_widths = get_col_byte_widths(ra_exe_unit.groupby_exprs);
260  const bool is_group_by{!group_col_widths.empty()};
261 
262  auto col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, {});
263 
264  const auto min_slot_size = QueryMemoryDescriptor::pick_target_compact_width(
265  ra_exe_unit, query_infos, crt_min_byte_width);
266 
267  col_slot_context.setAllSlotsPaddedSize(min_slot_size);
268  col_slot_context.validate();
269 
270  if (!is_group_by) {
271  CHECK(!must_use_baseline_sort);
272 
273  return std::make_unique<QueryMemoryDescriptor>(
274  executor,
275  ra_exe_unit,
276  query_infos,
277  allow_multifrag,
278  false,
279  false,
280  -1,
281  ColRangeInfo{ra_exe_unit.estimator ? QueryDescriptionType::Estimator
282  : QueryDescriptionType::NonGroupedAggregate,
283  0,
284  0,
285  0,
286  false},
287  col_slot_context,
288  std::vector<int8_t>{},
289  /*group_col_compact_width=*/0,
290  std::vector<int64_t>{},
291  /*entry_count=*/1,
292  approx_quantile_descriptors,
293  count_distinct_descriptors,
294  false,
295  output_columnar_hint,
296  render_info && render_info->isInSitu(),
297  must_use_baseline_sort,
298  /*use_streaming_top_n=*/false,
299  threads_can_reuse_group_by_buffers);
300  }
301 
302  size_t entry_count = 1;
303  auto actual_col_range_info = col_range_info;
304  bool interleaved_bins_on_gpu = false;
305  bool keyless_hash = false;
306  bool streaming_top_n = false;
307  int8_t group_col_compact_width = 0;
308  int32_t idx_target_as_key = -1;
309  auto output_columnar = output_columnar_hint;
310  std::vector<int64_t> target_groupby_indices;
311 
312  switch (col_range_info.hash_type_) {
313  case QueryDescriptionType::GroupByPerfectHash: {
314  if (render_info) {
315  // TODO(croot): this can be removed now thanks to the more centralized
316  // NonInsituQueryClassifier code, but keeping it just in case
317  render_info->setNonInSitu();
318  }
319  // keyless hash: whether or not group columns are stored at the beginning of the
320  // output buffer
321  keyless_hash =
322  (!sort_on_gpu_hint ||
323  !QueryMemoryDescriptor::many_entries(
324  col_range_info.max, col_range_info.min, col_range_info.bucket)) &&
325  !col_range_info.bucket && !must_use_baseline_sort && keyless_info.keyless;
326 
327  // if keyless, then this target index indicates whether an entry is empty or not
328  // (acts as a key)
329  idx_target_as_key = keyless_info.target_index;
330 
331  if (group_col_widths.size() > 1) {
332  // col range info max contains the expected cardinality of the output
333  entry_count = static_cast<size_t>(actual_col_range_info.max);
334  actual_col_range_info.bucket = 0;
335  } else {
336  // single column perfect hash
337  entry_count = std::max(
338  GroupByAndAggregate::getBucketedCardinality(col_range_info), int64_t(1));
339  const size_t interleaved_max_threshold{512};
340 
341  if (must_use_baseline_sort) {
342  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
343  ra_exe_unit.target_exprs);
344  col_slot_context =
345  ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
346  }
347 
348  bool has_varlen_sample_agg = false;
349  for (const auto& target_expr : ra_exe_unit.target_exprs) {
350  if (target_expr->get_contains_agg()) {
351  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
352  CHECK(agg_expr);
353  if (agg_expr->get_aggtype() == kSAMPLE &&
354  agg_expr->get_type_info().is_varlen()) {
355  has_varlen_sample_agg = true;
356  break;
357  }
358  }
359  }
360 
361  interleaved_bins_on_gpu = keyless_hash && !has_varlen_sample_agg &&
362  (entry_count <= interleaved_max_threshold) &&
363  (device_type == ExecutorDeviceType::GPU) &&
364  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
365  count_distinct_descriptors) &&
366  !output_columnar;
367  }
368  break;
369  }
370  case QueryDescriptionType::GroupByBaselineHash: {
371  if (render_info) {
372  // TODO(croot): this can be removed now thanks to the more centralized
373  // NonInsituQueryClassifier code, but keeping it just in case
374  render_info->setNonInSitu();
375  }
376  entry_count = shard_count
377  ? (max_groups_buffer_entry_count + shard_count - 1) / shard_count
378  : max_groups_buffer_entry_count;
379  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
380  ra_exe_unit.target_exprs);
381  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
382 
383  group_col_compact_width =
384  output_columnar ? 8
385  : pick_baseline_key_width(ra_exe_unit, query_infos, executor);
386 
387  actual_col_range_info =
388  {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
389  break;
390  }
391  case QueryDescriptionType::Projection: {
392  CHECK(!must_use_baseline_sort);
393 
394  if (streaming_top_n_hint && use_streaming_top_n(ra_exe_unit, output_columnar)) {
395  streaming_top_n = true;
396  entry_count =
397  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
398  } else {
399  if (ra_exe_unit.use_bump_allocator) {
400  output_columnar = false;
401  entry_count = 0;
402  } else {
403  entry_count = ra_exe_unit.scan_limit
404  ? static_cast<size_t>(ra_exe_unit.scan_limit)
405  : max_groups_buffer_entry_count;
406  }
407  }
408 
409  target_groupby_indices = executor->plan_state_->allow_lazy_fetch_
410  ? target_expr_proj_indices(ra_exe_unit)
411  : std::vector<int64_t>{};
412 
413  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
414  break;
415  }
416  default:
417  UNREACHABLE() << "Unknown query type";
418  }
419 
420  return std::make_unique<QueryMemoryDescriptor>(executor,
421  ra_exe_unit,
422  query_infos,
423  allow_multifrag,
424  keyless_hash,
425  interleaved_bins_on_gpu,
426  idx_target_as_key,
427  actual_col_range_info,
428  col_slot_context,
429  group_col_widths,
430  group_col_compact_width,
431  target_groupby_indices,
432  entry_count,
433  approx_quantile_descriptors,
434  count_distinct_descriptors,
435  sort_on_gpu_hint,
436  output_columnar,
437  render_info && render_info->isInSitu(),
438  must_use_baseline_sort,
439  streaming_top_n,
440  threads_can_reuse_group_by_buffers);
441 }
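// Note: the branches above map ColRangeInfo onto the three main output layouts:
// perfect hash (entry count derived from the bucketed key range or the expected
// cardinality for multi-column keys), baseline hash (entry count from
// max_groups_buffer_entry_count with wide, possibly compacted keys), and projection
// (entry count from the scan limit, streaming top-n heap, or bump allocator).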
442 
443 namespace {
444 template <SQLAgg... agg_types>
445 bool any_of(std::vector<Analyzer::Expr*> const& target_exprs) {
446  return boost::algorithm::any_of(target_exprs, [=](Analyzer::Expr const* expr) {
447  auto const* const agg = dynamic_cast<Analyzer::AggExpr const*>(expr);
448  return agg && (... || (agg_types == agg->get_aggtype()));
449  });
450 }
451 } // namespace
452 
453 QueryMemoryDescriptor::QueryMemoryDescriptor(
454  const Executor* executor,
455  const RelAlgExecutionUnit& ra_exe_unit,
456  const std::vector<InputTableInfo>& query_infos,
457  const bool allow_multifrag,
458  const bool keyless_hash,
459  const bool interleaved_bins_on_gpu,
460  const int32_t idx_target_as_key,
461  const ColRangeInfo& col_range_info,
462  const ColSlotContext& col_slot_context,
463  const std::vector<int8_t>& group_col_widths,
464  const int8_t group_col_compact_width,
465  const std::vector<int64_t>& target_groupby_indices,
466  const size_t entry_count,
467  const ApproxQuantileDescriptors& approx_quantile_descriptors,
468  const CountDistinctDescriptors count_distinct_descriptors,
469  const bool sort_on_gpu_hint,
470  const bool output_columnar_hint,
471  const bool render_output,
472  const bool must_use_baseline_sort,
473  const bool use_streaming_top_n,
474  const bool threads_can_reuse_group_by_buffers)
475  : executor_(executor)
476  , allow_multifrag_(allow_multifrag)
477  , query_desc_type_(col_range_info.hash_type_)
478  , keyless_hash_(keyless_hash)
479  , interleaved_bins_on_gpu_(interleaved_bins_on_gpu)
480  , idx_target_as_key_(idx_target_as_key)
481  , group_col_widths_(group_col_widths)
482  , group_col_compact_width_(group_col_compact_width)
483  , target_groupby_indices_(target_groupby_indices)
484  , entry_count_(entry_count)
485  , min_val_(col_range_info.min)
486  , max_val_(col_range_info.max)
487  , bucket_(col_range_info.bucket)
488  , has_nulls_(col_range_info.has_nulls)
489  , approx_quantile_descriptors_(approx_quantile_descriptors)
490  , count_distinct_descriptors_(count_distinct_descriptors)
491  , output_columnar_(false)
492  , render_output_(render_output)
493  , must_use_baseline_sort_(must_use_baseline_sort)
494  , use_streaming_top_n_(use_streaming_top_n)
495  , threads_can_reuse_group_by_buffers_(threads_can_reuse_group_by_buffers)
496  , force_4byte_float_(false)
497  , col_slot_context_(col_slot_context)
498  , num_available_threads_(cpu_threads()) {
499  col_slot_context_.setAllUnsetSlotsPaddedSize(8);
500  col_slot_context_.validate();
502 
503  sort_on_gpu_ = sort_on_gpu_hint && canOutputColumnar() && !keyless_hash_;
504  if (sort_on_gpu_) {
505  CHECK(!ra_exe_unit.use_bump_allocator);
506  output_columnar_ = true;
507  } else {
508  switch (query_desc_type_) {
509  case QueryDescriptionType::Projection:
510  output_columnar_ = output_columnar_hint;
511  break;
512  case QueryDescriptionType::GroupByPerfectHash:
513  output_columnar_ = output_columnar_hint &&
514  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
515  count_distinct_descriptors_) &&
516  !any_of<kAPPROX_QUANTILE, kMODE>(ra_exe_unit.target_exprs);
517  break;
518  case QueryDescriptionType::GroupByBaselineHash:
519  output_columnar_ = output_columnar_hint;
520  break;
521  case QueryDescriptionType::NonGroupedAggregate:
522  output_columnar_ = output_columnar_hint &&
523  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
524  count_distinct_descriptors_) &&
525  !any_of<kAPPROX_QUANTILE, kMODE>(ra_exe_unit.target_exprs);
526  break;
527  default:
528  output_columnar_ = false;
529  break;
530  }
531  }
532 
533  if (isLogicalSizedColumnsAllowed()) {
534  // TODO(adb): Ensure fixed size buffer allocations are correct with all logical column
535  // sizes
536  CHECK(!ra_exe_unit.use_bump_allocator);
537  col_slot_context_.setAllSlotsPaddedSizeToLogicalSize();
538  col_slot_context_.validate();
539  }
540 
541 #ifdef HAVE_CUDA
542  // Check Streaming Top N heap usage, bail if > max slab size, CUDA ONLY
543  if (use_streaming_top_n_ && executor->getDataMgr()->gpusPresent()) {
544  const auto thread_count = executor->blockSize() * executor->gridSize();
545  const auto total_buff_size =
546  streaming_top_n::get_heap_size(getRowSize(), getEntryCount(), thread_count);
547  if (total_buff_size > executor_->maxGpuSlabSize()) {
548  throw StreamingTopNOOM(total_buff_size);
549  }
550  }
551 #endif
552 }
553 
554 QueryMemoryDescriptor::QueryMemoryDescriptor()
555  : executor_(nullptr)
556  , allow_multifrag_(false)
557  , query_desc_type_(QueryDescriptionType::Projection)
558  , keyless_hash_(false)
559  , interleaved_bins_on_gpu_(false)
560  , idx_target_as_key_(0)
561  , group_col_compact_width_(0)
562  , entry_count_(0)
563  , min_val_(0)
564  , max_val_(0)
565  , bucket_(0)
566  , has_nulls_(false)
567  , sort_on_gpu_(false)
568  , output_columnar_(false)
569  , render_output_(false)
570  , must_use_baseline_sort_(false)
571  , use_streaming_top_n_(false)
572  , threads_can_reuse_group_by_buffers_(false)
573  , force_4byte_float_(false) {}
574 
575 QueryMemoryDescriptor::QueryMemoryDescriptor(const Executor* executor,
576  const size_t entry_count,
577  const QueryDescriptionType query_desc_type)
578  : executor_(executor)
579  , allow_multifrag_(false)
580  , query_desc_type_(query_desc_type)
581  , keyless_hash_(false)
582  , interleaved_bins_on_gpu_(false)
583  , idx_target_as_key_(0)
584  , group_col_compact_width_(0)
585  , entry_count_(entry_count)
586  , min_val_(0)
587  , max_val_(0)
588  , bucket_(0)
589  , has_nulls_(false)
590  , sort_on_gpu_(false)
591  , output_columnar_(false)
592  , render_output_(false)
593  , must_use_baseline_sort_(false)
594  , use_streaming_top_n_(false)
595  , threads_can_reuse_group_by_buffers_(false)
596  , force_4byte_float_(false)
597  , num_available_threads_(cpu_threads()) {
598  if (query_desc_type == QueryDescriptionType::TableFunction) {
599  // Table functions output columns are always columnar
600  output_columnar_ = true;
601  }
602 }
603 
604 QueryMemoryDescriptor::QueryMemoryDescriptor(const QueryDescriptionType query_desc_type,
605  const int64_t min_val,
606  const int64_t max_val,
607  const bool has_nulls,
608  const std::vector<int8_t>& group_col_widths)
609  : executor_(nullptr)
610  , allow_multifrag_(false)
611  , query_desc_type_(query_desc_type)
612  , keyless_hash_(false)
613  , interleaved_bins_on_gpu_(false)
614  , idx_target_as_key_(0)
615  , group_col_widths_(group_col_widths)
616  , group_col_compact_width_(0)
617  , entry_count_(0)
618  , min_val_(min_val)
619  , max_val_(max_val)
620  , bucket_(0)
621  , has_nulls_(false)
622  , sort_on_gpu_(false)
623  , output_columnar_(false)
624  , render_output_(false)
625  , must_use_baseline_sort_(false)
626  , use_streaming_top_n_(false)
627  , threads_can_reuse_group_by_buffers_(false)
628  , force_4byte_float_(false)
629  , num_available_threads_(cpu_threads()) {}
630 
631 bool QueryMemoryDescriptor::operator==(const QueryMemoryDescriptor& other) const {
632  // Note that this method does not check ptr reference members (e.g. executor_) or
633  // entry_count_
634  if (query_desc_type_ != other.query_desc_type_) {
635  return false;
636  }
637  if (keyless_hash_ != other.keyless_hash_) {
638  return false;
639  }
640  if (interleaved_bins_on_gpu_ != other.interleaved_bins_on_gpu_) {
641  return false;
642  }
643  if (idx_target_as_key_ != other.idx_target_as_key_) {
644  return false;
645  }
646  if (force_4byte_float_ != other.force_4byte_float_) {
647  return false;
648  }
649  if (group_col_widths_ != other.group_col_widths_) {
650  return false;
651  }
652  if (group_col_compact_width_ != other.group_col_compact_width_) {
653  return false;
654  }
655  if (target_groupby_indices_ != other.target_groupby_indices_) {
656  return false;
657  }
658  if (min_val_ != other.min_val_) {
659  return false;
660  }
661  if (max_val_ != other.max_val_) {
662  return false;
663  }
664  if (bucket_ != other.bucket_) {
665  return false;
666  }
667  if (has_nulls_ != other.has_nulls_) {
668  return false;
669  }
670  if (count_distinct_descriptors_.size() != other.count_distinct_descriptors_.size()) {
671  return false;
672  } else {
673  // Count distinct descriptors can legitimately differ in device only.
674  for (size_t i = 0; i < count_distinct_descriptors_.size(); ++i) {
675  auto ref_count_distinct_desc = other.count_distinct_descriptors_[i];
676  auto count_distinct_desc = count_distinct_descriptors_[i];
677  count_distinct_desc.device_type = ref_count_distinct_desc.device_type;
678  if (ref_count_distinct_desc != count_distinct_desc) {
679  return false;
680  }
681  }
682  }
683  if (sort_on_gpu_ != other.sort_on_gpu_) {
684  return false;
685  }
686  if (output_columnar_ != other.output_columnar_) {
687  return false;
688  }
689  if (col_slot_context_ != other.col_slot_context_) {
690  return false;
691  }
692  if (threads_can_reuse_group_by_buffers_ != other.threads_can_reuse_group_by_buffers_) {
693  return false;
694  }
695  return true;
696 }
697 
698 std::unique_ptr<QueryExecutionContext> QueryMemoryDescriptor::getQueryExecutionContext(
699  const RelAlgExecutionUnit& ra_exe_unit,
700  const Executor* executor,
701  const ExecutorDeviceType device_type,
702  const ExecutorDispatchMode dispatch_mode,
703  const int device_id,
704  const shared::TableKey& outer_table_key,
705  const int64_t num_rows,
706  const std::vector<std::vector<const int8_t*>>& col_buffers,
707  const std::vector<std::vector<uint64_t>>& frag_offsets,
708  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
709  const bool output_columnar,
710  const bool sort_on_gpu,
711  const size_t thread_idx,
712  RenderInfo* render_info) const {
713  auto timer = DEBUG_TIMER(__func__);
714  if (frag_offsets.empty()) {
715  return nullptr;
716  }
717  return std::unique_ptr<QueryExecutionContext>(
718  new QueryExecutionContext(ra_exe_unit,
719  *this,
720  executor,
721  device_type,
722  dispatch_mode,
723  device_id,
724  outer_table_key,
725  num_rows,
726  col_buffers,
727  frag_offsets,
728  row_set_mem_owner,
729  output_columnar,
730  sort_on_gpu,
731  thread_idx,
732  render_info));
733 }
734 
735 int8_t QueryMemoryDescriptor::pick_target_compact_width(
736  const RelAlgExecutionUnit& ra_exe_unit,
737  const std::vector<InputTableInfo>& query_infos,
738  const int8_t crt_min_byte_width) {
739  if (g_bigint_count) {
740  return sizeof(int64_t);
741  }
742  int8_t compact_width{0};
743  auto col_it = ra_exe_unit.input_col_descs.begin();
744  auto const end = ra_exe_unit.input_col_descs.end();
745  int unnest_array_col_id{std::numeric_limits<int>::min()};
746  for (const auto& groupby_expr : ra_exe_unit.groupby_exprs) {
747  const auto uoper = dynamic_cast<Analyzer::UOper*>(groupby_expr.get());
748  if (uoper && uoper->get_optype() == kUNNEST) {
749  const auto& arg_ti = uoper->get_operand()->get_type_info();
750  CHECK(arg_ti.is_array());
751  const auto& elem_ti = arg_ti.get_elem_type();
752  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
753  unnest_array_col_id = (*col_it)->getColId();
754  } else {
755  compact_width = crt_min_byte_width;
756  break;
757  }
758  }
759  if (col_it != end) {
760  ++col_it;
761  }
762  }
763  if (!compact_width &&
764  (ra_exe_unit.groupby_exprs.size() != 1 || !ra_exe_unit.groupby_exprs.front())) {
765  compact_width = crt_min_byte_width;
766  }
767  if (!compact_width) {
768  col_it = ra_exe_unit.input_col_descs.begin();
769  std::advance(col_it, ra_exe_unit.groupby_exprs.size());
770  for (const auto target : ra_exe_unit.target_exprs) {
771  const auto& ti = target->get_type_info();
772  const auto agg = dynamic_cast<const Analyzer::AggExpr*>(target);
773  if (agg && agg->get_arg()) {
774  compact_width = crt_min_byte_width;
775  break;
776  }
777 
778  if (agg) {
779  CHECK_EQ(kCOUNT, agg->get_aggtype());
780  CHECK(!agg->get_is_distinct());
781  if (col_it != end) {
782  ++col_it;
783  }
784  continue;
785  }
786 
787  if (is_int_and_no_bigger_than(ti, 4) ||
788  (ti.is_string() && ti.get_compression() == kENCODING_DICT)) {
789  if (col_it != end) {
790  ++col_it;
791  }
792  continue;
793  }
794 
795  const auto uoper = dynamic_cast<Analyzer::UOper*>(target);
796  if (uoper && uoper->get_optype() == kUNNEST &&
797  (*col_it)->getColId() == unnest_array_col_id) {
798  const auto arg_ti = uoper->get_operand()->get_type_info();
799  CHECK(arg_ti.is_array());
800  const auto& elem_ti = arg_ti.get_elem_type();
801  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
802  if (col_it != end) {
803  ++col_it;
804  }
805  continue;
806  }
807  }
808 
809  compact_width = crt_min_byte_width;
810  break;
811  }
812  }
813  if (!compact_width) {
814  size_t total_tuples{0};
815  for (const auto& qi : query_infos) {
816  total_tuples += qi.info.getNumTuples();
817  }
818  return total_tuples <= static_cast<size_t>(std::numeric_limits<uint32_t>::max()) ||
819  unnest_array_col_id != std::numeric_limits<int>::min()
820  ? 4
821  : crt_min_byte_width;
822  } else {
823  // TODO(miyu): relax this condition to allow more cases just w/o padding
824  for (auto wid : get_col_byte_widths(ra_exe_unit.target_exprs)) {
825  compact_width = std::max(compact_width, wid);
826  }
827  return compact_width;
828  }
829 }
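// Illustrative note: with g_bigint_count disabled, a single-column GROUP BY whose
// targets are all narrow integers, dictionary-encoded strings, or plain COUNTs can use
// 4-byte compact slots as long as the inputs hold at most ~4.29 billion tuples; any
// wider aggregate argument falls back to the caller-provided minimum byte width.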
830 
831 size_t QueryMemoryDescriptor::getColsSize() const {
832  return col_slot_context_.getAllSlotsAlignedPaddedSize();
833 }
834 
835 size_t QueryMemoryDescriptor::getRowSize() const {
836  CHECK(!output_columnar_);
837  size_t total_bytes{0};
838  if (keyless_hash_) {
839  // ignore, there's no group column in the output buffer
840  CHECK(getGroupbyColCount() == 1);
841  } else {
842  total_bytes += group_col_widths_.size() * getEffectiveKeyWidth();
843  total_bytes = align_to_int64(total_bytes);
844  }
845  total_bytes += getColsSize();
846  return align_to_int64(total_bytes);
847 }
848 
849 size_t QueryMemoryDescriptor::getWarpCount() const {
850  return (interleaved_bins_on_gpu_ ? executor_->warpSize() : 1);
851 }
852 
853 size_t QueryMemoryDescriptor::getCompactByteWidth() const {
854  return col_slot_context_.getCompactByteWidth();
855 }
856 
862 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers() const {
863  CHECK(output_columnar_);
864  return col_slot_context_.getTotalBytesOfColumnarBuffers(entry_count_);
865 }
866 
871 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers(
872  const size_t num_entries_per_column) const {
873  return col_slot_context_.getTotalBytesOfColumnarBuffers(num_entries_per_column);
874 }
875 
885 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarProjections(
886  const size_t projection_count) const {
887  constexpr size_t row_index_width = sizeof(int64_t);
888  return getTotalBytesOfColumnarBuffers(projection_count) +
889  row_index_width * projection_count;
890 }
891 
892 size_t QueryMemoryDescriptor::getColOnlyOffInBytes(const size_t col_idx) const {
893  return col_slot_context_.getColOnlyOffInBytes(col_idx);
894 }
895 
896 /*
897  * Returns the memory offset in bytes for a specific agg column in the output
898  * memory buffer. Depending on the query type, there may be some extra portion
899  * of memory prepended at the beginning of the buffer. A brief description of
900  * the memory layout is as follows:
901  * 1. projections: index column (64bit) + all target columns
902  * 2. group by: all group columns (64-bit each) + all agg columns
903  * 2a. if keyless, there is no prepended group column stored at the beginning
904  */
905 size_t QueryMemoryDescriptor::getColOffInBytes(const size_t col_idx) const {
906  const auto warp_count = getWarpCount();
907  if (output_columnar_) {
908  CHECK_EQ(size_t(1), warp_count);
909  size_t offset{0};
910  if (!keyless_hash_) {
911  offset += getPrependedGroupBufferSizeInBytes();
912  }
914  for (size_t index = 0; index < col_idx; ++index) {
915  int8_t column_width = getPaddedSlotWidthBytes(index);
916  if (column_width > 0) {
917  offset += align_to_int64(column_width * entry_count_);
918  } else {
919  int64_t flatbuffer_size = getFlatBufferSize(index);
920  CHECK_GT(flatbuffer_size, 0);
921  offset += align_to_int64(flatbuffer_size);
922  }
923  }
924  } else {
925  for (size_t index = 0; index < col_idx; ++index) {
927  }
928  }
929  return offset;
930  }
931 
932  size_t offset{0};
933  if (keyless_hash_) {
934  // ignore, there's no group column in the output buffer
935  CHECK(getGroupbyColCount() == 1);
936  } else {
937  offset += group_col_widths_.size() * getEffectiveKeyWidth();
938  offset = align_to_int64(offset);
939  }
940  offset += getColOnlyOffInBytes(col_idx);
941  return offset;
942 }
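// Illustrative note (columnar case): with entry_count = 1000 and 4-byte slots, slot 1
// begins align_to_int64(4 * 1000) = 4000 bytes past any prepended group-key buffers;
// in the row-wise case the offset is the aligned group-key prefix plus the
// column-only offset within a row.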
943 
944 int64_t QueryMemoryDescriptor::getPaddedSlotBufferSize(const size_t slot_idx) const {
945  if (checkSlotUsesFlatBufferFormat(slot_idx)) {
946  return align_to_int64(getFlatBufferSize(slot_idx));
947  }
948  int8_t column_width = getPaddedSlotWidthBytes(slot_idx);
949  return align_to_int64(column_width * entry_count_);
950 }
951 
952 /*
953  * Returns the memory offset for a particular group column in the prepended group
954  * columns portion of the memory.
955  */
956 size_t QueryMemoryDescriptor::getPrependedGroupColOffInBytes(
957  const size_t group_idx) const {
958  CHECK(output_columnar_);
959  CHECK(group_idx < getGroupbyColCount());
960  size_t offset{0};
961  for (size_t col_idx = 0; col_idx < group_idx; col_idx++) {
962  // TODO(Saman): relax that int64_bit part immediately
963  offset += align_to_int64(
964  std::max(groupColWidth(col_idx), static_cast<int8_t>(sizeof(int64_t))) *
965  getEntryCount());
966  }
967  return offset;
968 }
969 
970 /*
971  * Returns total amount of memory prepended at the beginning of the output memory
972  * buffer.
973  */
974 size_t QueryMemoryDescriptor::getPrependedGroupBufferSizeInBytes() const {
975  CHECK(output_columnar_);
976  size_t buffer_size{0};
977  for (size_t group_idx = 0; group_idx < getGroupbyColCount(); group_idx++) {
978  buffer_size += align_to_int64(
979  std::max(groupColWidth(group_idx), static_cast<int8_t>(sizeof(int64_t))) *
980  getEntryCount());
981  }
982  return buffer_size;
983 }
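// Illustrative note: each prepended group column currently occupies
// align_to_int64(max(groupColWidth, 8) * entry_count) bytes, i.e. group keys are kept
// 64-bit per entry in columnar buffers for now (see the TODO in
// getPrependedGroupColOffInBytes about relaxing this).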
984 
985 size_t QueryMemoryDescriptor::getColOffInBytesInNextBin(const size_t col_idx) const {
986  auto warp_count = getWarpCount();
987  if (output_columnar_) {
988  CHECK_EQ(size_t(1), group_col_widths_.size());
989  CHECK_EQ(size_t(1), warp_count);
990  return getPaddedSlotWidthBytes(col_idx);
991  }
992 
993  return warp_count * getRowSize();
994 }
995 
996 size_t QueryMemoryDescriptor::getNextColOffInBytes(const int8_t* col_ptr,
997  const size_t bin,
998  const size_t col_idx) const {
999  CHECK(!output_columnar_ || bin < entry_count_);
1000  size_t offset{0};
1001  auto warp_count = getWarpCount();
1002  const auto chosen_bytes = getPaddedSlotWidthBytes(col_idx);
1003  const auto total_slot_count = getSlotCount();
1004  if (col_idx + 1 == total_slot_count) {
1005  if (output_columnar_) {
1006  return (entry_count_ - bin) * chosen_bytes;
1007  } else {
1008  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
1009  }
1010  }
1011 
1012  const auto next_chosen_bytes = getPaddedSlotWidthBytes(col_idx + 1);
1013  if (output_columnar_) {
1014  CHECK_EQ(size_t(1), group_col_widths_.size());
1015  CHECK_EQ(size_t(1), warp_count);
1016 
1017  offset = align_to_int64(entry_count_ * chosen_bytes);
1018 
1019  offset += bin * (next_chosen_bytes - chosen_bytes);
1020  return offset;
1021  }
1022 
1023  if (next_chosen_bytes == sizeof(int64_t)) {
1024  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
1025  } else {
1026  return chosen_bytes;
1027  }
1028 }
1029 
1030 size_t QueryMemoryDescriptor::getNextColOffInBytesRowOnly(const int8_t* col_ptr,
1031  const size_t col_idx) const {
1032  const auto chosen_bytes = getPaddedSlotWidthBytes(col_idx);
1033  const auto total_slot_count = getSlotCount();
1034  if (col_idx + 1 == total_slot_count) {
1035  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
1036  }
1037 
1038  const auto next_chosen_bytes = getPaddedSlotWidthBytes(col_idx + 1);
1039 
1040  if (next_chosen_bytes == sizeof(int64_t)) {
1041  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
1042  } else {
1043  return chosen_bytes;
1044  }
1045 }
1046 
1047 size_t QueryMemoryDescriptor::getBufferSizeBytes(
1048  const RelAlgExecutionUnit& ra_exe_unit,
1049  const unsigned thread_count,
1050  const ExecutorDeviceType device_type) const {
1051  if (use_streaming_top_n_) {
1052  const size_t n =
1053  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
1054  return streaming_top_n::get_heap_size(getRowSize(), n, thread_count);
1055  }
1056  return getBufferSizeBytes(device_type, entry_count_);
1057 }
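// Illustrative note: in the streaming top-n case the buffer is a per-thread heap of
// n = offset + limit rows, sized by streaming_top_n::get_heap_size(getRowSize(), n,
// thread_count); every other case defers to the entry-count based overload below.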
1058 
1071 size_t QueryMemoryDescriptor::getBufferSizeBytes(const ExecutorDeviceType device_type,
1072  const size_t entry_count) const {
1073  if (keyless_hash_ && !output_columnar_) {
1074  CHECK_GE(group_col_widths_.size(), size_t(1));
1075  auto row_bytes = align_to_int64(getColsSize());
1076  return (interleavedBins(device_type) ? executor_->warpSize() : 1) * entry_count *
1077  row_bytes;
1078  }
1079  constexpr size_t row_index_width = sizeof(int64_t);
1080  size_t total_bytes{0};
1081  if (output_columnar_) {
1082  switch (query_desc_type_) {
1083  case QueryDescriptionType::Projection:
1084  total_bytes = row_index_width * entry_count + getTotalBytesOfColumnarBuffers();
1085  break;
1086  case QueryDescriptionType::TableFunction:
1087  total_bytes = getTotalBytesOfColumnarBuffers();
1088  break;
1089  default:
1090  total_bytes = sizeof(int64_t) * group_col_widths_.size() * entry_count +
1091  getTotalBytesOfColumnarBuffers();
1092  break;
1093  }
1094  } else {
1095  total_bytes = getRowSize() * entry_count;
1096  }
1097  return total_bytes;
1098 }
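// Illustrative note: a keyless, row-wise perfect-hash buffer costs
// entry_count * align_to_int64(getColsSize()) bytes, times the warp size when bins are
// interleaved on GPU; e.g. 10,000 entries with a 24-byte row is 240,000 bytes.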
1099 
1100 size_t QueryMemoryDescriptor::getBufferSizeBytes(
1101  const ExecutorDeviceType device_type) const {
1102  return getBufferSizeBytes(device_type, entry_count_);
1103 }
1104 
1105 void QueryMemoryDescriptor::setOutputColumnar(const bool val) {
1106  output_columnar_ = val;
1107  if (isLogicalSizedColumnsAllowed()) {
1108  col_slot_context_.setAllSlotsPaddedSizeToLogicalSize();
1109  }
1110 }
1111 
1112 /*
1113  * Indicates the query types that are currently allowed to use the logical
1114  * sized columns instead of padded sized ones.
1115  */
1116 bool QueryMemoryDescriptor::isLogicalSizedColumnsAllowed() const {
1117  // In distributed mode, result sets are serialized using rowwise iterators, so we use
1118  // consistent slot widths for now
1119  return output_columnar_ && !g_cluster &&
1120  (query_desc_type_ == QueryDescriptionType::Projection ||
1121  query_desc_type_ == QueryDescriptionType::TableFunction);
1122 }
1123 
1124 size_t QueryMemoryDescriptor::getBufferColSlotCount() const {
1125  size_t total_slot_count = col_slot_context_.getSlotCount();
1126 
1127  if (target_groupby_indices_.empty()) {
1128  return total_slot_count;
1129  }
1130  return total_slot_count - std::count_if(target_groupby_indices_.begin(),
1131  target_groupby_indices_.end(),
1132  [](const int64_t i) { return i >= 0; });
1133 }
1134 
1135 bool QueryMemoryDescriptor::usesGetGroupValueFast() const {
1136  return (query_desc_type_ == QueryDescriptionType::GroupByPerfectHash &&
1137  getGroupbyColCount() == 1);
1138 }
1139 
1140 bool QueryMemoryDescriptor::threadsShareMemory() const {
1141  return query_desc_type_ != QueryDescriptionType::NonGroupedAggregate;
1142 }
1143 
1144 bool QueryMemoryDescriptor::blocksShareMemory() const {
1145  if (g_cluster) {
1146  return true;
1147  }
1148  if (!countDescriptorsLogicallyEmpty(count_distinct_descriptors_)) {
1149  return true;
1150  }
1151  if (executor_->isCPUOnly() || render_output_ ||
1156  getGroupbyColCount() > 1)) {
1157  return true;
1158  }
1159  return query_desc_type_ == QueryDescriptionType::GroupByPerfectHash &&
1160  many_entries(max_val_, min_val_, bucket_);
1161 }
1162 
1163 bool QueryMemoryDescriptor::lazyInitGroups(const ExecutorDeviceType device_type) const {
1164  return device_type == ExecutorDeviceType::GPU && !render_output_ &&
1165  countDescriptorsLogicallyEmpty(count_distinct_descriptors_);
1166 }
1167 
1168 bool QueryMemoryDescriptor::interleavedBins(const ExecutorDeviceType device_type) const {
1169  return interleaved_bins_on_gpu_ && device_type == ExecutorDeviceType::GPU;
1170 }
1171 
1172 // TODO(Saman): an implementation detail, so move this out of QMD
1173 bool QueryMemoryDescriptor::isWarpSyncRequired(
1174  const ExecutorDeviceType device_type) const {
1175  if (device_type == ExecutorDeviceType::GPU) {
1176  return executor_->cudaMgr()->isArchVoltaOrGreaterForAll();
1177  }
1178  return false;
1179 }
1180 
1181 size_t QueryMemoryDescriptor::getColCount() const {
1182  return col_slot_context_.getColCount();
1183 }
1184 
1185 size_t QueryMemoryDescriptor::getSlotCount() const {
1186  return col_slot_context_.getSlotCount();
1187 }
1188 
1189 const int8_t QueryMemoryDescriptor::getPaddedSlotWidthBytes(const size_t slot_idx) const {
1190  return col_slot_context_.getSlotInfo(slot_idx).padded_size;
1191 }
1192 
1193 void QueryMemoryDescriptor::setPaddedSlotWidthBytes(const size_t slot_idx,
1194  const int8_t bytes) {
1195  col_slot_context_.setPaddedSlotWidthBytes(slot_idx, bytes);
1196 }
1197 
1198 const int8_t QueryMemoryDescriptor::getLogicalSlotWidthBytes(
1199  const size_t slot_idx) const {
1200  return col_slot_context_.getSlotInfo(slot_idx).logical_size;
1201 }
1202 
1203 const int8_t QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(
1204  const size_t col_idx) const {
1205  const auto& col_slots = col_slot_context_.getSlotsForCol(col_idx);
1206  CHECK_EQ(col_slots.size(), size_t(1));
1207  return col_slots.front();
1208 }
1209 
1210 void QueryMemoryDescriptor::useConsistentSlotWidthSize(const int8_t slot_width_size) {
1211  col_slot_context_.setAllSlotsSize(slot_width_size);
1212 }
1213 
1214 size_t QueryMemoryDescriptor::getRowWidth() const {
1215  // Note: Actual row size may include padding (see ResultSetBufferAccessors.h)
1216  return col_slot_context_.getAllSlotsPaddedSize();
1217 }
1218 
1219 int8_t QueryMemoryDescriptor::updateActualMinByteWidth(
1220  const int8_t actual_min_byte_width) const {
1221  return col_slot_context_.getMinPaddedByteSize(actual_min_byte_width);
1222 }
1223 
1224 void QueryMemoryDescriptor::addColSlotInfo(
1225  const std::vector<std::tuple<int8_t, int8_t>>& slots_for_col) {
1226  col_slot_context_.addColumn(slots_for_col);
1227 }
1228 
1229 void QueryMemoryDescriptor::addColSlotInfoFlatBuffer(const int64_t flatbuffer_size) {
1230  col_slot_context_.addColumnFlatBuffer(flatbuffer_size);
1231 }
1232 
1233 void QueryMemoryDescriptor::clearSlotInfo() {
1234  col_slot_context_.clear();
1235 }
1236 
1237 void QueryMemoryDescriptor::alignPaddedSlots() {
1238  col_slot_context_.alignPaddedSlots(sortOnGpu());
1239 }
1240 
1241 bool QueryMemoryDescriptor::canOutputColumnar() const {
1242  return usesGetGroupValueFast() && threadsShareMemory() && blocksShareMemory() &&
1243  !interleavedBins(ExecutorDeviceType::GPU) &&
1244  countDescriptorsLogicallyEmpty(count_distinct_descriptors_);
1245 }
1246 
1247 std::string QueryMemoryDescriptor::queryDescTypeToString() const {
1248  switch (query_desc_type_) {
1249  case QueryDescriptionType::GroupByPerfectHash:
1250  return "Perfect Hash";
1251  case QueryDescriptionType::GroupByBaselineHash:
1252  return "Baseline Hash";
1253  case QueryDescriptionType::Projection:
1254  return "Projection";
1255  case QueryDescriptionType::TableFunction:
1256  return "Table Function";
1257  case QueryDescriptionType::NonGroupedAggregate:
1258  return "Non-grouped Aggregate";
1259  case QueryDescriptionType::Estimator:
1260  return "Estimator";
1261  default:
1262  UNREACHABLE();
1263  }
1264  return "";
1265 }
1266 
1267 std::string QueryMemoryDescriptor::toString() const {
1268  auto str = reductionKey();
1269  str += "\tAllow Multifrag: " + ::toString(allow_multifrag_) + "\n";
1270  str += "\tInterleaved Bins on GPU: " + ::toString(interleaved_bins_on_gpu_) + "\n";
1271  str += "\tBlocks Share Memory: " + ::toString(blocksShareMemory()) + "\n";
1272  str += "\tThreads Share Memory: " + ::toString(threadsShareMemory()) + "\n";
1273  str += "\tUses Fast Group Values: " + ::toString(usesGetGroupValueFast()) + "\n";
1274  str +=
1275  "\tLazy Init Groups (GPU): " + ::toString(lazyInitGroups(ExecutorDeviceType::GPU)) +
1276  "\n";
1277  str += "\tEntry Count: " + std::to_string(entry_count_) + "\n";
1278  str += "\tMin Val (perfect hash only): " + std::to_string(min_val_) + "\n";
1279  str += "\tMax Val (perfect hash only): " + std::to_string(max_val_) + "\n";
1280  str += "\tBucket Val (perfect hash only): " + std::to_string(bucket_) + "\n";
1281  str += "\tSort on GPU: " + ::toString(sort_on_gpu_) + "\n";
1282  str += "\tUse Streaming Top N: " + ::toString(use_streaming_top_n_) + "\n";
1283  str += "\tOutput Columnar: " + ::toString(output_columnar_) + "\n";
1284  auto const allow_lazy_fetch = executor_->plan_state_
1285  ? executor_->plan_state_->allow_lazy_fetch_
1286  : g_enable_lazy_fetch;
1287  str += "\tAllow Lazy Fetch: " + ::toString(allow_lazy_fetch) + "\n";
1288  str += "\tRender Output: " + ::toString(render_output_) + "\n";
1289  str += "\tUse Baseline Sort: " + ::toString(must_use_baseline_sort_) + "\n";
1290  return str;
1291 }
1292 
1293 std::string QueryMemoryDescriptor::reductionKey() const {
1294  std::string str;
1295  str += "Query Memory Descriptor State\n";
1296  str += "\tQuery Type: " + queryDescTypeToString() + "\n";
1297  str +=
1298  "\tKeyless Hash: " + ::toString(keyless_hash_) +
1299  (keyless_hash_ ? ", target index for key: " + std::to_string(getTargetIdxForKey())
1300  : "") +
1301  "\n";
1302  str += "\tEffective key width: " + std::to_string(getEffectiveKeyWidth()) + "\n";
1303  str += "\tNumber of group columns: " + std::to_string(getGroupbyColCount()) + "\n";
1304  const auto group_indices_size = targetGroupbyIndicesSize();
1305  if (group_indices_size) {
1306  std::vector<std::string> group_indices_strings;
1307  for (size_t target_idx = 0; target_idx < group_indices_size; ++target_idx) {
1308  group_indices_strings.push_back(std::to_string(getTargetGroupbyIndex(target_idx)));
1309  }
1310  str += "\tTarget group by indices: " +
1311  boost::algorithm::join(group_indices_strings, ",") + "\n";
1312  }
1313  str += "\t" + col_slot_context_.toString();
1314  return str;
1315 }
1316 
1317 std::vector<TargetInfo> target_exprs_to_infos(
1318  const std::vector<Analyzer::Expr*>& targets,
1319  const QueryMemoryDescriptor& query_mem_desc) {
1320  std::vector<TargetInfo> target_infos;
1321  size_t index = 0;
1322  for (const auto target_expr : targets) {
1323  auto target = get_target_info(target_expr, g_bigint_count);
1324  if (query_mem_desc.getQueryDescriptionType() ==
1325  QueryDescriptionType::NonGroupedAggregate) {
1326  set_notnull(target, false);
1327  target.sql_type.set_notnull(false);
1328  }
1329  if (target.sql_type.supportsFlatBuffer()) {
1330  target.sql_type.setUsesFlatBuffer(
1331  query_mem_desc.checkSlotUsesFlatBufferFormat(index));
1332  }
1333  target_infos.push_back(target);
1334  index++;
1335  }
1336  return target_infos;
1337 }
1338 
1339 std::optional<size_t> QueryMemoryDescriptor::varlenOutputBufferElemSize() const {
1340  int64_t buffer_element_size{0};
1341  for (size_t i = 0; i < col_slot_context_.getSlotCount(); i++) {
1342  try {
1343  const auto slot_element_size = col_slot_context_.varlenOutputElementSize(i);
1344  if (slot_element_size < 0) {
1345  return std::nullopt;
1346  }
1347  buffer_element_size += slot_element_size;
1348  } catch (...) {
1349  continue;
1350  }
1351  }
1352  return buffer_element_size;
1353 }
1354 
1355 size_t QueryMemoryDescriptor::varlenOutputRowSizeToSlot(const size_t slot_idx) const {
1356  int64_t buffer_element_size{0};
1357  CHECK_LT(slot_idx, col_slot_context_.getSlotCount());
1358  for (size_t i = 0; i < slot_idx; i++) {
1359  try {
1360  const auto slot_element_size = col_slot_context_.varlenOutputElementSize(i);
1361  if (slot_element_size < 0) {
1362  continue;
1363  }
1364  buffer_element_size += slot_element_size;
1365  } catch (...) {
1366  continue;
1367  }
1368  }
1369  return buffer_element_size;
1370 }
1371 
1372 std::optional<size_t> QueryMemoryDescriptor::getMaxPerDeviceCardinality(
1373  const RelAlgExecutionUnit& ra_exe_unit) const {
1374  auto& pdc = ra_exe_unit.per_device_cardinality;
1375  auto by_cardinality = [](auto& a, auto& b) { return a.second < b.second; };
1376  auto itr = std::max_element(pdc.begin(), pdc.end(), by_cardinality);
1377  if (itr != pdc.end() && itr->second > 0) {
1378  return itr->second;
1379  }
1380  return std::nullopt;
1381 }
1382 
1383 bool QueryMemoryDescriptor::canUsePerDeviceCardinality(
1384  const RelAlgExecutionUnit& ra_exe_unit) const {
1385  // union-query needs to consider the "SUM" of each subquery's result
1387  !ra_exe_unit.target_exprs_union.empty()) {
1388  return false;
1389  }
1390  auto is_left_join = [](auto& join_qual) { return join_qual.type == JoinType::LEFT; };
1391  auto& join_quals = ra_exe_unit.join_quals;
1392  return !std::any_of(join_quals.begin(), join_quals.end(), is_left_join);
1393 }