OmniSciDB  a5dc49c757
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern size_t g_approx_quantile_buffer;
58 extern size_t g_approx_quantile_centroids;
59 extern int64_t g_bitmap_memory_limit;
61 extern size_t g_leaf_count;
62 
63 bool ColRangeInfo::isEmpty() const {
64  return min == 0 && max == -1;
65 }
66 
67 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
68  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
69  << " max = " << info.max << " bucket = " << info.bucket
70  << " has_nulls = " << info.has_nulls << "\n";
71  return out;
72 }
73 
74 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
75  switch (type) {
76  case CountDistinctImplType::Invalid:
77  out << "Invalid";
78  break;
79  case CountDistinctImplType::Bitmap:
80  out << "Bitmap";
81  break;
82  case CountDistinctImplType::UnorderedSet:
83  out << "UnorderedSet";
84  break;
85  default:
86  out << "<Unknown Type>";
87  break;
88  }
89  return out;
90 }
91 
92 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
93  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
94  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
95  << " bool approximate = " << desc.approximate
96  << " device_type = " << desc.device_type
97  << " sub_bitmap_count = " << desc.sub_bitmap_count;
98  return out;
99 }
100 
101 namespace {
102 
103 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
104  int32_t agg_count{0};
105  for (auto target_expr : target_exprs) {
106  CHECK(target_expr);
107  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
108  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
109  const auto& ti = target_expr->get_type_info();
110  if (ti.is_buffer()) {
111  agg_count += 2;
112  } else if (ti.is_geometry()) {
113  agg_count += ti.get_physical_coord_cols() * 2;
114  } else {
115  ++agg_count;
116  }
117  continue;
118  }
119  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
120  agg_count += 2;
121  } else {
122  ++agg_count;
123  }
124  }
125  return agg_count;
126 }
127 
128 bool expr_is_rowid(const Analyzer::Expr* expr) {
129  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
130  if (!col) {
131  return false;
132  }
133  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
134  if (!cd || !cd->isVirtualCol) {
135  return false;
136  }
137  CHECK_EQ("rowid", cd->columnName);
138  return true;
139 }
140 
141 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
142  for (const auto& target_expr : ra_exe_unit.target_exprs) {
143  const auto agg_info = get_target_info(target_expr, g_bigint_count);
144  if (agg_info.is_agg && is_distinct_target(agg_info)) {
145  return true;
146  }
147  }
148  return false;
149 }
150 
151 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
152  const int64_t max_entry_count) {
153  try {
154  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
155  checked_int64_t(col_range_info.min)) >= max_entry_count;
156  } catch (...) {
157  return true;
158  }
159 }
160 
161 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
162  const ColRangeInfo& col_range_info) {
163  try {
164  // the cardinality estimate is the size of the baseline hash table. further penalize
165  // the baseline hash table by a factor of 2x due to overhead in computing baseline
166  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
167  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
168  // count of the column, we use baseline hash on the filtered set
169  return checked_int64_t(cardinality_estimate) * 2 <
170  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
171  checked_int64_t(col_range_info.min));
172  } catch (...) {
173  return false;
174  }
175 }
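// Example: with col_range_info {min = 0, max = 1000000} and a filtered cardinality estimate of
// 200000, 2 * 200000 = 400000 < 1000000, so the filtered set is considered small enough
// relative to the column range to prefer baseline hash on the filtered data.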
176 
177 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
178  const std::vector<InputTableInfo>& query_infos,
179  const Analyzer::Expr* expr,
180  Executor* executor) {
181  if (!expr) {
182  return {QueryDescriptionType::Projection, 0, 0, 0, false};
183  }
184 
185  const auto expr_range = getExpressionRange(
186  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
187  switch (expr_range.getType()) {
188  case ExpressionRangeType::Integer: {
189  if (expr_range.getIntMin() > expr_range.getIntMax()) {
190  return {
191  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
192  }
193  return {QueryDescriptionType::GroupByPerfectHash,
194  expr_range.getIntMin(),
195  expr_range.getIntMax(),
196  expr_range.getBucket(),
197  expr_range.hasNulls()};
198  }
199  case ExpressionRangeType::Float:
200  case ExpressionRangeType::Double: {
201  if (expr_range.getFpMin() > expr_range.getFpMax()) {
202  return {
203  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
204  }
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  }
207  case ExpressionRangeType::Invalid:
208  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
209  default:
210  CHECK(false);
211  }
212  CHECK(false);
213  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
214 }
215 
216 } // namespace
217 
218 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
219  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
220  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
221  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
222  // can expect this to be true anyway for grouped queries since the precise version
223  // uses significantly more memory.
224  const int64_t baseline_threshold =
226  // `group_cardinality_estimation_` is set as the result of the (NDV) cardinality estimator
227  auto group_cardinality_estimation = group_cardinality_estimation_.value_or(0);
228  if (ra_exe_unit_.groupby_exprs.size() != 1) {
229  try {
230  checked_int64_t cardinality{1};
231  bool has_nulls{false};
232  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
233  auto col_range_info = get_expr_range_info(
234  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
235  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
236  // fall back to baseline hash if a non-integer type is encountered
237  return {QueryDescriptionType::GroupByBaselineHash,
238  0,
239  group_cardinality_estimation,
240  0,
241  false};
242  }
243  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
244  CHECK_GE(crt_col_cardinality, 0);
245  cardinality *= crt_col_cardinality;
246  if (col_range_info.has_nulls) {
247  has_nulls = true;
248  }
249  }
250  // For zero or high cardinalities, use baseline layout.
251  if (!cardinality || cardinality > baseline_threshold) {
252  return {QueryDescriptionType::GroupByBaselineHash,
253  0,
254  group_cardinality_estimation,
255  0,
256  false};
257  }
258  // TODO(yoonmin): should we consider min(group_cardinality_estimation,
259  // cardinality) if we have `group_cardinality_estimation` value?
260  return {QueryDescriptionType::GroupByPerfectHash,
261  0,
262  int64_t(cardinality),
263  0,
264  has_nulls};
265  } catch (...) { // overflow when computing cardinality
266  return {QueryDescriptionType::GroupByBaselineHash,
267  0,
268  group_cardinality_estimation,
269  0,
270  false};
271  }
272  }
273  // For single column groupby on high timestamps, force baseline hash due to wide ranges
274  // we are likely to encounter when applying quals to the expression range
275  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
276  // the range is small enough
277  if (ra_exe_unit_.groupby_exprs.front() &&
278  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
279  ra_exe_unit_.simple_quals.size() > 0) {
280  return {QueryDescriptionType::GroupByBaselineHash,
281  0,
282  group_cardinality_estimation,
283  0,
284  false};
285  }
286  const auto col_range_info = get_expr_range_info(
287  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
288  if (!ra_exe_unit_.groupby_exprs.front()) {
289  return col_range_info;
290  }
291  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
292  const int64_t col_count =
293  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
294  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
295  if (has_count_distinct(ra_exe_unit_)) {
296  max_entry_count = std::min(max_entry_count, baseline_threshold);
297  }
298  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
299  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
300  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
301 
302  const bool has_filters =
303  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
304  if (has_filters &&
305  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
306  // if filters are present, we can use the filter to narrow the cardinality of the
307  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
308  // off attempting perfect hash (since we know the range will be made of
309  // monotonically increasing numbers from min to max for dictionary encoded strings)
310  // and failing later due to excessive memory use.
311  // Check the conditions where baseline hash can provide a performance increase and
312  // return baseline hash (potentially forcing an estimator query) as the range type.
313  // Otherwise, return col_range_info which will likely be perfect hash, though could
314  // be baseline from a previous call of this function prior to the estimator query.
315  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
316  // TODO(adb): allow some sorts to pass through this block by centralizing sort
317  // algorithm decision making
318  if (has_count_distinct(ra_exe_unit_)) {
319  // always use baseline hash for column range too big for perfect hash with count
320  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
321  // hash group by in this case.
322  return {QueryDescriptionType::GroupByBaselineHash,
323  col_range_info.min,
324  col_range_info.max,
325  0,
326  col_range_info.has_nulls};
327  } else {
328  // use original col range for sort
329  return col_range_info;
330  }
331  }
332  // if filters are present and the filtered range is less than the cardinality of
333  // the column, consider baseline hash
334  if (group_cardinality_estimation_ &&
335  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
336  col_range_info)) {
337  return {QueryDescriptionType::GroupByBaselineHash,
338  col_range_info.min,
339  col_range_info.max,
340  0,
341  col_range_info.has_nulls};
342  }
343  }
344  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get())) &&
345  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
346  !col_range_info.bucket) {
347  return {QueryDescriptionType::GroupByBaselineHash,
348  col_range_info.min,
349  col_range_info.max,
350  0,
351  col_range_info.has_nulls};
352  }
353  return col_range_info;
354 }
355 
356 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
357  checked_int64_t crt_col_cardinality =
358  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
359  if (col_range_info.bucket) {
360  crt_col_cardinality /= col_range_info.bucket;
361  }
362  return static_cast<int64_t>(crt_col_cardinality +
363  (1 + (col_range_info.has_nulls ? 1 : 0)));
364 }
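// Example: for col_range_info {min = 0, max = 99, bucket = 10, has_nulls = true} this returns
// (99 - 0) / 10 = 9 (integer division), plus 1 because the range is inclusive and 1 more for
// the null key, i.e. 11 entries.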
365 
366 namespace {
367 // Like getBucketedCardinality() without counting nulls.
368 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
369  if (col_range_info.min <= col_range_info.max) {
370  size_t size = col_range_info.max - col_range_info.min;
371  if (col_range_info.bucket) {
372  size /= col_range_info.bucket;
373  }
374  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
375  // try to use unordered_set instead of crashing due to CHECK failure
376  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
377  return 0;
378  }
379  return static_cast<int64_t>(size + 1);
380  } else {
381  return 0;
382  }
383 }
384 } // namespace
385 
386 #define LL_CONTEXT executor_->cgen_state_->context_
387 #define LL_BUILDER executor_->cgen_state_->ir_builder_
388 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
389 #define LL_INT(v) executor_->cgen_state_->llInt(v)
390 #define LL_FP(v) executor_->cgen_state_->llFp(v)
391 #define ROW_FUNC executor_->cgen_state_->row_func_
392 #define CUR_FUNC executor_->cgen_state_->current_func_
393 
394 GroupByAndAggregate::GroupByAndAggregate(
395  Executor* executor,
396  const ExecutorDeviceType device_type,
397  const RelAlgExecutionUnit& ra_exe_unit,
398  const std::vector<InputTableInfo>& query_infos,
399  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
400  const std::optional<int64_t>& group_cardinality_estimation)
401  : executor_(executor)
402  , ra_exe_unit_(ra_exe_unit)
403  , query_infos_(query_infos)
404  , row_set_mem_owner_(row_set_mem_owner)
405  , device_type_(device_type)
406  , group_cardinality_estimation_(group_cardinality_estimation) {
407  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
408  if (!groupby_expr) {
409  continue;
410  }
411  const auto& groupby_ti = groupby_expr->get_type_info();
412  if (groupby_ti.is_text_encoding_none()) {
413  throw std::runtime_error(
414  "Cannot group by string columns which are not dictionary encoded.");
415  }
416  if (groupby_ti.is_buffer()) {
417  throw std::runtime_error("Group by buffer not supported");
418  }
419  if (groupby_ti.is_geometry()) {
420  throw std::runtime_error("Group by geometry not supported");
421  }
422  }
423 }
424 
425 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
426  const size_t shard_count) const {
427  size_t device_count{0};
428  if (device_type_ == ExecutorDeviceType::GPU) {
429  device_count = executor_->cudaMgr()->getDeviceCount();
430  CHECK_GT(device_count, 0u);
431  }
432 
433  int64_t bucket{col_range_info.bucket};
434 
435  if (shard_count) {
436  CHECK(!col_range_info.bucket);
437  /*
438  when a node has fewer devices than shard count,
439  a) In a distributed setup, the minimum distance between two keys would be
440  device_count because shards are stored consecutively across the physical tables,
441  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
442  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
443  node has only 1 device, then all of that node's keys are loaded onto its single
444  device.
445 
446  b) In a single node setup, the distance would be the minimum of device_count or the
447  difference shard_count - device_count. For example: If a single node server
448  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
449  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
450  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
451  of device_count or difference.
452 
453  When a node has device count equal to or more than shard count then the
454  minimum distance is always at least shard_count * number of leaf nodes.
455  */
456  if (device_count < shard_count) {
457  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
458  : std::min(device_count, shard_count - device_count);
459  } else {
460  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
461  }
462  }
463 
464  return bucket;
465 }
466 
467 namespace {
468 
478 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
479  const std::vector<InputTableInfo>& query_infos,
480  const bool is_group_by,
481  Executor* executor) {
482  bool keyless{true}, found{false};
483  int32_t num_agg_expr{0};
484  int32_t index{0};
485  for (const auto target_expr : ra_exe_unit.target_exprs) {
486  const auto agg_info = get_target_info(target_expr, g_bigint_count);
487  const auto chosen_type = get_compact_type(agg_info);
488  if (agg_info.is_agg) {
489  num_agg_expr++;
490  }
491  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
492  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
493  CHECK(agg_expr);
494  const auto arg_expr = agg_arg(target_expr);
495  const bool float_argument_input = takes_float_argument(agg_info);
496  switch (agg_info.agg_kind) {
497  case kAVG:
498  ++index;
499  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
500  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
501  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
502  expr_range_info.hasNulls()) {
503  break;
504  }
505  }
506  found = true;
507  break;
508  case kCOUNT:
509  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
510  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
511  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
512  expr_range_info.hasNulls()) {
513  break;
514  }
515  }
516  found = true;
517  break;
518  case kSUM: {
519  auto arg_ti = arg_expr->get_type_info();
520  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
521  arg_ti.set_notnull(true);
522  }
523  if (!arg_ti.get_notnull()) {
524  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
525  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
526  !expr_range_info.hasNulls()) {
527  found = true;
528  }
529  } else {
530  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
531  switch (expr_range_info.getType()) {
532  case ExpressionRangeType::Float:
533  case ExpressionRangeType::Double:
534  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
535  found = true;
536  }
537  break;
538  case ExpressionRangeType::Integer:
539  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
540  found = true;
541  }
542  break;
543  default:
544  break;
545  }
546  }
547  break;
548  }
549  case kMIN: {
550  CHECK(agg_expr && agg_expr->get_arg());
551  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
552  if (arg_ti.is_string() || arg_ti.is_buffer()) {
553  break;
554  }
555  auto expr_range_info =
556  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
557  auto init_max = get_agg_initial_val(agg_info.agg_kind,
558  chosen_type,
559  is_group_by || float_argument_input,
560  float_argument_input ? sizeof(float) : 8);
561  switch (expr_range_info.getType()) {
562  case ExpressionRangeType::Float:
563  case ExpressionRangeType::Double: {
564  auto double_max =
565  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
566  if (expr_range_info.getFpMax() < double_max) {
567  found = true;
568  }
569  break;
570  }
571  case ExpressionRangeType::Integer:
572  if (expr_range_info.getIntMax() < init_max) {
573  found = true;
574  }
575  break;
576  default:
577  break;
578  }
579  break;
580  }
581  case kMAX: {
582  CHECK(agg_expr && agg_expr->get_arg());
583  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
584  if (arg_ti.is_string() || arg_ti.is_buffer()) {
585  break;
586  }
587  auto expr_range_info =
588  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
589  // NULL sentinel and init value for kMAX are identical, which results in
590  // ambiguity in detecting empty keys in presence of nulls.
591  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
592  expr_range_info.hasNulls()) {
593  break;
594  }
595  auto init_min = get_agg_initial_val(agg_info.agg_kind,
596  chosen_type,
597  is_group_by || float_argument_input,
598  float_argument_input ? sizeof(float) : 8);
599  switch (expr_range_info.getType()) {
600  case ExpressionRangeType::Float:
601  case ExpressionRangeType::Double: {
602  auto double_min =
603  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
604  if (expr_range_info.getFpMin() > double_min) {
605  found = true;
606  }
607  break;
608  }
609  case ExpressionRangeType::Integer:
610  if (expr_range_info.getIntMin() > init_min) {
611  found = true;
612  }
613  break;
614  default:
615  break;
616  }
617  break;
618  }
619  default:
620  keyless = false;
621  break;
622  }
623  }
624  if (!keyless) {
625  break;
626  }
627  if (!found) {
628  ++index;
629  }
630  }
631 
632  // shouldn't use keyless for projection only
633  return {
634  keyless && found,
635  index,
636  };
637 }
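// Note: in the "keyless" perfect-hash layout the group key itself is never materialized; the
// bin position encodes it, and the aggregate column selected above (index) doubles as the
// empty-bin marker. That is why each case only accepts an aggregate whose initial value
// provably cannot collide with a real aggregate value for the given expression range.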
638 
639 CountDistinctDescriptors init_count_distinct_descriptors(
640  const RelAlgExecutionUnit& ra_exe_unit,
641  const std::vector<InputTableInfo>& query_infos,
642  const ColRangeInfo& group_by_range_info,
643  const ExecutorDeviceType device_type,
644  Executor* executor) {
645  CountDistinctDescriptors count_distinct_descriptors;
646  auto compute_bytes_per_group =
647  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
648  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
649  const auto padded_size =
650  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
651  ? align_to_int64(effective_size_bytes)
652  : effective_size_bytes;
653  return padded_size * sub_bitmap_count;
654  };
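// Example: a 1,000,000-bit count-distinct bitmap occupies (1,000,000 + 7) / 8 = 125,000 bytes;
// on GPU (or whenever sub_bitmap_count > 1) that size is rounded up to a multiple of 8 bytes
// and then multiplied by sub_bitmap_count to get the per-group footprint.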
655  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
656  const auto target_expr = ra_exe_unit.target_exprs[i];
657  auto agg_info = get_target_info(target_expr, g_bigint_count);
658  if (is_distinct_target(agg_info)) {
659  CHECK(agg_info.is_agg);
660  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
661  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
662  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
663  if (arg_ti.is_text_encoding_none()) {
664  throw std::runtime_error(
665  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
666  }
667  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
668  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
669  }
670  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
671  throw std::runtime_error(
672  "APPROX_COUNT_DISTINCT on geometry columns not supported");
673  }
674  if (agg_info.is_distinct && arg_ti.is_geometry()) {
675  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
676  }
677  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
678  auto arg_range_info =
679  arg_ti.is_fp() ? no_range_info
680  : get_expr_range_info(
681  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
682  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
683  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
684  const auto& original_target_expr_ti = it->second;
685  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
686  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
687  // manually encode the col range of date col if necessary
688  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
689  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
690  if (original_target_expr_ti.get_comp_param() == 16) {
691  return date_val < INT16_MIN || date_val > INT16_MAX;
692  } else {
693  return date_val < INT32_MIN || date_val > INT32_MAX;
694  }
695  };
696  if (is_date_value_not_encoded(arg_range_info.min)) {
697  // chunk metadata of the date column contains decoded value
698  // so we manually encode it again here to represent its column range correctly
699  arg_range_info.min =
700  DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
701  }
702  if (is_date_value_not_encoded(arg_range_info.max)) {
703  arg_range_info.max =
704  DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
705  }
706  // now we manually encode the value, so we need to invalidate bucket value
707  // i.e., 86400 -> 0, to correctly calculate the size of the bitmap
708  arg_range_info.bucket = 0;
709  }
710  }
711 
712  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
713  int64_t bitmap_sz_bits{0};
714  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
715  const auto error_rate_expr = agg_expr->get_arg1();
716  if (error_rate_expr) {
717  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
718  auto const error_rate =
719  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
720  CHECK(error_rate);
721  CHECK_GE(error_rate->get_constval().intval, 1);
722  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
723  } else {
724  bitmap_sz_bits = g_hll_precision_bits;
725  }
726  }
727  if (arg_range_info.isEmpty()) {
728  count_distinct_descriptors.emplace_back(
729  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
730  0,
731  arg_range_info.bucket,
732  64,
733  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
734  device_type,
735  1});
736  continue;
737  }
738  const auto sub_bitmap_count =
739  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
740  size_t worst_case_num_groups{1};
741  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
742  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
743  // implementation for arrays
744  count_distinct_impl_type = CountDistinctImplType::Bitmap;
745  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
746  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
747  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
748  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
749  }
750  // check a potential OOM when using bitmap-based approach
751  const auto total_bytes_per_entry =
752  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
753  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
754  const auto maximum_num_groups =
755  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
756  const auto total_bitmap_bytes_for_groups =
757  total_bytes_per_entry * maximum_num_groups;
758  // we can estimate a potential OOM of bitmap-based count-distinct operator
759  // by using the logic "check_total_bitmap_memory"
760  if (total_bitmap_bytes_for_groups >=
761  static_cast<size_t>(g_bitmap_memory_limit)) {
762  const auto agg_expr_max_entry_count =
763  arg_range_info.max - arg_range_info.min + 1;
764  int64_t max_agg_expr_table_cardinality{1};
765  std::set<const Analyzer::ColumnVar*,
766  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
767  colvar_set(Analyzer::ColumnVar::colvar_comp);
768  agg_expr->collect_column_var(colvar_set, true);
769  for (const auto cv : colvar_set) {
770  auto it =
771  std::find_if(query_infos.begin(),
772  query_infos.end(),
773  [&](const auto& input_table_info) {
774  return input_table_info.table_key == cv->getTableKey();
775  });
776  int64_t cur_table_cardinality =
777  it != query_infos.end()
778  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
779  : -1;
780  max_agg_expr_table_cardinality =
781  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
782  worst_case_num_groups *= cur_table_cardinality;
783  }
784  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
785  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
786  };
787  // if we have valid stats regarding input expr, we can try to relax the OOM
788  if (has_valid_stat()) {
789  // a threshold related to a ratio of a range of agg expr (let's say R)
790  // and table cardinality (C), i.e., use unordered_set if the # bits to build
791  // a bitmap based on R is four times larger than that of C
792  const size_t unordered_set_threshold{2};
793  // When we detect OOM of bitmap-based approach we selectively switch it to
794  // hash set-based processing logic if one of the following is satisfied:
795  // 1) the column range is too wide compared with the table cardinality, or
796  // 2) the column range is too wide compared with the avg of # unique values
797  // per group by entry
798  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
799  const auto bits_for_agg_table =
800  std::ceil(log(max_agg_expr_table_cardinality));
801  const auto avg_num_unique_entries_per_group =
802  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
803  // case a) given the entry-count range of agg_expr and the maximum
804  // cardinality among the source tables of agg_expr, we try to detect the
805  // misleading case of a too-sparse column range, i.e., agg_expr has a 1M column
806  // range but only two tuples {1 and 1M} / case b) check whether
807  // using a bitmap is really beneficial when considering a uniform distribution
808  // of (unique) keys.
809  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
810  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
811  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
812  } else {
813  throw std::runtime_error(
814  "Consider using approx_count_distinct operator instead of "
815  "count_distinct operator to lower the memory "
816  "requirements");
817  }
818  }
819  }
820  }
821  }
822  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
823  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
824  !(arg_ti.is_array() || arg_ti.is_geometry())) {
825  count_distinct_impl_type = CountDistinctImplType::Bitmap;
826  }
827  const size_t too_many_entries{100000000};
828  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
829  worst_case_num_groups > too_many_entries &&
830  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
831  throw WatchdogException(
832  "Detect too many input entries for set-based count distinct operator under "
833  "the watchdog");
834  }
835  count_distinct_descriptors.emplace_back(
836  CountDistinctDescriptor{count_distinct_impl_type,
837  arg_range_info.min,
838  arg_range_info.bucket,
839  bitmap_sz_bits,
840  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
841  device_type,
842  sub_bitmap_count});
843  } else {
844  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
845  CountDistinctImplType::Invalid, 0, 0, 0, false, device_type, 0});
846  }
847  }
848  return count_distinct_descriptors;
849 }
850 
851 } // namespace
852 
853 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
854  const bool allow_multifrag,
855  const size_t max_groups_buffer_entry_count,
856  const int8_t crt_min_byte_width,
857  RenderInfo* render_info,
858  const bool output_columnar_hint) {
859  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
861  : 0;
862  bool sort_on_gpu_hint =
863  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
866  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
867  // but the total output buffer size would be too big or it's a sharded top query.
868  // For the sake of managing risk, use the new result set way very selectively for
869  // this case only (alongside the baseline layout we've enabled for a while now).
870  bool must_use_baseline_sort = shard_count;
871  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
872  while (true) {
873  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
874  max_groups_buffer_entry_count,
875  crt_min_byte_width,
876  sort_on_gpu_hint,
877  render_info,
878  must_use_baseline_sort,
879  output_columnar_hint);
880  CHECK(query_mem_desc);
881  if (query_mem_desc->sortOnGpu() &&
882  (query_mem_desc->getBufferSizeBytes(device_type_) +
883  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
884  2 * 1024 * 1024 * 1024LL) {
885  must_use_baseline_sort = true;
886  sort_on_gpu_hint = false;
887  } else {
888  break;
889  }
890  }
891  return query_mem_desc;
892 }
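// Example of the retry above: with 120M entries and a 16-byte row, the GPU sort would need
// roughly 1.92 GB of output buffer plus align_to_int64(120M * 4) ~ 0.48 GB for the int32 sort
// index, which exceeds the 2 GB cap, so the descriptor is rebuilt with sort_on_gpu_hint = false
// and must_use_baseline_sort = true.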
893 
894 std::vector<ApproxQuantileDescriptor> GroupByAndAggregate::initApproxQuantileDescriptors() {
895  // Count APPROX_QUANTILE targets
896  size_t target_count = 0u;
897  auto count_target = [&](Analyzer::AggExpr const*, size_t) { ++target_count; };
899  if (target_count == 0u) {
900  return {};
901  }
902 
903  // Reserve and fill descriptors
904  std::vector<ApproxQuantileDescriptor> descriptors;
905  descriptors.reserve(target_count);
906  auto add_descriptor = [&](Analyzer::AggExpr const*, size_t) {
907  descriptors.push_back({g_approx_quantile_buffer, g_approx_quantile_centroids});
908  };
910  return descriptors;
911 }
912 
913 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
914  const bool allow_multifrag,
915  const size_t max_groups_buffer_entry_count,
916  const int8_t crt_min_byte_width,
917  const bool sort_on_gpu_hint,
918  RenderInfo* render_info,
919  const bool must_use_baseline_sort,
920  const bool output_columnar_hint) {
921  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
922 
923  const bool threads_can_reuse_group_by_buffers =
924  device_type_ == ExecutorDeviceType::CPU && is_group_by &&
925  ra_exe_unit_.groupby_exprs.front();
926 
927  auto col_range_info_nosharding = getColRangeInfo();
928 
929  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
931  : 0;
932 
933  const auto col_range_info =
934  ColRangeInfo{col_range_info_nosharding.hash_type_,
935  col_range_info_nosharding.min,
936  col_range_info_nosharding.max,
937  getShardedTopBucket(col_range_info_nosharding, shard_count),
938  col_range_info_nosharding.has_nulls};
939 
940  // Non-grouped aggregates do not support accessing aggregated ranges
941  // Keyless hash is currently only supported with single-column perfect hash
942  const auto keyless_info =
943  !(is_group_by &&
944  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
945  ? KeylessInfo{false, -1}
946  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
947 
948  if (g_enable_watchdog &&
949  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
950  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
951  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
952  ra_exe_unit_.groupby_exprs.size() == 1 &&
953  (col_range_info.max - col_range_info.min) /
954  std::max(col_range_info.bucket, int64_t(1)) >
955  130000000))) {
956  throw WatchdogException("Query would use too much memory");
957  }
958 
959  const auto count_distinct_descriptors = init_count_distinct_descriptors(
960  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
961  auto approx_quantile_descriptors = initApproxQuantileDescriptors();
962  try {
963  return QueryMemoryDescriptor::init(executor_,
964  ra_exe_unit_,
965  query_infos_,
966  col_range_info,
967  keyless_info,
968  allow_multifrag,
969  device_type_,
970  crt_min_byte_width,
971  sort_on_gpu_hint,
972  shard_count,
973  max_groups_buffer_entry_count,
974  render_info,
975  approx_quantile_descriptors,
976  count_distinct_descriptors,
977  must_use_baseline_sort,
978  output_columnar_hint,
979  /*streaming_top_n_hint=*/true,
980  threads_can_reuse_group_by_buffers);
981  } catch (const StreamingTopNOOM& e) {
982  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
983  return QueryMemoryDescriptor::init(executor_,
984  ra_exe_unit_,
985  query_infos_,
986  col_range_info,
987  keyless_info,
988  allow_multifrag,
989  device_type_,
990  crt_min_byte_width,
991  sort_on_gpu_hint,
992  shard_count,
993  max_groups_buffer_entry_count,
994  render_info,
995  approx_quantile_descriptors,
996  count_distinct_descriptors,
997  must_use_baseline_sort,
998  output_columnar_hint,
999  /*streaming_top_n_hint=*/false,
1000  threads_can_reuse_group_by_buffers);
1001  }
1002 }
1003 
1004 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
1005  const std::list<Analyzer::OrderEntry>& order_entries) {
1006  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
1007  return false;
1008  }
1009  for (const auto& order_entry : order_entries) {
1010  CHECK_GE(order_entry.tle_no, 1);
1011  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
1012  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
1013  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
1014  return false;
1015  }
1016  // TODO(alex): relax the restrictions
1017  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
1018  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
1019  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
1020  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
1021  return false;
1022  }
1023  if (agg_expr->get_arg()) {
1024  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
1025  if (arg_ti.is_fp()) {
1026  return false;
1027  }
1028  auto expr_range_info =
1029  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
1030  // TODO(adb): QMD not actually initialized here?
1031  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
1032  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
1033  expr_range_info.has_nulls) &&
1034  order_entry.is_desc == order_entry.nulls_first) {
1035  return false;
1036  }
1037  }
1038  const auto& target_ti = target_expr->get_type_info();
1039  CHECK(!target_ti.is_buffer());
1040  if (!target_ti.is_integer()) {
1041  return false;
1042  }
1043  }
1044  return true;
1045 }
1046 
1047 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
1048  llvm::BasicBlock* sc_false,
1049  QueryMemoryDescriptor& query_mem_desc,
1050  const CompilationOptions& co,
1051  const GpuSharedMemoryContext& gpu_smem_context) {
1052  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1053  CHECK(filter_result);
1054 
1055  bool can_return_error = false;
1056  llvm::BasicBlock* filter_false{nullptr};
1057 
1058  {
1059  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
1060 
1061  if (executor_->isArchMaxwell(co.device_type)) {
1062  prependForceSync();
1063  }
1064  DiamondCodegen filter_cfg(filter_result,
1065  executor_,
1066  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1067  "filter", // filter_true and filter_false basic blocks
1068  nullptr,
1069  false);
1070  filter_false = filter_cfg.cond_false_;
1071 
1072  if (is_group_by) {
1073  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1074  !query_mem_desc.useStreamingTopN()) {
1075  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1076  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1077  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1078  llvm::Value* old_total_matched_val{nullptr};
1079  if (query_mem_desc.threadsShareMemory()) {
1080  old_total_matched_val =
1081  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1082  total_matched_ptr,
1083  LL_INT(int32_t(1)),
1084 #if LLVM_VERSION_MAJOR > 12
1085  LLVM_ALIGN(8),
1086 #endif
1087  llvm::AtomicOrdering::Monotonic);
1088  } else {
1089  old_total_matched_val = LL_BUILDER.CreateLoad(
1090  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1091  LL_BUILDER.CreateStore(
1092  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1093  total_matched_ptr);
1094  }
1095  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1096  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1097  }
1098 
1099  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1100  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1101  if (query_mem_desc.usesGetGroupValueFast() ||
1102  query_mem_desc.getQueryDescriptionType() ==
1103  QueryDescriptionType::GroupByPerfectHash) {
1104  if (query_mem_desc.getGroupbyColCount() > 1) {
1105  filter_cfg.setChainToNext();
1106  }
1107  // Don't generate null checks if the group slot is guaranteed to be non-null,
1108  // as it's the case for get_group_value_fast* family.
1109  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1110  varlen_output_buffer,
1111  {},
1112  query_mem_desc,
1113  co,
1114  gpu_smem_context,
1115  filter_cfg);
1116  } else {
1117  {
1118  llvm::Value* nullcheck_cond{nullptr};
1119  if (query_mem_desc.didOutputColumnar()) {
1120  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1121  LL_INT(int32_t(0)));
1122  } else {
1123  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1124  std::get<0>(agg_out_ptr_w_idx),
1125  llvm::ConstantPointerNull::get(
1126  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1127  }
1128  DiamondCodegen nullcheck_cfg(
1129  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1130  codegenAggCalls(agg_out_ptr_w_idx,
1131  varlen_output_buffer,
1132  {},
1133  query_mem_desc,
1134  co,
1135  gpu_smem_context,
1136  filter_cfg);
1137  }
1138  can_return_error = true;
1139  if (query_mem_desc.getQueryDescriptionType() ==
1140  QueryDescriptionType::Projection &&
1141  query_mem_desc.useStreamingTopN()) {
1142  // Ignore rejection on pushing current row to top-K heap.
1143  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1144  } else {
1145  CodeGenerator code_generator(executor_);
1146  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1147  // TODO(alex): remove the trunc once pos is converted to 32 bits
1148  code_generator.posArg(nullptr),
1149  get_int_type(32, LL_CONTEXT))));
1150  }
1151  }
1152  } else {
1153  if (ra_exe_unit_.estimator) {
1154  std::stack<llvm::BasicBlock*> array_loops;
1155  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1156  } else {
1157  auto arg_it = ROW_FUNC->arg_begin();
1158  std::vector<llvm::Value*> agg_out_vec;
1159  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1160  agg_out_vec.push_back(&*arg_it++);
1161  }
1162  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1163  /*varlen_output_buffer=*/nullptr,
1164  agg_out_vec,
1165  query_mem_desc,
1166  co,
1167  gpu_smem_context,
1168  filter_cfg);
1169  }
1170  }
1171  }
1172 
1173  if (ra_exe_unit_.join_quals.empty()) {
1174  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1175  } else if (sc_false) {
1176  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1177  LL_BUILDER.SetInsertPoint(sc_false);
1178  LL_BUILDER.CreateBr(filter_false);
1179  LL_BUILDER.SetInsertPoint(saved_insert_block);
1180  }
1181 
1182  return can_return_error;
1183 }
1184 
1185 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1186  llvm::Value* groups_buffer,
1187  const QueryMemoryDescriptor& query_mem_desc,
1188  const CompilationOptions& co,
1189  DiamondCodegen& diamond_codegen) {
1190  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1192  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1193  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1194  CHECK(!group_expr);
1195  if (!query_mem_desc.didOutputColumnar()) {
1196  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1197  }
1198  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1199  ? 0
1200  : query_mem_desc.getRowSize() / sizeof(int64_t);
1201  CodeGenerator code_generator(executor_);
1202  if (query_mem_desc.useStreamingTopN()) {
1203  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1204  CHECK_GE(only_order_entry.tle_no, int(1));
1205  const size_t target_idx = only_order_entry.tle_no - 1;
1206  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1207  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1208  const auto chosen_bytes =
1209  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1210  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1211  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1212  const uint32_t n =
1213  ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
1214  std::string fname = "get_bin_from_k_heap";
1215  const auto& oe_ti = order_entry_expr->get_type_info();
1216  llvm::Value* null_key_lv = nullptr;
1217  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1218  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1219  switch (bit_width) {
1220  case 32:
1221  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1222  break;
1223  case 64:
1224  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1225  break;
1226  default:
1227  CHECK(false);
1228  }
1229  fname += "_int" + std::to_string(bit_width) + "_t";
1230  } else {
1231  CHECK(oe_ti.is_fp());
1232  if (order_entry_lv->getType()->isDoubleTy()) {
1233  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1234  } else {
1235  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1236  }
1237  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1238  }
1239  const auto key_slot_idx =
1241  return emitCall(
1242  fname,
1243  {groups_buffer,
1244  LL_INT(n),
1245  LL_INT(row_size_quad),
1246  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1247  LL_BOOL(only_order_entry.is_desc),
1248  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1249  LL_BOOL(only_order_entry.nulls_first),
1250  null_key_lv,
1251  order_entry_lv});
1252  } else {
1253  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1254  const auto output_buffer_entry_count_lv =
1255  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1256  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1257  const auto group_expr_lv =
1258  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1259  std::vector<llvm::Value*> args{groups_buffer,
1260  output_buffer_entry_count_lv,
1261  group_expr_lv,
1262  code_generator.posArg(nullptr)};
1263  if (query_mem_desc.didOutputColumnar()) {
1264  const auto columnar_output_offset =
1265  emitCall("get_columnar_scan_output_offset", args);
1266  return columnar_output_offset;
1267  }
1268  args.push_back(LL_INT(row_size_quad));
1269  return emitCall("get_scan_output_slot", args);
1270  }
1271 }
1272 
1273 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1274  const QueryMemoryDescriptor& query_mem_desc,
1275  const CompilationOptions& co,
1276  DiamondCodegen& diamond_codegen) {
1277  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1278  auto arg_it = ROW_FUNC->arg_begin();
1279  auto groups_buffer = arg_it++;
1280 
1281  std::stack<llvm::BasicBlock*> array_loops;
1282 
1283  // TODO(Saman): move this logic outside of this function.
1284  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1285  if (query_mem_desc.didOutputColumnar()) {
1286  return std::make_tuple(
1287  &*groups_buffer,
1288  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1289  } else {
1290  return std::make_tuple(
1291  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1292  nullptr);
1293  }
1294  }
1295 
1296  CHECK(query_mem_desc.getQueryDescriptionType() ==
1297  QueryDescriptionType::GroupByBaselineHash ||
1298  query_mem_desc.getQueryDescriptionType() ==
1299  QueryDescriptionType::GroupByPerfectHash);
1300 
1301  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1302  ? 0
1303  : query_mem_desc.getRowSize() / sizeof(int64_t);
1304 
1305  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1306  ? sizeof(int64_t)
1307  : query_mem_desc.getEffectiveKeyWidth();
1308  // for multi-column group by
1309  llvm::Value* group_key = nullptr;
1310  llvm::Value* key_size_lv = nullptr;
1311 
1312  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1313  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1314  if (query_mem_desc.getQueryDescriptionType() ==
1315  QueryDescriptionType::GroupByPerfectHash) {
1316  group_key =
1317  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1318  } else if (query_mem_desc.getQueryDescriptionType() ==
1319  QueryDescriptionType::GroupByBaselineHash) {
1320  group_key =
1321  col_width_size == sizeof(int32_t)
1322  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1323  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1324  }
1325  CHECK(group_key);
1326  CHECK(key_size_lv);
1327  }
1328 
1329  int32_t subkey_idx = 0;
1330  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1331  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1332  const auto col_range_info =
1333  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1334  const auto translated_null_value = static_cast<int64_t>(
1335  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1336  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1337  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1338  : checked_int64_t(col_range_info.max) +
1339  (col_range_info.bucket ? col_range_info.bucket : 1));
1340 
1341  const bool col_has_nulls =
1342  query_mem_desc.getQueryDescriptionType() ==
1343  QueryDescriptionType::GroupByPerfectHash
1344  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1345  ? query_mem_desc.hasNulls()
1346  : col_range_info.has_nulls)
1347  : false;
1348 
1349  const auto group_expr_lvs =
1350  executor_->groupByColumnCodegen(group_expr.get(),
1351  col_width_size,
1352  co,
1353  col_has_nulls,
1354  translated_null_value,
1355  diamond_codegen,
1356  array_loops,
1357  query_mem_desc.threadsShareMemory());
1358  const auto group_expr_lv = group_expr_lvs.translated_value;
1359  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1360  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1361  return codegenSingleColumnPerfectHash(query_mem_desc,
1362  co,
1363  &*groups_buffer,
1364  group_expr_lv,
1365  group_expr_lvs.original_value,
1366  row_size_quad);
1367  } else {
1368  // store the sub-key to the buffer
1369  LL_BUILDER.CreateStore(
1370  group_expr_lv,
1371  LL_BUILDER.CreateGEP(
1372  group_key->getType()->getScalarType()->getPointerElementType(),
1373  group_key,
1374  LL_INT(subkey_idx++)));
1375  }
1376  }
1377  if (query_mem_desc.getQueryDescriptionType() ==
1378  QueryDescriptionType::GroupByPerfectHash) {
1379  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1380  return codegenMultiColumnPerfectHash(
1381  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1382  } else if (query_mem_desc.getQueryDescriptionType() ==
1383  QueryDescriptionType::GroupByBaselineHash) {
1384  return codegenMultiColumnBaselineHash(co,
1385  &*groups_buffer,
1386  group_key,
1387  key_size_lv,
1388  query_mem_desc,
1389  col_width_size,
1390  row_size_quad);
1391  }
1392  CHECK(false);
1393  return std::make_tuple(nullptr, nullptr);
1394 }
1395 
1396 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1397  const QueryMemoryDescriptor& query_mem_desc) {
1398  if (!query_mem_desc.hasVarlenOutput()) {
1399  return nullptr;
1400  }
1401 
1402  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1403  auto arg_it = ROW_FUNC->arg_begin();
1404  arg_it++; /* groups_buffer */
1405  auto varlen_output_buffer = arg_it++;
1406  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1407  return varlen_output_buffer;
1408 }
1409 
1410 std::tuple<llvm::Value*, llvm::Value*>
1411 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1412  const QueryMemoryDescriptor& query_mem_desc,
1413  const CompilationOptions& co,
1414  llvm::Value* groups_buffer,
1415  llvm::Value* group_expr_lv_translated,
1416  llvm::Value* group_expr_lv_original,
1417  const int32_t row_size_quad) {
1418  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1419  CHECK(query_mem_desc.usesGetGroupValueFast());
1420  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1421  ? "get_columnar_group_bin_offset"
1422  : "get_group_value_fast"};
1423  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1424  get_group_fn_name += "_keyless";
1425  }
1426  if (query_mem_desc.interleavedBins(co.device_type)) {
1427  CHECK(!query_mem_desc.didOutputColumnar());
1428  CHECK(query_mem_desc.hasKeylessHash());
1429  get_group_fn_name += "_semiprivate";
1430  }
1431  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1432  &*group_expr_lv_translated};
1433  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1434  query_mem_desc.mustUseBaselineSort()) {
1435  get_group_fn_name += "_with_original_key";
1436  get_group_fn_args.push_back(group_expr_lv_original);
1437  }
1438  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1439  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1440  if (!query_mem_desc.hasKeylessHash()) {
1441  if (!query_mem_desc.didOutputColumnar()) {
1442  get_group_fn_args.push_back(LL_INT(row_size_quad));
1443  }
1444  } else {
1445  if (!query_mem_desc.didOutputColumnar()) {
1446  get_group_fn_args.push_back(LL_INT(row_size_quad));
1447  }
1448  if (query_mem_desc.interleavedBins(co.device_type)) {
1449  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1450  get_group_fn_args.push_back(warp_idx);
1451  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1452  }
1453  }
1454  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1455  return std::make_tuple(&*groups_buffer,
1456  emitCall(get_group_fn_name, get_group_fn_args));
1457  }
1458  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1459 }
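// The emitted get_group_value_fast* runtime call computes the output bin directly from the
// translated key, roughly bin = (key - min_val) / bucket, with no probing; the _keyless and
// _semiprivate variants only change how (or whether) the key is stored and how bins are
// interleaved per warp.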
1460 
1461 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1462  llvm::Value* groups_buffer,
1463  llvm::Value* group_key,
1464  llvm::Value* key_size_lv,
1465  const QueryMemoryDescriptor& query_mem_desc,
1466  const int32_t row_size_quad) {
1467  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1468  CHECK(query_mem_desc.getQueryDescriptionType() ==
1469  QueryDescriptionType::GroupByPerfectHash);
1470  // compute the index (perfect hash)
1471  auto perfect_hash_func = codegenPerfectHashFunction();
1472  auto hash_lv =
1473  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1474 
1475  if (query_mem_desc.didOutputColumnar()) {
1476  if (!query_mem_desc.hasKeylessHash()) {
1477  const std::string set_matching_func_name{
1478  "set_matching_group_value_perfect_hash_columnar"};
1479  const std::vector<llvm::Value*> set_matching_func_arg{
1480  groups_buffer,
1481  hash_lv,
1482  group_key,
1483  key_size_lv,
1484  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1485  query_mem_desc.getEntryCount())};
1486  emitCall(set_matching_func_name, set_matching_func_arg);
1487  }
1488  return std::make_tuple(groups_buffer, hash_lv);
1489  } else {
1490  if (query_mem_desc.hasKeylessHash()) {
1491  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1492  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1493  nullptr);
1494  } else {
1495  return std::make_tuple(
1496  emitCall(
1497  "get_matching_group_value_perfect_hash",
1498  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1499  nullptr);
1500  }
1501  }
1502 }
1503 
1504 std::tuple<llvm::Value*, llvm::Value*>
1505 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1506  const CompilationOptions& co,
1507  llvm::Value* groups_buffer,
1508  llvm::Value* group_key,
1509  llvm::Value* key_size_lv,
1510  const QueryMemoryDescriptor& query_mem_desc,
1511  const size_t key_width,
1512  const int32_t row_size_quad) {
1513  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1514  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1515  CHECK(key_width == sizeof(int32_t));
1516  group_key =
1517  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1518  }
1519  std::vector<llvm::Value*> func_args{
1520  groups_buffer,
1521  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1522  &*group_key,
1523  &*key_size_lv,
1524  LL_INT(static_cast<int32_t>(key_width))};
1525  std::string func_name{"get_group_value"};
1526  if (query_mem_desc.didOutputColumnar()) {
1527  func_name += "_columnar_slot";
1528  } else {
1529  func_args.push_back(LL_INT(row_size_quad));
1530  }
1531  if (co.with_dynamic_watchdog) {
1532  func_name += "_with_watchdog";
1533  }
1534  if (query_mem_desc.didOutputColumnar()) {
1535  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1536  } else {
1537  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1538  }
1539 }
1540 
1541 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1542  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1543  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1544  auto ft = llvm::FunctionType::get(
1545  get_int_type(32, LL_CONTEXT),
1546  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1547  false);
1548  auto key_hash_func = llvm::Function::Create(ft,
1549  llvm::Function::ExternalLinkage,
1550  "perfect_key_hash",
1551  executor_->cgen_state_->module_);
1552  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1553  mark_function_always_inline(key_hash_func);
1554  auto& key_buff_arg = *key_hash_func->args().begin();
1555  llvm::Value* key_buff_lv = &key_buff_arg;
1556  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1557  llvm::IRBuilder<> key_hash_func_builder(bb);
1558  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1559  std::vector<int64_t> cardinalities;
1560  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1561  auto col_range_info =
1562  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1563  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1564  cardinalities.push_back(getBucketedCardinality(col_range_info));
1565  }
1566  size_t dim_idx = 0;
1567  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1568  auto* gep = key_hash_func_builder.CreateGEP(
1569  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1570  key_buff_lv,
1571  LL_INT(dim_idx));
1572  auto key_comp_lv =
1573  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1574  auto col_range_info =
1575  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1576  auto crt_term_lv =
1577  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1578  if (col_range_info.bucket) {
1579  crt_term_lv =
1580  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1581  }
1582  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1583  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1584  LL_INT(cardinalities[prev_dim_idx]));
1585  }
1586  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1587  ++dim_idx;
1588  }
1589  key_hash_func_builder.CreateRet(
1590  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1591  return key_hash_func;
1592 }
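// The generated "perfect_key_hash" helper folds the key into a single slot
// index: each component is shifted to (key[i] - min[i]) / bucket[i] and then
// scaled by the product of the bucketed cardinalities of all previous
// dimensions, so the first group-by column varies fastest. As a hypothetical
// example, with two columns of cardinality 10 (values 10..19) and 4
// (values 0..3), the key (12, 2) maps to (12 - 10) + 2 * 10 = 22.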
1593 
1594 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1595  const TargetInfo& agg_info,
1596  llvm::Value* target) {
1597  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1598  const auto& agg_type = agg_info.sql_type;
1599  const size_t chosen_bytes = agg_type.get_size();
1600 
1601  bool need_conversion{false};
1602  llvm::Value* arg_null{nullptr};
1603  llvm::Value* agg_null{nullptr};
1604  llvm::Value* target_to_cast{target};
1605  if (arg_type.is_fp()) {
1606  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1607  if (agg_type.is_fp()) {
1608  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1609  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1610  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1611  need_conversion = true;
1612  }
1613  } else {
1614  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1615  return target;
1616  }
1617  } else {
1618  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1619  if (agg_type.is_fp()) {
1620  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1621  need_conversion = true;
1622  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1623  } else {
1624  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1625  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1626  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1627  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1628  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1629  need_conversion = true;
1630  }
1631  }
1632  }
1633  if (need_conversion) {
1634  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1635  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1636  return LL_BUILDER.CreateSelect(
1637  cmp,
1638  agg_null,
1639  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1640  } else {
1641  return target;
1642  }
1643 }
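// convertNullIfAny remaps the argument's inline null sentinel to the
// aggregate slot's sentinel whenever the two differ (different bit width,
// different value, or an integer argument feeding a floating-point slot).
// For example, a SMALLINT argument folded into a 64-bit slot uses a
// different inline null, so the emitted select substitutes the slot-width
// null when the sentinel matches and otherwise widens the value to
// chosen_bytes * 8 bits.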
1644 
1645 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1646  const Analyzer::WindowFunction* window_func,
1647  const QueryMemoryDescriptor& query_mem_desc,
1648  const CompilationOptions& co,
1649  DiamondCodegen& diamond_codegen) {
1650  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1651  const auto window_func_context =
1652  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1653  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1654  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1655  ? 0
1656  : query_mem_desc.getRowSize() / sizeof(int64_t);
1657  auto arg_it = ROW_FUNC->arg_begin();
1658  auto groups_buffer = arg_it++;
1659  CodeGenerator code_generator(executor_);
1660  auto window_pos_lv = code_generator.codegenWindowPosition(
1661  window_func_context, code_generator.posArg(nullptr));
1662  const auto pos_in_window =
1663  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1664  llvm::Value* entry_count_lv =
1665  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1666  std::vector<llvm::Value*> args{
1667  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1668  if (query_mem_desc.didOutputColumnar()) {
1669  const auto columnar_output_offset =
1670  emitCall("get_columnar_scan_output_offset", args);
1671  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1672  }
1673  args.push_back(LL_INT(row_size_quad));
1674  return emitCall("get_scan_output_slot", args);
1675  }
1676  auto arg_it = ROW_FUNC->arg_begin();
1677  auto groups_buffer = arg_it++;
1678  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1679 }
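// For aggregate window functions the output row pointer is resolved through
// the active window-function context: the window position from
// codegenWindowPosition() is truncated to 32 bits and passed to
// get_scan_output_slot (or get_columnar_scan_output_offset for columnar
// output); non-aggregate window functions fall through to the ordinary
// codegenOutputSlot() path.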
1680 
1681 bool GroupByAndAggregate::codegenAggCalls(
1682  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1683  llvm::Value* varlen_output_buffer,
1684  const std::vector<llvm::Value*>& agg_out_vec,
1685  QueryMemoryDescriptor& query_mem_desc,
1686  const CompilationOptions& co,
1687  const GpuSharedMemoryContext& gpu_smem_context,
1688  DiamondCodegen& diamond_codegen) {
1689  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1690  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1691  // TODO(alex): unify the two cases, the output for non-group by queries
1692  // should be a contiguous buffer
1693  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1694  bool can_return_error = false;
1695  if (is_group_by) {
1696  CHECK(agg_out_vec.empty());
1697  } else {
1698  CHECK(!agg_out_vec.empty());
1699  }
1700 
1701  // output buffer is casted into a byte stream to be able to handle data elements of
1702  // different sizes (only used when actual column width sizes are used)
1703  llvm::Value* output_buffer_byte_stream{nullptr};
1704  llvm::Value* out_row_idx{nullptr};
1705  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1706  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1707  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1708  std::get<0>(agg_out_ptr_w_idx),
1709  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1710  output_buffer_byte_stream->setName("out_buff_b_stream");
1711  CHECK(std::get<1>(agg_out_ptr_w_idx));
1712  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1713  llvm::Type::getInt64Ty(LL_CONTEXT));
1714  out_row_idx->setName("out_row_idx");
1715  }
1716 
1717  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1718  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1719  ++target_idx) {
1720  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1721  CHECK(target_expr);
1722 
1723  target_builder(target_expr, executor_, query_mem_desc, co);
1724  }
1725 
1726  target_builder.codegen(this,
1727  executor_,
1728  query_mem_desc,
1729  co,
1730  gpu_smem_context,
1731  agg_out_ptr_w_idx,
1732  agg_out_vec,
1733  output_buffer_byte_stream,
1734  out_row_idx,
1735  varlen_output_buffer,
1736  diamond_codegen);
1737 
1738  return can_return_error;
1739 }
1740 
1741 /**
1742  * @brief: returns the pointer to where the aggregation should be stored.
1743  */
1744 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1745  llvm::Value* output_buffer_byte_stream,
1746  llvm::Value* out_row_idx,
1747  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1748  const QueryMemoryDescriptor& query_mem_desc,
1749  const size_t chosen_bytes,
1750  const size_t agg_out_off,
1751  const size_t target_idx) {
1752  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1753  llvm::Value* agg_col_ptr{nullptr};
1754  if (query_mem_desc.didOutputColumnar()) {
1755  // TODO(Saman): remove the second columnar branch, and support all query description
1756  // types through the first branch. Then, input arguments should also be cleaned up
1757  if (!g_cluster &&
1758  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1759  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1760  chosen_bytes == 8);
1761  CHECK(output_buffer_byte_stream);
1762  CHECK(out_row_idx);
1763  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1764  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1765  auto out_per_col_byte_idx =
1766 #ifdef _WIN32
1767  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1768 #else
1769  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1770 #endif
1771  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1772  LL_INT(static_cast<int64_t>(col_off)));
1773  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1774  auto output_ptr = LL_BUILDER.CreateGEP(
1775  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1776  output_buffer_byte_stream,
1777  byte_offset);
1778  agg_col_ptr = LL_BUILDER.CreateBitCast(
1779  output_ptr,
1780  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1781  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1782  } else {
1783  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1784  auto const col_off = col_off_in_bytes / chosen_bytes;
1785  auto const col_rem = col_off_in_bytes % chosen_bytes;
1786  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1787  CHECK(std::get<1>(agg_out_ptr_w_idx));
1788  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1789  std::get<1>(agg_out_ptr_w_idx),
1790  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1791  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1792  auto* bit_cast = LL_BUILDER.CreateBitCast(
1793  std::get<0>(agg_out_ptr_w_idx),
1794  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1795  agg_col_ptr = LL_BUILDER.CreateGEP(
1796  bit_cast->getType()->getScalarType()->getPointerElementType(),
1797  bit_cast,
1798  offset);
1799  }
1800  } else {
1801  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1802  auto const col_off = col_off_in_bytes / chosen_bytes;
1803  auto const col_rem = col_off_in_bytes % chosen_bytes;
1804  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1805  auto* bit_cast = LL_BUILDER.CreateBitCast(
1806  std::get<0>(agg_out_ptr_w_idx),
1807  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1808  agg_col_ptr = LL_BUILDER.CreateGEP(
1809  bit_cast->getType()->getScalarType()->getPointerElementType(),
1810  bit_cast,
1811  LL_INT(col_off));
1812  }
1813  CHECK(agg_col_ptr);
1814  return agg_col_ptr;
1815 }
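// In the first columnar branch the target address is computed in the byte
// stream as col_off + out_row_idx * chosen_bytes, with the multiplication
// emitted as a left shift. Using made-up numbers for illustration: a 4-byte
// slot whose column starts at byte offset 1024 places row 7 at
// 1024 + 7 * 4 = 1052, and the resulting i8* is then cast to an i32 pointer.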
1816 
1817 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1818  DiamondCodegen& diamond_codegen,
1819  const QueryMemoryDescriptor& query_mem_desc,
1820  const CompilationOptions& co) {
1821  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1822  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1823  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1824  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1825  estimator_comp_count_lv);
1826  int32_t subkey_idx = 0;
1827  for (const auto& estimator_arg_comp : estimator_arg) {
1828  const auto estimator_arg_comp_lvs =
1829  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1830  query_mem_desc.getEffectiveKeyWidth(),
1831  co,
1832  false,
1833  0,
1834  diamond_codegen,
1835  array_loops,
1836  true);
1837  CHECK(!estimator_arg_comp_lvs.original_value);
1838  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1839  // store the sub-key to the buffer
1840  LL_BUILDER.CreateStore(
1841  estimator_arg_comp_lv,
1842  LL_BUILDER.CreateGEP(
1843  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1844  estimator_key_lv,
1845  LL_INT(subkey_idx++)));
1846  }
1847  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1848  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1849  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1850  const auto estimator_comp_bytes_lv =
1851  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1852  const auto bitmap_size_lv =
1853  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1854  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1855  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1856 }
1857 
1858 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1859  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1860 }
1861 
1862 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1863  const int64_t val,
1864  const int64_t skip_val) {
1865  if (val != skip_val) {
1866  agg_count_distinct(agg, val);
1867  }
1868 }
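// These CPU-side runtime exports treat the 64-bit aggregate slot as a
// pointer to the backing container: *agg holds a CountDistinctSet (a
// robin_hood unordered set) for exact COUNT(DISTINCT), and the _skip_val
// variant simply drops rows whose value equals the null sentinel supplied
// by the generated code.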
1869 
1870 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1871  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1872  t_digest->allocate();
1873  t_digest->add(val);
1874 }
1875 
1876 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1877  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1878  mode_map->add(val);
1879 }
1880 
1881 void GroupByAndAggregate::codegenCountDistinct(
1882  const size_t target_idx,
1883  const Analyzer::Expr* target_expr,
1884  std::vector<llvm::Value*>& agg_args,
1885  const QueryMemoryDescriptor& query_mem_desc,
1886  const ExecutorDeviceType device_type) {
1887  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1888  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1889  const auto& arg_ti =
1890  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1891  if (arg_ti.is_fp()) {
1892  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1893  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1894  }
1895  const auto& count_distinct_descriptor =
1896  query_mem_desc.getCountDistinctDescriptor(target_idx);
1897  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1898  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1899  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1900  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1901  if (device_type == ExecutorDeviceType::GPU) {
1902  const auto base_dev_addr = getAdditionalLiteral(-1);
1903  const auto base_host_addr = getAdditionalLiteral(-2);
1904  agg_args.push_back(base_dev_addr);
1905  agg_args.push_back(base_host_addr);
1906  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1907  } else {
1908  emitCall("agg_approximate_count_distinct", agg_args);
1909  }
1910  return;
1911  }
1912  std::string agg_fname{"agg_count_distinct"};
1913  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1914  agg_fname += "_bitmap";
1915  agg_args.push_back(LL_INT(count_distinct_descriptor.min_val));
1916  agg_args.push_back(LL_INT(count_distinct_descriptor.bucket_size));
1917  }
1918  if (agg_info.skip_null_val) {
1919  auto null_lv = executor_->cgen_state_->castToTypeIn(
1920  (arg_ti.is_fp()
1921  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1922  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1923  64);
1924  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1925  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1926  agg_fname += "_skip_val";
1927  agg_args.push_back(null_lv);
1928  }
1929  if (device_type == ExecutorDeviceType::GPU) {
1930  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1931  agg_fname += "_gpu";
1932  const auto base_dev_addr = getAdditionalLiteral(-1);
1933  const auto base_host_addr = getAdditionalLiteral(-2);
1934  agg_args.push_back(base_dev_addr);
1935  agg_args.push_back(base_host_addr);
1936  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1937  CHECK_EQ(size_t(0),
1938  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1939  count_distinct_descriptor.sub_bitmap_count);
1940  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1941  count_distinct_descriptor.sub_bitmap_count)));
1942  }
1943  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1944  emitCall(agg_fname, agg_args);
1945  } else {
1946  executor_->cgen_state_->emitExternalCall(
1947  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1948  }
1949 }
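// As with the group-value helpers, the count-distinct entry point is
// assembled from suffixes: a bitmap implementation yields
// "agg_count_distinct_bitmap", a nullable argument appends "_skip_val", and
// GPU execution appends "_gpu" along with the device/host bitmap base
// addresses and per-sub-bitmap size, e.g.
// "agg_count_distinct_bitmap_skip_val_gpu".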
1950 
1951 void GroupByAndAggregate::codegenApproxQuantile(
1952  const size_t target_idx,
1953  const Analyzer::Expr* target_expr,
1954  std::vector<llvm::Value*>& agg_args,
1955  const QueryMemoryDescriptor& query_mem_desc,
1956  const ExecutorDeviceType device_type) {
1957  if (device_type == ExecutorDeviceType::GPU) {
1958  throw QueryMustRunOnCpu();
1959  }
1960  llvm::BasicBlock *calc, *skip{nullptr};
1961  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1962  auto const arg_ti =
1963  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1964  bool const nullable = !arg_ti.get_notnull();
1965 
1966  auto* cs = executor_->cgen_state_.get();
1967  auto& irb = cs->ir_builder_;
1968  if (nullable) {
1969  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1970  auto* const skip_cond = arg_ti.is_fp()
1971  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1972  : irb.CreateICmpEQ(agg_args.back(), null_value);
1973  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1974  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1975  irb.CreateCondBr(skip_cond, skip, calc);
1976  cs->current_func_->getBasicBlockList().push_back(calc);
1977  irb.SetInsertPoint(calc);
1978  }
1979  if (!arg_ti.is_fp()) {
1980  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1981  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1982  }
1983  cs->emitExternalCall(
1984  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1985  if (nullable) {
1986  irb.CreateBr(skip);
1987  cs->current_func_->getBasicBlockList().push_back(skip);
1988  irb.SetInsertPoint(skip);
1989  }
1990 }
1991 
1992 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1993  const Analyzer::Expr* target_expr,
1994  std::vector<llvm::Value*>& agg_args,
1995  const QueryMemoryDescriptor& query_mem_desc,
1996  const ExecutorDeviceType device_type) {
1997  if (device_type == ExecutorDeviceType::GPU) {
1998  throw QueryMustRunOnCpu();
1999  }
2000  llvm::BasicBlock *calc, *skip{nullptr};
2001  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2002  auto const arg_ti =
2003  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
2004  bool const nullable = !arg_ti.get_notnull();
2005  bool const is_fp = arg_ti.is_fp();
2006  auto* cs = executor_->cgen_state_.get();
2007  auto& irb = cs->ir_builder_;
2008  if (nullable) {
2009  auto* const null_value =
2010  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
2011  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
2012  : irb.CreateICmpEQ(agg_args.back(), null_value);
2013  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
2014  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
2015  irb.CreateCondBr(skip_cond, skip, calc);
2016  cs->current_func_->getBasicBlockList().push_back(calc);
2017  irb.SetInsertPoint(calc);
2018  }
2019  if (is_fp) {
2020  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
2021  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
2022  }
2023  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
2024  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
2025  if (nullable) {
2026  irb.CreateBr(skip);
2027  cs->current_func_->getBasicBlockList().push_back(skip);
2028  irb.SetInsertPoint(skip);
2029  }
2030 }
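// Both codegenApproxQuantile and codegenMode are CPU-only (the GPU path
// throws QueryMustRunOnCpu) and, for nullable arguments, wrap the external
// call in a skip diamond: the argument is compared against the inline null
// and control branches either to the calc block that performs the call or
// directly to the skip block that follows it.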
2031 
2032 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
2033  CHECK_LT(off, 0);
2034  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
2035  auto* bit_cast = LL_BUILDER.CreateBitCast(
2036  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
2037  auto* gep =
2038  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
2039  bit_cast,
2040  LL_INT(off));
2041  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
2042 }
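// getAdditionalLiteral reads 64-bit values stored just ahead of the row
// function's "literals" buffer, hence the negative offsets: the GPU
// count-distinct paths above fetch the device bitmap base address at -1 and
// the host base address at -2.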
2043 
2044 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
2045  const Analyzer::Expr* target_expr,
2046  const CompilationOptions& co) {
2047  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2048  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2049  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2050  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2051 
2052  // TODO(alex): handle arrays uniformly?
2053  CodeGenerator code_generator(executor_);
2054  if (target_expr) {
2055  const auto& target_ti = target_expr->get_type_info();
2056  if (target_ti.is_buffer() &&
2057  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2058  const auto target_lvs =
2059  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2060  : code_generator.codegen(
2061  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2062  if (!func_expr && !arr_expr) {
2063  // Anything taking the chunk-transport path here was generated from a source
2064  // other than an ARRAY[] expression
2065  if (target_ti.is_text_encoding_none()) {
2066  CHECK_EQ(size_t(3), target_lvs.size());
2067  return {target_lvs[1], target_lvs[2]};
2068  }
2069  CHECK(target_ti.is_array());
2070  CHECK_EQ(size_t(1), target_lvs.size());
2071  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2072  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2073  const auto i8p_ty =
2074  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2075  const auto& elem_ti = target_ti.get_elem_type();
2076  return {
2077  executor_->cgen_state_->emitExternalCall(
2078  "array_buff",
2079  i8p_ty,
2080  {target_lvs.front(), code_generator.posArg(target_expr)}),
2081  executor_->cgen_state_->emitExternalCall(
2082  "array_size",
2083  i32_ty,
2084  {target_lvs.front(),
2085  code_generator.posArg(target_expr),
2086  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2087  } else {
2088  if (agg_expr) {
2089  throw std::runtime_error(
2090  "Using array[] operator as argument to an aggregate operator is not "
2091  "supported");
2092  }
2093  CHECK(func_expr || arr_expr);
2094  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2095  CHECK_EQ(size_t(1), target_lvs.size());
2096  const auto prefix = target_ti.get_buffer_name();
2097  CHECK(target_ti.is_array() || target_ti.is_text_encoding_none());
2098  const auto target_lv = LL_BUILDER.CreateLoad(
2099  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2100  // const auto target_lv_type = target_lvs[0]->getType();
2101  // CHECK(target_lv_type->isStructTy());
2102  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2103  const auto i8p_ty = llvm::PointerType::get(
2104  get_int_type(8, executor_->cgen_state_->context_), 0);
2105  const auto ptr = LL_BUILDER.CreatePointerCast(
2106  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2107  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2108  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2109  const auto nullcheck_ok_bb =
2110  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2111  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2112  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2113 
2114  // TODO(adb): probably better to zext the bool
2115  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2116  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2117  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2118 
2119  const auto ret_bb =
2120  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2121  LL_BUILDER.SetInsertPoint(ret_bb);
2122  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2123  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2124  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2125  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2126  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2127  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2128  executor_->cgen_state_->emitExternalCall(
2129  "register_buffer_with_executor_rsm",
2130  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2131  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2132  LL_BUILDER.CreateBr(ret_bb);
2133  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2134  LL_BUILDER.CreateBr(ret_bb);
2135 
2136  LL_BUILDER.SetInsertPoint(ret_bb);
2137  return {result_phi, size};
2138  }
2139  CHECK_EQ(size_t(2), target_lvs.size());
2140  return {target_lvs[0], target_lvs[1]};
2141  }
2142  }
2143  if (target_ti.is_geometry() &&
2144  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2145  auto generate_coord_lvs =
2146  [&](auto* selected_target_expr,
2147  bool const fetch_columns) -> std::vector<llvm::Value*> {
2148  const auto target_lvs =
2149  code_generator.codegen(selected_target_expr, fetch_columns, co);
2150  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2151  target_expr->get_type_info().is_geometry()) {
2152  // return a pointer to the temporary alloca
2153  return target_lvs;
2154  }
2155  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2156  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2157  if (geo_uoper || geo_binoper) {
2158  CHECK(target_expr->get_type_info().is_geometry());
2159  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2160  target_lvs.size());
2161  return target_lvs;
2162  }
2163  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2164  target_lvs.size());
2165 
2166  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2167  const auto i8p_ty =
2168  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2169  std::vector<llvm::Value*> coords;
2170  size_t ctr = 0;
2171  for (const auto& target_lv : target_lvs) {
2172  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2173  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2174  // for col 1 for pols / mpolys, etc). Hardcoding for now. The first array is the
2175  // coords array (TINYINT). Subsequent arrays are regular INT.
2176 
2177  const size_t elem_sz = ctr == 0 ? 1 : 4;
2178  ctr++;
2179  int32_t fixlen = -1;
2180  if (target_ti.get_type() == kPOINT) {
2181  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2182  if (col_var) {
2183  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2184  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2185  fixlen = coords_cd->columnType.get_size();
2186  }
2187  }
2188  }
2189  if (fixlen > 0) {
2190  coords.push_back(executor_->cgen_state_->emitExternalCall(
2191  "fast_fixlen_array_buff",
2192  i8p_ty,
2193  {target_lv, code_generator.posArg(selected_target_expr)}));
2194  auto fixed_len_lv = executor_->cgen_state_->emitExternalCall(
2195  "determine_fixed_array_len",
2196  llvm::IntegerType::get(code_generator.cgen_state_->context_, 64),
2197  {target_lv, executor_->cgen_state_->llInt(int64_t(fixlen))});
2198  coords.push_back(fixed_len_lv);
2199  continue;
2200  }
2201  coords.push_back(executor_->cgen_state_->emitExternalCall(
2202  "array_buff",
2203  i8p_ty,
2204  {target_lv, code_generator.posArg(selected_target_expr)}));
2205  coords.push_back(executor_->cgen_state_->emitExternalCall(
2206  "array_size",
2207  i32_ty,
2208  {target_lv,
2209  code_generator.posArg(selected_target_expr),
2210  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2211  }
2212  return coords;
2213  };
2214 
2215  if (agg_expr) {
2216  return generate_coord_lvs(agg_expr->get_arg(), true);
2217  } else {
2218  return generate_coord_lvs(target_expr,
2219  !executor_->plan_state_->allow_lazy_fetch_);
2220  }
2221  }
2222  }
2223  bool fetch_column = !executor_->plan_state_->allow_lazy_fetch_;
2224  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2225  : code_generator.codegen(target_expr, fetch_column, co);
2226 }
2227 
2228 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2229  const std::vector<llvm::Value*>& args) {
2230  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2231  return executor_->cgen_state_->emitCall(fname, args);
2232 }
2233 
2234 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2235  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2236  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2237  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2238  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2239 
2240  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2241 }
2242 
2243 #undef CUR_FUNC
2244 #undef ROW_FUNC
2245 #undef LL_FP
2246 #undef LL_INT
2247 #undef LL_BOOL
2248 #undef LL_BUILDER
2249 #undef LL_CONTEXT
2250 
2251 size_t GroupByAndAggregate::shard_count_for_top_groups(
2252  const RelAlgExecutionUnit& ra_exe_unit) {
2253  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2254  return 0;
2255  }
2256  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2257  const auto grouped_col_expr =
2258  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2259  if (!grouped_col_expr) {
2260  continue;
2261  }
2262  const auto& column_key = grouped_col_expr->getColumnKey();
2263  if (column_key.table_id <= 0) {
2264  return 0;
2265  }
2266  const auto td = Catalog_Namespace::get_metadata_for_table(
2267  {column_key.db_id, column_key.table_id});
2268  if (td->shardedColumnId == column_key.column_id) {
2269  return td->nShards;
2270  }
2271  }
2272  return 0;
2273 }
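// shard_count_for_top_groups returns a non-zero count only when the query
// orders by a single expression with a LIMIT and one of its group-by columns
// is the table's shard column (in which case the table's nShards is
// returned); any other shape returns 0.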