OmniSciDB  a5dc49c757
anonymous_namespace{GroupByAndAggregate.cpp} Namespace Reference

Functions

int32_t get_agg_count (const std::vector< Analyzer::Expr * > &target_exprs)
 
bool expr_is_rowid (const Analyzer::Expr *expr)
 
bool has_count_distinct (const RelAlgExecutionUnit &ra_exe_unit)
 
bool is_column_range_too_big_for_perfect_hash (const ColRangeInfo &col_range_info, const int64_t max_entry_count)
 
bool cardinality_estimate_less_than_column_range (const int64_t cardinality_estimate, const ColRangeInfo &col_range_info)
 
ColRangeInfo get_expr_range_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
 
int64_t get_bucketed_cardinality_without_nulls (const ColRangeInfo &col_range_info)
 
KeylessInfo get_keyless_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool is_group_by, Executor *executor)
 
CountDistinctDescriptors init_count_distinct_descriptors (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &group_by_range_info, const ExecutorDeviceType device_type, Executor *executor)
 

Function Documentation

bool anonymous_namespace{GroupByAndAggregate.cpp}::cardinality_estimate_less_than_column_range (const int64_t cardinality_estimate, const ColRangeInfo &col_range_info)

Definition at line 161 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  try {
    // the cardinality estimate is the size of the baseline hash table. further penalize
    // the baseline hash table by a factor of 2x due to overhead in computing baseline
    // hash. This has the overall effect of penalizing baseline hash over perfect hash by
    // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
    // count of the column, we use baseline hash on the filtered set
    return checked_int64_t(cardinality_estimate) * 2 <
           static_cast<int64_t>(checked_int64_t(col_range_info.max) -
                                checked_int64_t(col_range_info.min));
  } catch (...) {
    return false;
  }
}
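
A minimal worked example of the threshold (hypothetical values; it assumes ColRangeInfo aggregate-initializes as {hash_type_, min, max, bucket, has_nulls}, consistent with the returns in get_expr_range_info below):

ColRangeInfo range{QueryDescriptionType::GroupByPerfectHash,
                   /*min=*/0, /*max=*/1'000'000, /*bucket=*/0, /*has_nulls=*/false};
// max - min = 1'000'000 is the perfect hash entry count.
cardinality_estimate_less_than_column_range(400'000, range);  // 800'000 < 1'000'000: true
cardinality_estimate_less_than_column_range(600'000, range);  // 1'200'000 >= 1'000'000: false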

bool anonymous_namespace{GroupByAndAggregate.cpp}::expr_is_rowid (const Analyzer::Expr *expr)

Definition at line 128 of file GroupByAndAggregate.cpp.

References CHECK_EQ, and get_column_descriptor_maybe().

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
  if (!col) {
    return false;
  }
  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
  if (!cd || !cd->isVirtualCol) {
    return false;
  }
  CHECK_EQ("rowid", cd->columnName);
  return true;
}
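
A usage sketch (group_expr is a hypothetical variable, not from the source): callers use this predicate to recognize the virtual rowid column, whose values are unique per row, so a perfect hash layout can be sized from the row count rather than from column-range metadata.

if (expr_is_rowid(group_expr)) {
  // rowid is unique per row; size the perfect hash table from the row count
}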

int32_t anonymous_namespace{GroupByAndAggregate.cpp}::get_agg_count (const std::vector< Analyzer::Expr * > &target_exprs)

Definition at line 103 of file GroupByAndAggregate.cpp.

References agg_count(), CHECK, Analyzer::Expr::get_type_info(), kAVG, and kSAMPLE.

Referenced by GroupByAndAggregate::codegen().

{
  int32_t agg_count{0};
  for (auto target_expr : target_exprs) {
    CHECK(target_expr);
    const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      const auto& ti = target_expr->get_type_info();
      if (ti.is_buffer()) {
        agg_count += 2;
      } else if (ti.is_geometry()) {
        agg_count += ti.get_physical_coord_cols() * 2;
      } else {
        ++agg_count;
      }
      continue;
    }
    if (agg_expr && agg_expr->get_aggtype() == kAVG) {
      agg_count += 2;
    } else {
      ++agg_count;
    }
  }
  return agg_count;
}
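
A hypothetical target list illustrating the slot accounting (the query and column types are assumptions, not from the source):

// SELECT COUNT(*), AVG(x), arr_col FROM t;
//   COUNT(*)  -> +1  (single aggregate slot)
//   AVG(x)    -> +2  (materialized as a sum slot plus a count slot)
//   arr_col   -> +2  (non-aggregate varlen "buffer" target: pointer + length)
// get_agg_count returns 5 for this target list.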

int64_t anonymous_namespace{GroupByAndAggregate.cpp}::get_bucketed_cardinality_without_nulls (const ColRangeInfo &col_range_info)

Definition at line 368 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, ColRangeInfo::max, and ColRangeInfo::min.

Referenced by init_count_distinct_descriptors().

{
  if (col_range_info.min <= col_range_info.max) {
    size_t size = col_range_info.max - col_range_info.min;
    if (col_range_info.bucket) {
      size /= col_range_info.bucket;
    }
    if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
      // try to use unordered_set instead of crashing due to CHECK failure
      // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
      return 0;
    }
    return static_cast<int64_t>(size + 1);
  } else {
    return 0;
  }
}
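
A worked example with hypothetical values:

// min = 100, max = 1000, bucket = 10:
//   size = (1000 - 100) / 10 = 90, so the function returns 90 + 1 = 91
//   (the +1 accounts for the inclusive endpoints of the range).
// An inverted range (min > max), or a size at or above int64_t's max,
// returns 0, signalling the caller to fall back to a non-bitmap approach.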


ColRangeInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)

Definition at line 177 of file GroupByAndAggregate.cpp.

References CHECK, Double, Float, getExpressionRange(), heavyai::GroupByBaselineHash, heavyai::GroupByPerfectHash, Integer, Invalid, heavyai::NonGroupedAggregate, heavyai::Projection, and RelAlgExecutionUnit::simple_quals.

Referenced by GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenPerfectHashFunction(), GroupByAndAggregate::getColRangeInfo(), GroupByAndAggregate::gpuCanHandleOrderEntries(), and init_count_distinct_descriptors().

{
  if (!expr) {
    return {QueryDescriptionType::Projection, 0, 0, 0, false};
  }

  const auto expr_range = getExpressionRange(
      expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
  switch (expr_range.getType()) {
    case ExpressionRangeType::Integer: {
      if (expr_range.getIntMin() > expr_range.getIntMax()) {
        return {
            QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
      }
      return {QueryDescriptionType::GroupByPerfectHash,
              expr_range.getIntMin(),
              expr_range.getIntMax(),
              expr_range.getBucket(),
              expr_range.hasNulls()};
    }
    case ExpressionRangeType::Float:
    case ExpressionRangeType::Double: {
      if (expr_range.getFpMin() > expr_range.getFpMax()) {
        return {
            QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
      }
      return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
    }
    case ExpressionRangeType::Invalid:
      return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
    default:
      CHECK(false);
  }
  CHECK(false);
  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
}
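
In summary: a valid integer range maps to a perfect-hash layout, while floating-point, inverted, or invalid ranges fall back to baseline hash. Hypothetical outcomes:

// An integer group-by expression whose metadata gives the range [10, 50],
// no bucketing, and possible nulls yields:
//   {QueryDescriptionType::GroupByPerfectHash, 10, 50, 0, true}
// A DOUBLE expression with any valid range yields:
//   {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false}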

KeylessInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_keyless_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool is_group_by, Executor *executor)

This function goes through all target expressions and answers two questions:

  1. Is it possible to have keyless hash?
  2. If yes to 1, then what aggregate expression should be considered to represent the key's presence, if needed (e.g., in detecting empty entries in the result set).

NOTE: Keyless hash is only valid with single-column group by at the moment.
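
As a hypothetical illustration (the query is an assumption, not from the source): in SELECT x, COUNT(*) FROM t GROUP BY x; the COUNT(*) slot initializes to zero and is at least one for any group actually written, so the count value itself marks an entry as non-empty and no separate key column must be materialized. Tracing the loop below:

// target 0 is the projected key x (not an aggregate) -> ++index, index = 1
// target 1 is COUNT(*) -> usable as the presence indicator -> found = true
// Result, assuming KeylessInfo carries {keyless, index} as the return
// statement in the definition suggests:
//   get_keyless_info(...) == KeylessInfo{/*keyless=*/true, /*index=*/1}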

Definition at line 478 of file GroupByAndAggregate.cpp.

References agg_arg(), CHECK, constrained_not_null(), Double, Float, g_bigint_count, get_agg_initial_val(), get_compact_type(), get_target_info(), getExpressionRange(), Integer, Invalid, is_distinct_target(), kAVG, kCOUNT, kMAX, kMIN, kSUM, RelAlgExecutionUnit::quals, takes_float_argument(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

{
  bool keyless{true}, found{false};
  int32_t num_agg_expr{0};
  int32_t index{0};
  for (const auto target_expr : ra_exe_unit.target_exprs) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    const auto chosen_type = get_compact_type(agg_info);
    if (agg_info.is_agg) {
      num_agg_expr++;
    }
    if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
      auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
      CHECK(agg_expr);
      const auto arg_expr = agg_arg(target_expr);
      const bool float_argument_input = takes_float_argument(agg_info);
      switch (agg_info.agg_kind) {
        case kAVG:
          ++index;
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
                expr_range_info.hasNulls()) {
              break;
            }
          }
          found = true;
          break;
        case kCOUNT:
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
                expr_range_info.hasNulls()) {
              break;
            }
          }
          found = true;
          break;
        case kSUM: {
          auto arg_ti = arg_expr->get_type_info();
          if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
            arg_ti.set_notnull(true);
          }
          if (!arg_ti.get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
                !expr_range_info.hasNulls()) {
              found = true;
            }
          } else {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            switch (expr_range_info.getType()) {
              case ExpressionRangeType::Float:
              case ExpressionRangeType::Double:
                if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
                  found = true;
                }
                break;
              case ExpressionRangeType::Integer:
                if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
                  found = true;
                }
                break;
              default:
                break;
            }
          }
          break;
        }
        case kMIN: {
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_buffer()) {
            break;
          }
          auto expr_range_info =
              getExpressionRange(agg_expr->get_arg(), query_infos, executor);
          auto init_max = get_agg_initial_val(agg_info.agg_kind,
                                              chosen_type,
                                              is_group_by || float_argument_input,
                                              float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            case ExpressionRangeType::Float:
            case ExpressionRangeType::Double: {
              auto double_max =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
              if (expr_range_info.getFpMax() < double_max) {
                found = true;
              }
              break;
            }
            case ExpressionRangeType::Integer:
              if (expr_range_info.getIntMax() < init_max) {
                found = true;
              }
              break;
            default:
              break;
          }
          break;
        }
        case kMAX: {
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_buffer()) {
            break;
          }
          auto expr_range_info =
              getExpressionRange(agg_expr->get_arg(), query_infos, executor);
          // NULL sentinel and init value for kMAX are identical, which results in
          // ambiguity in detecting empty keys in presence of nulls.
          if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
              expr_range_info.hasNulls()) {
            break;
          }
          auto init_min = get_agg_initial_val(agg_info.agg_kind,
                                              chosen_type,
                                              is_group_by || float_argument_input,
                                              float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            case ExpressionRangeType::Float:
            case ExpressionRangeType::Double: {
              auto double_min =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
              if (expr_range_info.getFpMin() > double_min) {
                found = true;
              }
              break;
            }
            case ExpressionRangeType::Integer:
              if (expr_range_info.getIntMin() > init_min) {
                found = true;
              }
              break;
            default:
              break;
          }
          break;
        }
        default:
          keyless = false;
          break;
      }
    }
    if (!keyless) {
      break;
    }
    if (!found) {
      ++index;
    }
  }

  // shouldn't use keyless for projection only
  return {
      keyless && found,
      index,
  };
}

bool anonymous_namespace{GroupByAndAggregate.cpp}::has_count_distinct (const RelAlgExecutionUnit &ra_exe_unit)

Definition at line 141 of file GroupByAndAggregate.cpp.

References g_bigint_count, get_target_info(), is_distinct_target(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  for (const auto& target_expr : ra_exe_unit.target_exprs) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (agg_info.is_agg && is_distinct_target(agg_info)) {
      return true;
    }
  }
  return false;
}

CountDistinctDescriptors anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &group_by_range_info, const ExecutorDeviceType device_type, Executor *executor)

Definition at line 639 of file GroupByAndAggregate.cpp.

References align_to_int64(), Bitmap, ColRangeInfo::bucket, CHECK, CHECK_GE, Analyzer::ColumnVar::collect_column_var(), Analyzer::ColumnVar::colvar_comp(), g_bigint_count, g_bitmap_memory_limit, g_enable_watchdog, g_hll_precision_bits, Analyzer::AggExpr::get_arg(), get_bucketed_cardinality_without_nulls(), get_count_distinct_sub_bitmap_count(), DateConverters::get_epoch_days_from_seconds(), get_expr_range_info(), get_target_info(), Analyzer::Expr::get_type_info(), GPU, heavyai::GroupByPerfectHash, hll_size_for_rate(), Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, kDATE, kENCODING_DATE_IN_DAYS, kINT, ColRangeInfo::max, ColRangeInfo::min, heavyai::Projection, RelAlgExecutionUnit::target_exprs, RelAlgExecutionUnit::target_exprs_original_type_infos, and UnorderedSet.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

{
  CountDistinctDescriptors count_distinct_descriptors;
  auto compute_bytes_per_group =
      [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
        size_t effective_size_bytes = (bitmap_sz + 7) / 8;
        const auto padded_size =
            (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
                ? align_to_int64(effective_size_bytes)
                : effective_size_bytes;
        return padded_size * sub_bitmap_count;
      };
  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
    const auto target_expr = ra_exe_unit.target_exprs[i];
    auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg);
      CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
      const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
      const auto& arg_ti = agg_expr->get_arg()->get_type_info();
      if (arg_ti.is_text_encoding_none()) {
        throw std::runtime_error(
            "Strings must be dictionary-encoded for COUNT(DISTINCT).");
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
        throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
        throw std::runtime_error(
            "APPROX_COUNT_DISTINCT on geometry columns not supported");
      }
      if (agg_info.is_distinct && arg_ti.is_geometry()) {
        throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
      }
      ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
      auto arg_range_info =
          arg_ti.is_fp() ? no_range_info
                         : get_expr_range_info(
                               ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
      const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
      if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
        const auto& original_target_expr_ti = it->second;
        if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
            original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
          // manually encode the col range of date col if necessary
          // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
          auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
            if (original_target_expr_ti.get_comp_param() == 16) {
              return date_val < INT16_MIN || date_val > INT16_MAX;
            } else {
              return date_val < INT32_MIN || date_val > INT32_MAX;
            }
          };
          if (is_date_value_not_encoded(arg_range_info.min)) {
            // chunk metadata of the date column contains decoded value
            // so we manually encode it again here to represent its column range correctly
            arg_range_info.min =
                DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
          }
          if (is_date_value_not_encoded(arg_range_info.max)) {
            arg_range_info.max =
                DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
          }
          // now we manually encode the value, so we need to invalidate bucket value
          // i.e., 86000 -> 0, to correctly calculate the size of bitmap
          arg_range_info.bucket = 0;
        }
      }

      CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
      int64_t bitmap_sz_bits{0};
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
        const auto error_rate_expr = agg_expr->get_arg1();
        if (error_rate_expr) {
          CHECK(error_rate_expr->get_type_info().get_type() == kINT);
          auto const error_rate =
              dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
          CHECK(error_rate);
          CHECK_GE(error_rate->get_constval().intval, 1);
          bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
        } else {
          bitmap_sz_bits = g_hll_precision_bits;
        }
      }
      if (arg_range_info.isEmpty()) {
        count_distinct_descriptors.emplace_back(
            CountDistinctDescriptor{CountDistinctImplType::Bitmap,
                                    0,
                                    arg_range_info.bucket,
                                    64,
                                    agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
                                    device_type,
                                    1});
        continue;
      }
      const auto sub_bitmap_count =
          get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
      size_t worst_case_num_groups{1};
      if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
          !(arg_ti.is_buffer() || arg_ti.is_geometry())) {  // TODO(alex): allow bitmap
                                                            // implementation for arrays
        count_distinct_impl_type = CountDistinctImplType::Bitmap;
        if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
          bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
          if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
            count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
          }
          // check a potential OOM when using bitmap-based approach
          const auto total_bytes_per_entry =
              compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
          const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
          const auto maximum_num_groups =
              (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
          const auto total_bitmap_bytes_for_groups =
              total_bytes_per_entry * maximum_num_groups;
          // we can estimate a potential OOM of bitmap-based count-distinct operator
          // by using the logic "check_total_bitmap_memory"
          if (total_bitmap_bytes_for_groups >=
              static_cast<size_t>(g_bitmap_memory_limit)) {
            const auto agg_expr_max_entry_count =
                arg_range_info.max - arg_range_info.min + 1;
            int64_t max_agg_expr_table_cardinality{1};
            std::set<const Analyzer::ColumnVar*,
                     bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
                colvar_set(Analyzer::ColumnVar::colvar_comp);
            agg_expr->collect_column_var(colvar_set, true);
            for (const auto cv : colvar_set) {
              auto it =
                  std::find_if(query_infos.begin(),
                               query_infos.end(),
                               [&](const auto& input_table_info) {
                                 return input_table_info.table_key == cv->getTableKey();
                               });
              int64_t cur_table_cardinality =
                  it != query_infos.end()
                      ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
                      : -1;
              max_agg_expr_table_cardinality =
                  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
              worst_case_num_groups *= cur_table_cardinality;
            }
            auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
              return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
            };
            // if we have valid stats regarding input expr, we can try to relax the OOM
            if (has_valid_stat()) {
              // a threshold related to a ratio of a range of agg expr (let's say R)
              // and table cardinality (C), i.e., use unordered_set if the # bits to build
              // a bitmap based on R is four times larger than that of C
              const size_t unordered_set_threshold{2};
              // When we detect OOM of bitmap-based approach we selectively switch it to
              // hash set-based processing logic if one of the followings is satisfied:
              // 1) the column range is too wide compared with the table cardinality, or
              // 2) the column range is too wide compared with the avg of # unique values
              // per group by entry
              const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
              const auto bits_for_agg_table =
                  std::ceil(log(max_agg_expr_table_cardinality));
              const auto avg_num_unique_entries_per_group =
                  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
              // case a) given a range of entry count of agg_expr and the maximum
              // cardinality among source tables of the agg_expr , we try to detect the
              // misleading case of too sparse column range , i.e., agg_expr has 1M column
              // range but only has two tuples {1 and 1M} / case b) check whether
              // using bitmap is really beneficial when considering uniform distribution
              // of (unique) keys.
              if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
                  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
                count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
              } else {
                throw std::runtime_error(
                    "Consider using approx_count_distinct operator instead of "
                    "count_distinct operator to lower the memory "
                    "requirements");
              }
            }
          }
        }
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
          count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
          !(arg_ti.is_array() || arg_ti.is_geometry())) {
        count_distinct_impl_type = CountDistinctImplType::Bitmap;
      }
      const size_t too_many_entries{100000000};
      if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
          worst_case_num_groups > too_many_entries &&
          count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
        throw WatchdogException(
            "Detect too many input entries for set-based count distinct operator under "
            "the watchdog");
      }
      count_distinct_descriptors.emplace_back(
          CountDistinctDescriptor{count_distinct_impl_type,
                                  arg_range_info.min,
                                  arg_range_info.bucket,
                                  bitmap_sz_bits,
                                  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
                                  device_type,
                                  sub_bitmap_count});
    } else {
      count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
          CountDistinctImplType::Invalid, 0, 0, 0, false, device_type, 0});
    }
  }
  return count_distinct_descriptors;
}
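
A worked sizing pass with hypothetical numbers, following compute_bytes_per_group and the group-count estimate above:

// COUNT(DISTINCT x) where x has a bucketed, null-free cardinality of 1'000'000:
//   bitmap_sz_bits       = 1'000'000
//   effective_size_bytes = (1'000'000 + 7) / 8 = 125'000
//   padded_size          = 125'000 on GPU (already 8-byte aligned)
//   bytes per group      = 125'000 * sub_bitmap_count
// With a group-by range [0, 9'999] and bucket 1, maximum_num_groups = 10'000,
// so for sub_bitmap_count = 1 the estimate is 125'000 * 10'000 = 1.25e9 bytes.
// That total is checked against g_bitmap_memory_limit; on overflow the code
// either switches to CountDistinctImplType::UnorderedSet or suggests
// approx_count_distinct.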

bool anonymous_namespace{GroupByAndAggregate.cpp}::is_column_range_too_big_for_perfect_hash (const ColRangeInfo &col_range_info, const int64_t max_entry_count)

Definition at line 151 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  try {
    return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
                                checked_int64_t(col_range_info.min)) >= max_entry_count;
  } catch (...) {
    return true;
  }
}
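
A quick hypothetical check of the threshold:

// min = 0, max = 1'000'000'000, max_entry_count = 100'000'000:
//   1'000'000'000 - 0 >= 100'000'000 -> true (too big for perfect hash)
// If checked_int64_t detects overflow while computing max - min, the catch
// block also returns true, conservatively steering away from perfect hash.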