_group_by_and_aggregate_8h_source.html

 /*

  * Copyright 2022 HEAVY.AI, Inc.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */


 #ifndef QUERYENGINE_GROUPBYANDAGGREGATE_H

 #define QUERYENGINE_GROUPBYANDAGGREGATE_H


 #include "BufferCompaction.h"

 #include "ColumnarResults.h"

 #include "CompilationOptions.h"

 #include "GpuMemUtils.h"

 #include "GpuSharedMemoryContext.h"

 #include "InputMetadata.h"

 #include "QueryExecutionContext.h"

 #include "Rendering/RenderInfo.h"

 #include "RuntimeFunctions.h"


 #include "QueryEngine/Utils/DiamondCodegen.h"


 #include "../Shared/sqltypes.h"

 #include "Logger/Logger.h"


 #include <llvm/IR/Function.h>

 #include <llvm/IR/Instructions.h>

 #include <llvm/IR/Value.h>

 #include <boost/algorithm/string/join.hpp>

 #include <boost/make_unique.hpp>


 #include <stack>

 #include <vector>


 extern bool g_enable_smem_group_by;

 extern bool g_bigint_count;


 struct ColRangeInfo {

   QueryDescriptionType hash_type_;

   int64_t min;

   int64_t max;

   int64_t bucket;

   bool has_nulls;

   bool isEmpty() const;

 };


 struct KeylessInfo {

   const bool keyless;

   const int32_t target_index;

 };


 class GroupByAndAggregate {

  public:

   GroupByAndAggregate(Executor* executor,

                       const ExecutorDeviceType device_type,

                       const RelAlgExecutionUnit& ra_exe_unit,

                       const std::vector<InputTableInfo>& query_infos,

                       std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,

                       const std::optional<int64_t>& group_cardinality_estimation);


   // returns true iff checking the error code after every row

   // is required -- slow path group by queries for now

   bool codegen(llvm::Value* filter_result,

                llvm::BasicBlock* sc_false,

                QueryMemoryDescriptor& query_mem_desc,

                const CompilationOptions& co,

                const GpuSharedMemoryContext& gpu_smem_context);


   static size_t shard_count_for_top_groups(const RelAlgExecutionUnit& ra_exe_unit);


  private:

   bool gpuCanHandleOrderEntries(const std::list<Analyzer::OrderEntry>& order_entries);


   ApproxQuantileDescriptors initApproxQuantileDescriptors();


   std::unique_ptr<QueryMemoryDescriptor> initQueryMemoryDescriptor(

       const bool allow_multifrag,

       const size_t max_groups_buffer_entry_count,

       const int8_t crt_min_byte_width,

       RenderInfo* render_info,

       const bool output_columnar_hint);


   std::unique_ptr<QueryMemoryDescriptor> initQueryMemoryDescriptorImpl(

       const bool allow_multifrag,

       const size_t max_groups_buffer_entry_count,

       const int8_t crt_min_byte_width,

       const bool sort_on_gpu_hint,

       RenderInfo* render_info,

       const bool must_use_baseline_sort,

       const bool output_columnar_hint);


   int64_t getShardedTopBucket(const ColRangeInfo& col_range_info,

                               const size_t shard_count) const;


   llvm::Value* codegenOutputSlot(llvm::Value* groups_buffer,

                                  const QueryMemoryDescriptor& query_mem_desc,

                                  const CompilationOptions& co,

                                  DiamondCodegen& diamond_codegen);


   std::tuple<llvm::Value*, llvm::Value*> codegenGroupBy(

       const QueryMemoryDescriptor& query_mem_desc,

       const CompilationOptions& co,

       DiamondCodegen& codegen);


   llvm::Value* codegenVarlenOutputBuffer(const QueryMemoryDescriptor& query_mem_desc);


   std::tuple<llvm::Value*, llvm::Value*> codegenSingleColumnPerfectHash(

       const QueryMemoryDescriptor& query_mem_desc,

       const CompilationOptions& co,

       llvm::Value* groups_buffer,

       llvm::Value* group_expr_lv_translated,

       llvm::Value* group_expr_lv_original,

       const int32_t row_size_quad);


   std::tuple<llvm::Value*, llvm::Value*> codegenMultiColumnPerfectHash(

       llvm::Value* groups_buffer,

       llvm::Value* group_key,

       llvm::Value* key_size_lv,

       const QueryMemoryDescriptor& query_mem_desc,

       const int32_t row_size_quad);

   llvm::Function* codegenPerfectHashFunction();


   std::tuple<llvm::Value*, llvm::Value*> codegenMultiColumnBaselineHash(

       const CompilationOptions& co,

       llvm::Value* groups_buffer,

       llvm::Value* group_key,

       llvm::Value* key_size_lv,

       const QueryMemoryDescriptor& query_mem_desc,

       const size_t key_width,

       const int32_t row_size_quad);


   ColRangeInfo getColRangeInfo();


   static int64_t getBucketedCardinality(const ColRangeInfo& col_range_info);


   llvm::Value* convertNullIfAny(const SQLTypeInfo& arg_type,

                                 const TargetInfo& agg_info,

                                 llvm::Value* target);


   bool codegenAggCalls(const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,

                        llvm::Value* varlen_output_buffer,

                        const std::vector<llvm::Value*>& agg_out_vec,

                        QueryMemoryDescriptor& query_mem_desc,

                        const CompilationOptions& co,

                        const GpuSharedMemoryContext& gpu_smem_context,

                        DiamondCodegen& diamond_codegen);


   llvm::Value* codegenWindowRowPointer(const Analyzer::WindowFunction* window_func,

                                        const QueryMemoryDescriptor& query_mem_desc,

                                        const CompilationOptions& co,

                                        DiamondCodegen& diamond_codegen);


   llvm::Value* codegenAggColumnPtr(

       llvm::Value* output_buffer_byte_stream,

       llvm::Value* out_row_idx,

       const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,

       const QueryMemoryDescriptor& query_mem_desc,

       const size_t chosen_bytes,

       const size_t agg_out_off,

       const size_t target_idx);


   void codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,

                         DiamondCodegen& diamond_codegen,

                         const QueryMemoryDescriptor& query_mem_desc,

                         const CompilationOptions&);


   void codegenCountDistinct(const size_t target_idx,

                             const Analyzer::Expr* target_expr,

                             std::vector<llvm::Value*>& agg_args,

                             const QueryMemoryDescriptor&,

                             const ExecutorDeviceType);


   void codegenApproxQuantile(const size_t target_idx,

                              const Analyzer::Expr* target_expr,

                              std::vector<llvm::Value*>& agg_args,

                              const QueryMemoryDescriptor& query_mem_desc,

                              const ExecutorDeviceType device_type);


   void codegenMode(const size_t target_idx,

                    const Analyzer::Expr* target_expr,

                    std::vector<llvm::Value*>& agg_args,

                    const QueryMemoryDescriptor& query_mem_desc,

                    const ExecutorDeviceType device_type);


   llvm::Value* getAdditionalLiteral(const int32_t off);


   std::vector<llvm::Value*> codegenAggArg(const Analyzer::Expr* target_expr,

                                           const CompilationOptions& co);


   llvm::Value* emitCall(const std::string& fname, const std::vector<llvm::Value*>& args);


   void checkErrorCode(llvm::Value* retCode);


   bool needsUnnestDoublePatch(llvm::Value const* val_ptr,

                               const std::string& agg_base_name,

                               const bool threads_share_memory,

                               const CompilationOptions& co) const;


   void prependForceSync();


   Executor* executor_;

   const RelAlgExecutionUnit& ra_exe_unit_;

   const std::vector<InputTableInfo>& query_infos_;

   std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner_;

   bool output_columnar_;

   const ExecutorDeviceType device_type_;


   const std::optional<int64_t> group_cardinality_estimation_;


   friend class Executor;

   friend class QueryMemoryDescriptor;

   friend class CodeGenerator;

   friend class ExecutionKernel;

   friend struct TargetExprCodegen;

   friend struct TargetExprCodegenBuilder;

 };


 inline size_t get_count_distinct_sub_bitmap_count(const size_t bitmap_sz_bits,

                                                   const RelAlgExecutionUnit& ra_exe_unit,

                                                   const ExecutorDeviceType device_type) {

   // For count distinct on a column with a very small number of distinct values

   // contention can be very high, especially for non-grouped queries. We'll split

   // the bitmap into multiple sub-bitmaps which are unified to get the full result.

   // The threshold value for bitmap_sz_bits works well on Kepler.

   return bitmap_sz_bits < 50000 && ra_exe_unit.groupby_exprs.empty() &&

                  (device_type == ExecutorDeviceType::GPU || g_cluster)

              ? 64  // NB: must be a power of 2 to keep runtime offset computations cheap

              : 1;

 }


 #endif  // QUERYENGINE_GROUPBYANDAGGREGATE_H

CompilationOptions
Definition: CompilationOptions.h:31

TargetExprCodegenBuilder::ra_exe_unit
const RelAlgExecutionUnit & ra_exe_unit
Definition: TargetExprBuilder.h:144

GroupByAndAggregate::initApproxQuantileDescriptors
ApproxQuantileDescriptors initApproxQuantileDescriptors()
Definition: GroupByAndAggregate.cpp:894

Analyzer::WindowFunction
Definition: Analyzer.h:2860

GroupByAndAggregate::Executor
friend class Executor
Definition: GroupByAndAggregate.h:219

g_enable_smem_group_by
bool g_enable_smem_group_by

GroupByAndAggregate::gpuCanHandleOrderEntries
bool gpuCanHandleOrderEntries(const std::list< Analyzer::OrderEntry > &order_entries)
Definition: GroupByAndAggregate.cpp:1004

GroupByAndAggregate::getBucketedCardinality
static int64_t getBucketedCardinality(const ColRangeInfo &col_range_info)
Definition: GroupByAndAggregate.cpp:356

GroupByAndAggregate::getAdditionalLiteral
llvm::Value * getAdditionalLiteral(const int32_t off)
Definition: GroupByAndAggregate.cpp:2032

GroupByAndAggregate::codegenAggColumnPtr
llvm::Value * codegenAggColumnPtr(llvm::Value *output_buffer_byte_stream, llvm::Value *out_row_idx, const std::tuple< llvm::Value *, llvm::Value * > &agg_out_ptr_w_idx, const QueryMemoryDescriptor &query_mem_desc, const size_t chosen_bytes, const size_t agg_out_off, const size_t target_idx)
: returns the pointer to where the aggregation should be stored.
Definition: GroupByAndAggregate.cpp:1744

run_benchmark_import.args
tuple args
Definition: run_benchmark_import.py:247

CodeGenerator
Definition: CodeGenerator.h:29

GroupByAndAggregate::codegen
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
Definition: GroupByAndAggregate.cpp:1047

KeylessInfo::keyless
const bool keyless
Definition: GroupByAndAggregate.h:57

GroupByAndAggregate::initQueryMemoryDescriptorImpl
std::unique_ptr< QueryMemoryDescriptor > initQueryMemoryDescriptorImpl(const bool allow_multifrag, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool sort_on_gpu_hint, RenderInfo *render_info, const bool must_use_baseline_sort, const bool output_columnar_hint)
Definition: GroupByAndAggregate.cpp:913

GpuSharedMemoryContext.h

ColRangeInfo::bucket
int64_t bucket
Definition: GroupByAndAggregate.h:51

GroupByAndAggregate::codegenMode
void codegenMode(const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: GroupByAndAggregate.cpp:1992

TargetInfo
Definition: TargetInfo.h:49

QueryMemoryDescriptor
Definition: QueryMemoryDescriptor.h:68

Analyzer::Expr
Definition: Analyzer.h:68

GroupByAndAggregate::getColRangeInfo
ColRangeInfo getColRangeInfo()
Definition: GroupByAndAggregate.cpp:218

ColRangeInfo::hash_type_
QueryDescriptionType hash_type_
Definition: GroupByAndAggregate.h:48

ExecutionKernel
Definition: ExecutionKernel.h:92

GroupByAndAggregate::emitCall
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value * > &args)
Definition: GroupByAndAggregate.cpp:2228

GroupByAndAggregate::row_set_mem_owner_
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: GroupByAndAggregate.h:213

BufferCompaction.h
Macros and functions for groupby buffer compaction.

GroupByAndAggregate::codegenVarlenOutputBuffer
llvm::Value * codegenVarlenOutputBuffer(const QueryMemoryDescriptor &query_mem_desc)
Definition: GroupByAndAggregate.cpp:1396

GroupByAndAggregate::codegenApproxQuantile
void codegenApproxQuantile(const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: GroupByAndAggregate.cpp:1951

GroupByAndAggregate::checkErrorCode
void checkErrorCode(llvm::Value *retCode)
Definition: GroupByAndAggregate.cpp:2234

ApproxQuantileDescriptors
std::vector< ApproxQuantileDescriptor > ApproxQuantileDescriptors
Definition: ApproxQuantileDescriptor.h:26

TargetExprCodegenBuilder
Definition: TargetExprBuilder.h:81

RelAlgExecutionUnit::groupby_exprs
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
Definition: RelAlgExecutionUnit.h:171

GroupByAndAggregate::needsUnnestDoublePatch
bool needsUnnestDoublePatch(llvm::Value const *val_ptr, const std::string &agg_base_name, const bool threads_share_memory, const CompilationOptions &co) const
Definition: MaxwellCodegenPatch.cpp:29

GroupByAndAggregate::codegenMultiColumnBaselineHash
std::tuple< llvm::Value *, llvm::Value * > codegenMultiColumnBaselineHash(const CompilationOptions &co, llvm::Value *groups_buffer, llvm::Value *group_key, llvm::Value *key_size_lv, const QueryMemoryDescriptor &query_mem_desc, const size_t key_width, const int32_t row_size_quad)
Definition: GroupByAndAggregate.cpp:1505

RuntimeFunctions.h

ExecutorDeviceType
ExecutorDeviceType
Definition: ExecutorDeviceType.h:23

GpuSharedMemoryContext
Definition: GpuSharedMemoryContext.h:20

ExecutorDeviceType::GPU

get_count_distinct_sub_bitmap_count
size_t get_count_distinct_sub_bitmap_count(const size_t bitmap_sz_bits, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
Definition: GroupByAndAggregate.h:227

ColRangeInfo::has_nulls
bool has_nulls
Definition: GroupByAndAggregate.h:52

DiamondCodegen.h

ColRangeInfo::isEmpty
bool isEmpty() const
Definition: GroupByAndAggregate.cpp:63

GroupByAndAggregate::GroupByAndAggregate
GroupByAndAggregate(Executor *executor, const ExecutorDeviceType device_type, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const std::optional< int64_t > &group_cardinality_estimation)
Definition: GroupByAndAggregate.cpp:394

ColRangeInfo::max
int64_t max
Definition: GroupByAndAggregate.h:50

RenderInfo
Definition: RenderInfo.h:31

GroupByAndAggregate::convertNullIfAny
llvm::Value * convertNullIfAny(const SQLTypeInfo &arg_type, const TargetInfo &agg_info, llvm::Value *target)
Definition: GroupByAndAggregate.cpp:1594

Logger.h

GroupByAndAggregate::codegenSingleColumnPerfectHash
std::tuple< llvm::Value *, llvm::Value * > codegenSingleColumnPerfectHash(const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, llvm::Value *groups_buffer, llvm::Value *group_expr_lv_translated, llvm::Value *group_expr_lv_original, const int32_t row_size_quad)
Definition: GroupByAndAggregate.cpp:1411

GroupByAndAggregate::codegenAggCalls
bool codegenAggCalls(const std::tuple< llvm::Value *, llvm::Value * > &agg_out_ptr_w_idx, llvm::Value *varlen_output_buffer, const std::vector< llvm::Value * > &agg_out_vec, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context, DiamondCodegen &diamond_codegen)
Definition: GroupByAndAggregate.cpp:1681

GroupByAndAggregate::codegenGroupBy
std::tuple< llvm::Value *, llvm::Value * > codegenGroupBy(const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &codegen)
Definition: GroupByAndAggregate.cpp:1273

ColRangeInfo::min
int64_t min
Definition: GroupByAndAggregate.h:49

g_bigint_count
bool g_bigint_count
Definition: GroupByAndAggregate.cpp:54

GroupByAndAggregate::codegenCountDistinct
void codegenCountDistinct(const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &, const ExecutorDeviceType)
Definition: GroupByAndAggregate.cpp:1881

GroupByAndAggregate::prependForceSync
void prependForceSync()
Definition: MaxwellCodegenPatch.cpp:40

GroupByAndAggregate::codegenMultiColumnPerfectHash
std::tuple< llvm::Value *, llvm::Value * > codegenMultiColumnPerfectHash(llvm::Value *groups_buffer, llvm::Value *group_key, llvm::Value *key_size_lv, const QueryMemoryDescriptor &query_mem_desc, const int32_t row_size_quad)
Definition: GroupByAndAggregate.cpp:1461

GroupByAndAggregate::output_columnar_
bool output_columnar_
Definition: GroupByAndAggregate.h:214

GroupByAndAggregate::getShardedTopBucket
int64_t getShardedTopBucket(const ColRangeInfo &col_range_info, const size_t shard_count) const
Definition: GroupByAndAggregate.cpp:425

KeylessInfo::target_index
const int32_t target_index
Definition: GroupByAndAggregate.h:58

GroupByAndAggregate::query_infos_
const std::vector< InputTableInfo > & query_infos_
Definition: GroupByAndAggregate.h:212

GroupByAndAggregate::executor_
Executor * executor_
Definition: GroupByAndAggregate.h:210

RenderInfo.h

GroupByAndAggregate::initQueryMemoryDescriptor
std::unique_ptr< QueryMemoryDescriptor > initQueryMemoryDescriptor(const bool allow_multifrag, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, RenderInfo *render_info, const bool output_columnar_hint)
Definition: GroupByAndAggregate.cpp:853

GroupByAndAggregate::device_type_
const ExecutorDeviceType device_type_
Definition: GroupByAndAggregate.h:215

GroupByAndAggregate::codegenEstimator
void codegenEstimator(std::stack< llvm::BasicBlock * > &array_loops, DiamondCodegen &diamond_codegen, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &)
Definition: GroupByAndAggregate.cpp:1817

GroupByAndAggregate::shard_count_for_top_groups
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit)
Definition: GroupByAndAggregate.cpp:2251

GroupByAndAggregate::codegenAggArg
std::vector< llvm::Value * > codegenAggArg(const Analyzer::Expr *target_expr, const CompilationOptions &co)
Definition: GroupByAndAggregate.cpp:2044

GroupByAndAggregate::codegenPerfectHashFunction
llvm::Function * codegenPerfectHashFunction()
Definition: GroupByAndAggregate.cpp:1541

GroupByAndAggregate::codegenWindowRowPointer
llvm::Value * codegenWindowRowPointer(const Analyzer::WindowFunction *window_func, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &diamond_codegen)
Definition: GroupByAndAggregate.cpp:1645

GroupByAndAggregate::group_cardinality_estimation_
const std::optional< int64_t > group_cardinality_estimation_
Definition: GroupByAndAggregate.h:217

GpuMemUtils.h

InputMetadata.h

GroupByAndAggregate::codegenOutputSlot
llvm::Value * codegenOutputSlot(llvm::Value *groups_buffer, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &diamond_codegen)
Definition: GroupByAndAggregate.cpp:1185

SQLTypeInfo
Definition: sqltypes.h:332

QueryExecutionContext.h

TargetExprCodegen
Definition: TargetExprBuilder.h:33

CompilationOptions.h

KeylessInfo
Definition: GroupByAndAggregate.h:56

g_cluster
bool g_cluster

ColumnarResults.h

GroupByAndAggregate::ra_exe_unit_
const RelAlgExecutionUnit & ra_exe_unit_
Definition: GroupByAndAggregate.h:211

ColRangeInfo
Definition: GroupByAndAggregate.h:47

DiamondCodegen
Definition: DiamondCodegen.h:30

query_mem_desc
query_mem_desc
Definition: QueryMemoryInitializer.cpp:479

GroupByAndAggregate
Definition: GroupByAndAggregate.h:61

RelAlgExecutionUnit
Definition: RelAlgExecutionUnit.h:165