OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GpuSharedMemoryUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <llvm/Transforms/Utils/Cloning.h>
18 
19 #include "CodegenHelper.h"
20 #include "GpuSharedMemoryUtils.h"
21 #include "ResultSetReductionJIT.h"
22 #include "RuntimeFunctions.h"
23 
25  llvm::Module* llvm_module,
26  llvm::LLVMContext& context,
27  const QueryMemoryDescriptor& qmd,
28  const std::vector<TargetInfo>& targets,
29  const std::vector<int64_t>& init_agg_values,
30  const size_t executor_id)
31  : executor_id_(executor_id)
32  , module_(llvm_module)
33  , context_(context)
34  , reduction_func_(nullptr)
35  , init_func_(nullptr)
36  , query_mem_desc_(qmd)
37  , targets_(targets)
38  , init_agg_values_(init_agg_values) {
53 }
54 
56  auto timer = DEBUG_TIMER(__func__);
57 
58  // codegen the init function
63 
64  // codegen the reduction function:
69 }
70 
  // Body of GpuSharedMemCodeBuilder::codegenReduction: emits the IR for
  // reduction_func_, which reduces each threadblock's shared-memory result
  // buffer into the global (device) output buffer, one entry per thread.
  // adding names to input arguments:
  auto arg_it = reduction_func_->arg_begin();
  auto dest_buffer_ptr = &*arg_it;
  dest_buffer_ptr->setName("dest_buffer_ptr");
  arg_it++;
  auto src_buffer_ptr = &*arg_it;
  src_buffer_ptr->setName("src_buffer_ptr");
  arg_it++;
  auto buffer_size = &*arg_it;
  buffer_size->setName("buffer_size");

  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", reduction_func_);
  auto bb_body = llvm::BasicBlock::Create(context_, ".body", reduction_func_);
  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", reduction_func_);
  llvm::IRBuilder<> ir_builder(bb_entry);

  // synchronize all threads within a threadblock, so every thread's
  // shared-memory entry is fully written before the reduction reads it:
  const auto sync_threadblock = getFunction("sync_threadblock");
  ir_builder.CreateCall(sync_threadblock, {});

  const auto func_thread_index = getFunction("get_thread_index");
  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");

  // branching out of out of bound: each thread handles one entry, so threads
  // with index >= entry count jump straight to the exit block.
  const auto entry_count = ll_int(query_mem_desc_.getEntryCount(), context_);
  const auto entry_count_i32 =
      ll_int(static_cast<int32_t>(query_mem_desc_.getEntryCount()), context_);
  const auto is_thread_inbound =
      ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);

  ir_builder.SetInsertPoint(bb_body);

  // cast src/dest buffers into byte streams:
  auto src_byte_stream = ir_builder.CreatePointerCast(
      src_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "src_byte_stream");
  const auto dest_byte_stream = ir_builder.CreatePointerCast(
      dest_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "dest_byte_stream");

  // running the result set reduction JIT code to get reduce_one_entry_idx function
  auto fixup_query_mem_desc = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
  // NOTE(review): extraction shows only these ctor arguments — confirm against
  // the GpuReductionHelperJIT declaration.
  auto rs_reduction_jit = std::make_unique<GpuReductionHelperJIT>(
      fixup_query_mem_desc,
      targets_,
      executor_id_);
  auto reduction_code = rs_reduction_jit->codegen();
  CHECK(reduction_code.module);
  // Retarget the JIT'ed reduction module for NVPTX so it can be linked into
  // the GPU module below.
  reduction_code.module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  reduction_code.module->setTargetTriple("nvptx64-nvidia-cuda");
  llvm::Linker linker(*module_);
  std::unique_ptr<llvm::Module> owner(reduction_code.module);
  bool link_error = linker.linkInModule(std::move(owner));
  CHECK(!link_error);

  // go through the reduction code and replace all occurances of agg functions
  // with their _shared counterparts, which are specifically used in GPUs
  auto reduce_one_entry_func = getFunction("reduce_one_entry");
  bool agg_func_found = true;
  // Restart the instruction scan after every replacement: ReplaceInstWithInst
  // invalidates the iterator, hence the break + outer while loop.
  while (agg_func_found) {
    agg_func_found = false;
    for (auto it = llvm::inst_begin(reduce_one_entry_func);
         it != llvm::inst_end(reduce_one_entry_func);
         it++) {
      if (!llvm::isa<llvm::CallInst>(*it)) {
        continue;
      }
      auto& func_call = llvm::cast<llvm::CallInst>(*it);
      auto const func_name = CodegenUtil::getCalledFunctionName(func_call);
      if (func_name) {
        std::string_view func_name_str = *func_name;
        if (func_name_str.substr(0, 4) == "agg_") {
          // already a *_shared variant; leave it alone
          if (func_name_str.substr(func_name_str.length() - 7) == "_shared") {
            continue;
          }
          agg_func_found = true;
          std::vector<llvm::Value*> args;
          args.reserve(func_call.getNumOperands());
          // the last operand of a CallInst is the callee; everything before it
          // is an argument
          for (size_t i = 0; i < func_call.getNumOperands() - 1; ++i) {
            args.push_back(func_call.getArgOperand(i));
          }
          auto gpu_agg_func = getFunction(std::string(func_name_str) + "_shared");
          llvm::ReplaceInstWithInst(&func_call,
                                    llvm::CallInst::Create(gpu_agg_func, args, ""));
          break;
        }
      }
    }
  }
  const auto reduce_one_entry_idx_func = getFunction("reduce_one_entry_idx");
  CHECK(reduce_one_entry_idx_func);

  // qmd_handles are only used with count distinct and baseline group by
  // serialized varlen buffer is only used with SAMPLE on varlen types, which we will
  // disable for current shared memory support.
  const auto null_ptr_ll =
      llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(context_, 0));
  const auto thread_idx_i32 = ir_builder.CreateCast(
      llvm::Instruction::CastOps::Trunc, thread_idx, get_int_type(32, context_));
  ir_builder.CreateCall(reduce_one_entry_idx_func,
                        {dest_byte_stream,
                         src_byte_stream,
                         thread_idx_i32,
                         entry_count_i32,
                         null_ptr_ll,
                         null_ptr_ll,
                         null_ptr_ll},
                        "");
  ir_builder.CreateBr(bb_exit);
  llvm::ReturnInst::Create(context_, bb_exit);
}
215 
216 namespace {
217 // given a particular destination ptr to the beginning of an entry, this function creates
218 // proper cast for a specific slot index.
219 // it also assumes these pointers are within shared memory address space (3)
220 llvm::Value* codegen_smem_dest_slot_ptr(llvm::LLVMContext& context,
222  llvm::IRBuilder<>& ir_builder,
223  const size_t slot_idx,
224  const TargetInfo& target_info,
225  llvm::Value* dest_byte_stream,
226  llvm::Value* byte_offset) {
227  const auto sql_type = get_compact_type(target_info);
228  const auto slot_bytes = query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
229  auto ptr_type = [&context](const size_t slot_bytes, const SQLTypeInfo& sql_type) {
230  if (slot_bytes == sizeof(int32_t)) {
231  return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
232  } else {
233  CHECK(slot_bytes == sizeof(int64_t));
234  return llvm::Type::getInt64PtrTy(context, /*address_space=*/3);
235  }
236  UNREACHABLE() << "Invalid slot size encountered: " << std::to_string(slot_bytes);
237  return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
238  };
239 
240  const auto casted_dest_slot_address = ir_builder.CreatePointerCast(
241  ir_builder.CreateGEP(
242  dest_byte_stream->getType()->getScalarType()->getPointerElementType(),
243  dest_byte_stream,
244  byte_offset),
245  ptr_type(slot_bytes, sql_type),
246  "dest_slot_adr_" + std::to_string(slot_idx));
247  return casted_dest_slot_address;
248 }
249 } // namespace
250 
  // Body of GpuSharedMemCodeBuilder::codegenInitialization: emits the IR for
  // init_func_, which declares the dynamic shared-memory buffer and has each
  // in-bound thread initialize the slots of its own entry with the
  // compile-time aggregate init values. Returns (in IR) the buffer pointer.
  CHECK(init_func_);
  // similar to the rest of the system, we used fixup QMD to be able to handle reductions
  // it should be removed in the future.
  auto fixup_query_mem_desc = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
  // Shared-memory initialization is only generated for row-wise, keyless-hash
  // layouts, with one init value available per target.
  CHECK(!fixup_query_mem_desc.didOutputColumnar());
  CHECK(fixup_query_mem_desc.hasKeylessHash());
  CHECK_GE(init_agg_values_.size(), targets_.size());

  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", init_func_);
  auto bb_body = llvm::BasicBlock::Create(context_, ".body", init_func_);
  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", init_func_);

  llvm::IRBuilder<> ir_builder(bb_entry);
  const auto func_thread_index = getFunction("get_thread_index");
  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");

  // declare dynamic shared memory:
  const auto declare_smem_func = getFunction("declare_dynamic_shared_memory");
  const auto shared_mem_buffer =
      ir_builder.CreateCall(declare_smem_func, {}, "shared_mem_buffer");

  // threads whose index is >= the entry count have nothing to initialize and
  // branch straight to the exit block:
  const auto entry_count = ll_int(fixup_query_mem_desc.getEntryCount(), context_);
  const auto is_thread_inbound =
      ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);

  ir_builder.SetInsertPoint(bb_body);
  // compute byte offset assigned to this thread:
  const auto row_size_bytes = ll_int(fixup_query_mem_desc.getRowWidth(), context_);
  auto byte_offset_ll = ir_builder.CreateMul(row_size_bytes, thread_idx, "byte_offset");

  const auto dest_byte_stream = ir_builder.CreatePointerCast(
      shared_mem_buffer, llvm::Type::getInt8PtrTy(context_), "dest_byte_stream");

  // each thread will be responsible for one entry: store the init value into
  // every slot of that entry, walking slots in order per target.
  const auto& col_slot_context = fixup_query_mem_desc.getColSlotContext();
  size_t init_agg_idx = 0;
  for (size_t target_logical_idx = 0; target_logical_idx < targets_.size();
       ++target_logical_idx) {
    const auto& target_info = targets_[target_logical_idx];
    const auto& slots_for_target = col_slot_context.getSlotsForCol(target_logical_idx);
    for (size_t slot_idx = slots_for_target.front(); slot_idx <= slots_for_target.back();
         slot_idx++) {
      const auto slot_size = fixup_query_mem_desc.getPaddedSlotWidthBytes(slot_idx);

      auto casted_dest_slot_address = codegen_smem_dest_slot_ptr(context_,
                                                                 fixup_query_mem_desc,
                                                                 ir_builder,
                                                                 slot_idx,
                                                                 target_info,
                                                                 dest_byte_stream,
                                                                 byte_offset_ll);

      // materialize the init value at the slot's padded width (4 or 8 bytes):
      llvm::Value* init_value_ll = nullptr;
      if (slot_size == sizeof(int32_t)) {
        init_value_ll =
            ll_int(static_cast<int32_t>(init_agg_values_[init_agg_idx++]), context_);
      } else if (slot_size == sizeof(int64_t)) {
        init_value_ll =
            ll_int(static_cast<int64_t>(init_agg_values_[init_agg_idx++]), context_);
      } else {
        UNREACHABLE() << "Invalid slot size encountered.";
      }
      ir_builder.CreateStore(init_value_ll, casted_dest_slot_address);

      // if not the last loop, we compute the next offset:
      if (slot_idx != (col_slot_context.getSlotCount() - 1)) {
        byte_offset_ll = ir_builder.CreateAdd(
            byte_offset_ll, ll_int(static_cast<size_t>(slot_size), context_));
      }
    }
  }

  ir_builder.CreateBr(bb_exit);

  ir_builder.SetInsertPoint(bb_exit);
  // synchronize all threads within a threadblock, so the fully-initialized
  // buffer is visible to every thread before any aggregation into it:
  const auto sync_threadblock = getFunction("sync_threadblock");
  ir_builder.CreateCall(sync_threadblock, {});
  ir_builder.CreateRet(shared_mem_buffer);
}
340 
342  std::vector<llvm::Type*> input_arguments;
343  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));
344  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));
345  input_arguments.push_back(llvm::Type::getInt32Ty(context_));
346 
347  llvm::FunctionType* ft =
348  llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
349  const auto reduction_function = llvm::Function::Create(
350  ft, llvm::Function::ExternalLinkage, "reduce_from_smem_to_gmem", module_);
351  return reduction_function;
352 }
353 
355  std::vector<llvm::Type*> input_arguments;
356  input_arguments.push_back(
357  llvm::Type::getInt64PtrTy(context_)); // a pointer to the buffer
358  input_arguments.push_back(llvm::Type::getInt32Ty(context_)); // buffer size in bytes
359 
360  llvm::FunctionType* ft = llvm::FunctionType::get(
361  llvm::Type::getInt64PtrTy(context_), input_arguments, false);
362  const auto init_function = llvm::Function::Create(
363  ft, llvm::Function::ExternalLinkage, "init_smem_func", module_);
364  return init_function;
365 }
366 
367 llvm::Function* GpuSharedMemCodeBuilder::getFunction(const std::string& func_name) const {
368  const auto function = module_->getFunction(func_name);
369  CHECK(function) << func_name << " is not found in the module.";
370  return function;
371 }
372 
373 namespace {
379 void replace_called_function_with(llvm::Function* main_func,
380  const std::string& target_func_name,
381  llvm::Function* replace_func) {
382  for (auto it = llvm::inst_begin(main_func), e = llvm::inst_end(main_func); it != e;
383  ++it) {
384  if (!llvm::isa<llvm::CallInst>(*it)) {
385  continue;
386  }
387  auto& instruction = llvm::cast<llvm::CallInst>(*it);
388  auto const inst_func_name = CodegenUtil::getCalledFunctionName(instruction);
389  if (inst_func_name && *inst_func_name == target_func_name) {
390  std::vector<llvm::Value*> args;
391  for (size_t i = 0; i < instruction.getNumOperands() - 1; ++i) {
392  args.push_back(instruction.getArgOperand(i));
393  }
394  llvm::ReplaceInstWithInst(&instruction,
395  llvm::CallInst::Create(replace_func, args, ""));
396  return;
397  }
398  }
399  UNREACHABLE() << "Target function " << target_func_name << " was not found in "
400  << replace_func->getName().str();
401 }
402 
403 } // namespace
404 
405 void GpuSharedMemCodeBuilder::injectFunctionsInto(llvm::Function* query_func) {
407  CHECK(init_func_);
408  replace_called_function_with(query_func, "init_shared_mem", init_func_);
409  replace_called_function_with(query_func, "write_back_nop", reduction_func_);
410 }
411 
414  CHECK(init_func_);
416 }
GroupByPerfectHash
Definition: enums.h:58
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
const QueryMemoryDescriptor query_mem_desc_
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
llvm::Function * createInitFunction() const
const SQLTypeInfo get_compact_type(const TargetInfo &target)
void verify_function_ir(const llvm::Function *func)
llvm::LLVMContext & context_
GpuSharedMemCodeBuilder(llvm::Module *module, llvm::LLVMContext &context, const QueryMemoryDescriptor &qmd, const std::vector< TargetInfo > &targets, const std::vector< int64_t > &init_agg_values, const size_t executor_id)
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::string toString() const
QueryDescriptionType getQueryDescriptionType() const
std::vector< int64_t > initialize_target_values_for_storage(const std::vector< TargetInfo > &targets)
const std::vector< int64_t > init_agg_values_
llvm::Value * codegen_smem_dest_slot_ptr(llvm::LLVMContext &context, const QueryMemoryDescriptor &query_mem_desc, llvm::IRBuilder<> &ir_builder, const size_t slot_idx, const TargetInfo &target_info, llvm::Value *dest_byte_stream, llvm::Value *byte_offset)
std::string serialize_llvm_object(const T *llvm_obj)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:766
const std::vector< TargetInfo > targets_
void replace_called_function_with(llvm::Function *main_func, const std::string &target_func_name, llvm::Function *replace_func)
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
__device__ void sync_threadblock()
llvm::Function * getFunction(const std::string &func_name) const
void injectFunctionsInto(llvm::Function *query_func)
llvm::Function * reduction_func_
llvm::Function * createReductionFunction() const