#if LLVM_VERSION_MAJOR < 9
static_assert(false, "LLVM Version >= 9 is required.");
#endif
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TypeBasedAliasAnalysis.h>
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IRReader/IRReader.h>
#if 14 <= LLVM_VERSION_MAJOR
#include <llvm/MC/TargetRegistry.h>
#else
#include <llvm/Support/TargetRegistry.h>
#endif
#include <llvm/Support/Casting.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/FormattedStream.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/InferFunctionAttrs.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Transforms/Utils/Cloning.h>
#if LLVM_VERSION_MAJOR >= 11
#include <llvm/Support/Host.h>
#endif
using heavyai::ErrorCode;

#include <llvm/Support/DynamicLibrary.h>

extern std::unique_ptr<std::string> g_libgeos_so_filename;

static llvm::sys::DynamicLibrary geos_dynamic_library;
static std::mutex geos_init_mutex;
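// Lazily loads the GEOS shared library exactly once per process; the mutex
// guards against concurrent queries racing on the dlopen, and an unset or
// empty filename falls back to the default 'libgeos_c.so'.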
void load_geos_dynamic_library() {
  std::lock_guard<std::mutex> guard(geos_init_mutex);

  if (!geos_dynamic_library.isValid()) {
    if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
      LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
      g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
    }
    auto filename = *g_libgeos_so_filename;
    std::string error_message;
    geos_dynamic_library =
        llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
    if (!geos_dynamic_library.isValid()) {
      std::string exception_message = "Failed to load GEOS library: " + error_message;
      throw std::runtime_error(exception_message);
    }
  }
}
                         std::string src = "",
                         const bool is_gpu = false) {
  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
  llvm::raw_string_ostream ss(excname);
  parse_error.print(src.c_str(), ss, false, false);
  // ...
}
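// Debug-only helpers: the SHOW_DEFINED/SHOW_FUNCTIONS macros below print the
// call site (function name and line) followed by the contents of the given
// module, delegating to the show_defined/show_functions helpers.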
#define SHOW_DEFINED(MODULE)                                         \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_defined(MODULE);                                          \
  }

#define SHOW_FUNCTIONS(MODULE)                                       \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_functions(MODULE);                                        \
  }
template <typename T = void>
void show_defined(llvm::Module& llvm_module) {
  std::cout << "defines: ";
  for (auto& f : llvm_module.getFunctionList()) {
    if (!f.isDeclaration()) {
      std::cout << f.getName().str() << ", ";
    }
  }
  std::cout << std::endl;
}
template <typename T = void>
void show_defined(llvm::Module* llvm_module) {
  if (llvm_module == nullptr) {
    std::cout << "is null" << std::endl;
  } else {
    show_defined(*llvm_module);
  }
}
template <typename T = void>
void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
  show_defined(llvm_module.get());
}
template <typename T = void>
void scan_function_calls(llvm::Function& F,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
    if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
      auto* F2 = CI->getCalledFunction();
      if (F2 == nullptr) {
        continue;
      }
      auto F2name = F2->getName().str();
      if (F2->isDeclaration()) {
        if (F2name.rfind("__", 0) != 0        // assume symbols with "__" prefix are defined
            && F2name.rfind("llvm.", 0) != 0  // LLVM intrinsics are always defined
            && ignored.find(F2name) == ignored.end()) {
          undefined.emplace(F2name);
        }
      } else {
        if (defined.find(F2name) == defined.end()) {
          defined.emplace(F2name);
          scan_function_calls<T>(*F2, defined, undefined, ignored);
        }
      }
    }
  }
}
template <typename T = void>
void scan_function_calls(llvm::Module& llvm_module,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (auto& F : llvm_module) {
    if (!F.isDeclaration()) {
      scan_function_calls(F, defined, undefined, ignored);
    }
  }
}
template <typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
scan_function_calls(llvm::Module& llvm_module,
                    const std::unordered_set<std::string>& ignored = {}) {
  std::unordered_set<std::string> defined, undefined;
  scan_function_calls(llvm_module, defined, undefined, ignored);
  return std::make_tuple(defined, undefined);
}
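// Hypothetical usage sketch (names illustrative): partition everything
// transitively reachable from a module's definitions into defined vs.
// still-undefined callees, ignoring a known allow-list:
//
//   auto [defined, undefined] =
//       scan_function_calls(*llvm_module, /*ignored=*/{"pos_start_impl"});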
#if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
void eliminate_dead_self_recursive_funcs(
    llvm::Module& M,
    const std::unordered_set<llvm::Function*>& live_funcs) {
  std::vector<llvm::Function*> dead_funcs;
  for (auto& F : M) {
    bool bAlive = false;
    if (live_funcs.count(&F)) {
      continue;
    }
    // a function is alive if it has any user other than a self-recursive call
    for (auto U : F.users()) {
      auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
      if (!C || C->getParent()->getParent() != &F) {
        bAlive = true;
        break;
      }
    }
    if (!bAlive) {
      dead_funcs.push_back(&F);
    }
  }
  for (auto pFn : dead_funcs) {
    pFn->eraseFromParent();
  }
}
#endif
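// CUDA's libdevice bitcode exposes its math routines under the "__nv_" prefix
// (e.g. __nv_exp), so the presence of any such symbol is a reliable signal
// that the module must be linked against libdevice before PTX generation.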
bool check_module_requires_libdevice(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    if (F.hasName() && F.getName().startswith("__nv_")) {
      LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
      return true;
    }
  }
  LOG(DEBUG1) << "Module does not require linking against libdevice";
  return false;
}
void add_intrinsics_to_module(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    for (llvm::Instruction& I : instructions(F)) {
      if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
          llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
          llvm::Function& decl_fn =
              *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
          ii->setCalledFunction(&decl_fn);
        } else {
          llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
        }
      }
    }
  }
}
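// Pass ordering note for optimize_ir below: the verifier runs first so that
// malformed IR fails fast, always-inline expansion happens before the scalar
// passes, and jump threading is skipped when GPU shared memory is in use,
// since reordering jumps around barrier-synchronized code can introduce races.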
void optimize_ir(llvm::Function* query_func,
                 llvm::Module* llvm_module,
                 llvm::legacy::PassManager& pass_manager,
                 const std::unordered_set<llvm::Function*>& live_funcs,
                 const bool is_gpu_smem_used,
                 const CompilationOptions& co) {
  pass_manager.add(llvm::createVerifierPass());
  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());

  pass_manager.add(llvm::createSROAPass());
  pass_manager.add(llvm::createEarlyCSEPass(/*UseMemorySSA=*/true));

  if (!is_gpu_smem_used) {
    pass_manager.add(llvm::createJumpThreadingPass());
  }
  pass_manager.add(llvm::createCFGSimplificationPass());

  pass_manager.add(llvm::createNewGVNPass());

  pass_manager.add(llvm::createDeadStoreEliminationPass());
  pass_manager.add(llvm::createLICMPass());

  pass_manager.add(llvm::createInstructionCombiningPass());

  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
  pass_manager.add(llvm::createGlobalOptimizerPass());

  pass_manager.add(llvm::createCFGSimplificationPass());

  pass_manager.run(*llvm_module);

  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
}
    : execution_engine_(execution_engine) {}

    : execution_engine_(execution_engine) {
  // ...
#ifdef ENABLE_INTEL_JIT_LISTENER
  // ...
  LOG(INFO) << "Registered IntelJITEventListener";
#else
  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
                  "listener configuration parameter.";
#endif  // ENABLE_INTEL_JIT_LISTENER
  // ...
}
    llvm::ExecutionEngine* execution_engine) {

  std::stringstream err_ss;
  llvm::raw_os_ostream err_os(err_ss);
  err_os << "\n-----\n";
  if (llvm::verifyFunction(*func, &err_os)) {
    err_os << "\n-----\n";
    func->print(err_os, nullptr);
    err_os << "\n-----\n";
    // ...
  }
    llvm::Module* llvm_module) {
  llvm::legacy::PassManager pass_manager;
  auto cpu_target_machine = execution_engine->getTargetMachine();
  CHECK(cpu_target_machine);
  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream os(code_str);
#if LLVM_VERSION_MAJOR >= 10
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
#else
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  pass_manager.run(*llvm_module);
  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
}
    llvm::EngineBuilder& eb,
  // ...
  CHECK(execution_engine.get());
  llvm_module->setDataLayout(execution_engine->getDataLayout());
  // ...
  execution_engine->finalizeObject();
  return execution_engine;
}
    llvm::Function* func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  llvm::Module* llvm_module = func->getParent();
  // run optimizations
#ifndef WITH_JIT_DEBUG
  llvm::legacy::PassManager pass_manager;
  optimize_ir(
      func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
#endif  // WITH_JIT_DEBUG

  auto init_err = llvm::InitializeNativeTarget();
  CHECK(!init_err);

  llvm::InitializeAllTargetMCs();
  llvm::InitializeNativeTargetAsmPrinter();
  llvm::InitializeNativeTargetAsmParser();

  std::string err_str;
  std::unique_ptr<llvm::Module> owner(llvm_module);

  llvm::EngineBuilder eb(std::move(owner));
  eb.setErrorStr(&err_str);
  eb.setEngineKind(llvm::EngineKind::JIT);
  llvm::TargetOptions to;
  to.EnableFastISel = true;
  eb.setTargetOptions(to);
  // ...
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  // ...
  llvm::Module* M = query_func->getParent();
  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
      M->getModuleFlag("manage_memory_buffer"));
  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
      M->getFunction("register_buffer_with_executor_rsm")) {
    LOG(INFO) << "including executor addr to cache key\n";
    // ...
  }
  if (cgen_state_->filter_func_) {
    // ...
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    // ...
  }
  if (cgen_state_->needs_geos_) {
#ifdef ENABLE_GEOS
    auto llvm_module = multifrag_query_func->getParent();
    load_geos_dynamic_library();

    // Bind the GEOS runtime module into the query module, cloning only
    // the externally usable functions
    auto rt_geos_module_copy = llvm::CloneModule(
        *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
          auto func = llvm::dyn_cast<llvm::Function>(gv);
          if (!func) {
            return true;
          }
          switch (func->getLinkage()) {
            case llvm::GlobalValue::LinkageTypes::InternalLinkage:
            case llvm::GlobalValue::LinkageTypes::PrivateLinkage:
            case llvm::GlobalValue::LinkageTypes::ExternalLinkage:
            case llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage:
              return true;
            default:
              return false;
          }
        });
    CodeGenerator::link_udf_module(rt_geos_module_copy,
                                   *llvm_module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::LinkOnlyNeeded);
#else
    throw std::runtime_error("GEOS is disabled in this build");
#endif
  }

  auto execution_engine =
      CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
  auto cpu_compilation_context =
      std::make_shared<CpuCompilationContext>(std::move(execution_engine));
  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
  // ...
    const std::unique_ptr<llvm::Module>& udf_module,
    llvm::Module& llvm_module,
    CgenState* cgen_state,
    llvm::Linker::Flags flags) {
  // throw a runtime error if the target module contains functions
  // with the same name as in the module of UDF functions
  for (auto& f : *udf_module) {
    auto func = llvm_module.getFunction(f.getName());
    if (func != nullptr && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
      LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
                 << llvm_module.getModuleIdentifier() << " from `"
                 << udf_module->getModuleIdentifier() << "`" << std::endl;
      throw std::runtime_error(
          "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
          "function ***");
    } else {
      VLOG(1) << " Adding " << f.getName().str() << " to "
              << llvm_module.getModuleIdentifier() << " from `"
              << udf_module->getModuleIdentifier() << "`" << std::endl;
    }
  }

  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);

  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());

  // Initialize linker with module for RuntimeFunctions.bc
  llvm::Linker ld(llvm_module);
  bool link_error = false;

  link_error = ld.linkInModule(std::move(udf_module_copy), flags);

  if (link_error) {
    throw std::runtime_error("link_udf_module: *** error linking module ***");
  }
}
std::string cpp_to_llvm_name(const std::string& s) {
  if (s == "int8_t") {
    return "i8";
  }
  if (s == "int16_t") {
    return "i16";
  }
  if (s == "int32_t") {
    return "i32";
  }
  if (s == "int64_t") {
    return "i64";
  }
  CHECK(s == "float" || s == "double");
  return s;
}
std::string gen_array_any_all_sigs() {
  std::string result;
  for (const std::string any_or_all : {"any", "all"}) {
    for (const std::string elem_type :
         {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
      for (const std::string needle_type :
           {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
        for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
          result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
                     "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
                     ", " + cpp_to_llvm_name(needle_type) + ");\n");
        }
      }
    }
  }
  return result;
}
std::string gen_translate_null_key_sigs() {
  std::string result;
  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
    const auto key_llvm_type = cpp_to_llvm_name(key_type);
    result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
              key_llvm_type + ", i64);\n";
  }
  return result;
}
const std::string cuda_rt_decls = R"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare double @llvm.fmuladd.f64(double, double, double)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64, i32);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_count_if_shared(i64*, i64);
declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_if_int32_shared(i32*, i32);
declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_if_shared(i64*, i64, i8);
declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8);
declare i32 @agg_sum_if_int32_shared(i32*, i32, i8);
declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8);
declare void @agg_sum_if_double_shared(i64*, double, i8);
declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8);
declare void @agg_sum_if_float_shared(i32*, float, i8);
declare void @agg_sum_if_float_skip_val_shared(i32*, float, float, i8);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @ExtractTimeFromHPTimestamp(i64,i64);
declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64);
declare i64 @ExtractTimeFromLPTimestamp(i64);
declare i64 @ExtractTimeFromLPTimestampNullable(i64,i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare {i8*,i64} @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @array_size_1_nullable(i8*, i64, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i64 @determine_fixed_array_len(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare double @width_bucket(double, double, double, double, i32);
declare double @width_bucket_reverse(double, double, double, double, i32);
declare double @width_bucket_nullable(double, double, double, double, i32, double);
declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double);
declare double @width_bucket_no_oob_check(double, double, double);
declare double @width_bucket_reverse_no_oob_check(double, double, double);
declare double @width_bucket_expr(double, i1, double, double, i32);
declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double);
declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32, i8);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
declare double @decompress_x_coord_geoint(i32);
declare double @decompress_y_coord_geoint(i32);
declare i32 @compress_x_coord_geoint(double);
declare i32 @compress_y_coord_geoint(double);
declare i64 @fixed_width_date_encode(i64, i32, i64);
declare i64 @fixed_width_date_decode(i64, i32, i64);
)" + gen_array_any_all_sigs() +
bool check_any_operand_is_stacksave_intrinsic(llvm::Instruction& inst) {
  for (auto op_it = inst.op_begin(); op_it != inst.op_end(); op_it++) {
    if (const llvm::IntrinsicInst* inst2 = llvm::dyn_cast<llvm::IntrinsicInst>(*op_it)) {
      if (inst2->getIntrinsicID() == llvm::Intrinsic::stacksave) {
        return true;
      }
    }
  }
  return false;
}
std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
  // ...
}
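// NVVM does not accept llvm.stacksave / llvm.stackrestore or the
// llvm.lifetime.* markers (nor PHI nodes consuming a stacksave result), so
// legalize_nvvm_ir strips them before PTX generation; stackrestore uses are
// erased first because they consume the stacksave results.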
void legalize_nvvm_ir(llvm::Function* query_func) {
  // ...
  std::vector<llvm::Instruction*> stackrestore_intrinsics;
  std::vector<llvm::Instruction*> stacksave_intrinsics;
  std::vector<llvm::Instruction*> lifetime;
  for (auto& BB : *query_func) {
    for (llvm::Instruction& I : BB) {
      if (llvm::dyn_cast<llvm::PHINode>(&I)) {
        if (check_any_operand_is_stacksave_intrinsic(I)) {
          stacksave_intrinsics.push_back(&I);
          VLOG(2) << "Remove PHI node having llvm::stacksave intrinsic as its operand";
        }
      } else if (const llvm::IntrinsicInst* II =
                     llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
          stacksave_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
          stackrestore_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
                   II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
          lifetime.push_back(&I);
        }
      }
    }
  }

  // stackrestore intrinsics use the stacksave results as their arguments,
  // so they must be removed first
  for (auto& II : stackrestore_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : stacksave_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : lifetime) {
    II->eraseFromParent();
  }
}
  return llvm::StringRef("nvptx64-nvidia-cuda");

  return llvm::StringRef(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  std::map<std::string, std::string> result;

  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
  result.insert(
      std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
  std::string sizeof_types;
  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
  // ...

  result.insert(std::make_pair("type_sizeof", sizeof_types));
  std::string null_values;
  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
  null_values +=
      "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
  null_values +=
      "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
  null_values +=
      "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
  null_values +=
      "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
  null_values +=
      "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";

  result.insert(std::make_pair("null_values", null_values));
  llvm::StringMap<bool> cpu_features;
  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
    std::string features_str = "";
    for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
      features_str += (it->getValue() ? " +" : " -");
      features_str += it->getKey().str();
    }
    result.insert(std::make_pair("cpu_features", features_str));
  }

  result.insert(std::make_pair("llvm_version",
                               std::to_string(LLVM_VERSION_MAJOR) + "." +
                                   std::to_string(LLVM_VERSION_MINOR) + "." +
                                   std::to_string(LLVM_VERSION_PATCH)));
#ifdef HAVE_CUDA
  int device_count = 0;
  checkCudaErrors(cuDeviceGetCount(&device_count));
  if (device_count > 0) {
    CUdevice device{};
    char device_name[256];
    int major = 0, minor = 0;
    int driver_version{0};
    checkCudaErrors(cuDeviceGet(&device, 0));  // assumes a homogeneous multi-GPU system
    checkCudaErrors(cuDeviceGetName(device_name, 256, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
    checkCudaErrors(cuDriverGetVersion(&driver_version));

    result.insert(std::make_pair("gpu_name", device_name));
    result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
    result.insert(std::make_pair("gpu_compute_capability",
                                 std::to_string(major) + "." + std::to_string(minor)));
    result.insert(std::make_pair("gpu_driver",
                                 "CUDA " + std::to_string(driver_version / 1000) + "." +
                                     std::to_string((driver_version % 1000) / 10)));
    result.insert(
        std::make_pair("gpu_has_libdevice",
                       std::to_string(gpu_has_libdevice)));  // flag computed above (elided)
  }
#endif  // HAVE_CUDA

  return result;
}
std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots) {
  std::queue<llvm::Function*> queue;
  std::unordered_set<llvm::Function*> visited;
  for (llvm::Function* F : roots) {
    queue.push(F);
  }
  while (!queue.empty()) {
    llvm::Function* F = queue.front();
    queue.pop();
    if (visited.find(F) != visited.end()) {
      continue;
    }
    visited.insert(F);
    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
      if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
        if (CI->isInlineAsm()) {
          continue;
        }
        llvm::Function* called = CI->getCalledFunction();
        if (!called || visited.find(called) != visited.end()) {
          continue;
        }
        queue.push(called);
      }
    }
  }
  return visited;
}
    llvm::Module& llvm_module,
    llvm::PassManagerBuilder& pass_manager_builder,
    const GPUTarget& gpu_target) {
  if (!executor->has_libdevice_module()) {
    throw std::runtime_error(
        "libdevice library is not available but required by the UDF module");
  }

  // save the functions defined in the module before linking
  std::vector<llvm::Function*> roots;
  for (llvm::Function& fn : llvm_module) {
    if (!fn.isDeclaration()) {
      roots.emplace_back(&fn);
    }
  }

  // bind libdevice to the current module
  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
                                 llvm_module,
                                 gpu_target.cgen_state,
                                 llvm::Linker::Flags::OverrideFromSrc);

  std::unordered_set<llvm::Function*> live_funcs =
      findAliveRuntimeFuncs(llvm_module, roots);

  std::vector<llvm::Function*> funcs_to_delete;
  for (llvm::Function& fn : llvm_module) {
    if (!live_funcs.count(&fn)) {
      funcs_to_delete.emplace_back(&fn);
    }
  }

  for (llvm::Function* f : funcs_to_delete) {
    f->eraseFromParent();
  }

  // activate the nvvm-reflect-ftz flag on the module
#if LLVM_VERSION_MAJOR >= 11
  llvm::LLVMContext& ctx = llvm_module.getContext();
  llvm_module.setModuleFlag(llvm::Module::Override,
                            "nvvm-reflect-ftz",
                            llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                llvm::Type::getInt32Ty(ctx), uint32_t(1))));
#else
  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
#endif
  for (llvm::Function& fn : llvm_module) {
    fn.addFnAttr("nvptx-f32ftz", "true");
  }

  // run the NVVMReflect pass, replacing reflect conditionals with constants
  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
  llvm::legacy::FunctionPassManager FPM(&llvm_module);
  pass_manager_builder.populateFunctionPassManager(FPM);

  FPM.doInitialization();
  for (auto& F : llvm_module) {
    FPM.run(F);
  }
  FPM.doFinalization();
}
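// The "nvvm-reflect-ftz" module flag feeds the NVVMReflect pass run above:
// setting it to 1 resolves libdevice's __nvvm_reflect() conditionals to the
// flush-to-zero variants of the math routines, matching the per-function
// "nvptx-f32ftz" attribute.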
    Executor* executor,
    llvm::Function* func,
    llvm::Function* wrapper_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const bool is_gpu_smem_used,
    const CompilationOptions& co,
    const GPUTarget& gpu_target) {
  auto llvm_module = func->getParent();
  // ...
  CHECK(gpu_target.cgen_state->module_ == llvm_module);
  CHECK(func->getParent() == wrapper_func->getParent());
  llvm_module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
  CHECK(gpu_target.nvptx_target_machine);
  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();

  pass_manager_builder.OptLevel = 0;
  llvm::legacy::PassManager module_pass_manager;
  pass_manager_builder.populateModulePassManager(module_pass_manager);

  bool requires_libdevice = check_module_requires_libdevice(llvm_module);

  if (requires_libdevice) {
    // ...
  }

  // run optimizations
  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
  legalize_nvvm_ir(func);

  std::stringstream ss;
  llvm::raw_os_ostream os(ss);

  llvm::LLVMContext& ctx = llvm_module->getContext();
  // get the "nvvm.annotations" metadata node
  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
                               llvm::MDString::get(ctx, "kernel"),
                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                   llvm::Type::getInt32Ty(ctx), 1))};

  // append the metadata to nvvm.annotations
  md->addOperand(llvm::MDNode::get(ctx, md_vals));

  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
  if (gpu_target.row_func_not_inlined) {
    // ...
    roots.insert(gpu_target.cgen_state->row_func_);
    if (gpu_target.cgen_state->filter_func_) {
      roots.insert(gpu_target.cgen_state->filter_func_);
    }
  }

  // prevent helper functions from being removed
  for (auto f : gpu_target.cgen_state->helper_functions_) {
    roots.insert(f);
  }

  if (requires_libdevice) {
    for (llvm::Function& F : *llvm_module) {
      // libdevice calls several "__internal"-prefixed helpers which carry a
      // "noinline" attribute; keep them as roots so they are not stripped
      if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
        roots.insert(&F);
      }
      legalize_nvvm_ir(&F);
    }
  }

  // prevent the UDF function(s) from being removed the way runtime functions are
  std::unordered_set<std::string> udf_declarations;

  if (executor->has_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);
        // note UDFs that declare an external function, to avoid duplicate declarations
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  std::vector<llvm::Function*> rt_funcs;
  for (auto& Fn : *llvm_module) {
    if (roots.count(&Fn)) {
      continue;
    }
    rt_funcs.push_back(&Fn);
  }
  for (auto& pFn : rt_funcs) {
    pFn->removeFromParent();
  }

  if (requires_libdevice) {
    add_intrinsics_to_module(llvm_module);
  }

  if (!llvm_module->getModuleFlag("Debug Info Version")) {
    llvm_module->addModuleFlag(
        llvm::Module::Error, "Debug Info Version", llvm::DEBUG_METADATA_VERSION);
  }

  llvm_module->print(os, nullptr);
  os.flush();

  for (auto& pFn : rt_funcs) {
    llvm_module->getFunctionList().push_back(pFn);
  }
  llvm_module->eraseNamedMetadata(md);

  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
  std::string ptx;
  try {
    ptx = generatePTX(
        cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
  } catch (ParseIRError& e) {
    LOG(WARNING) << "Failed to generate PTX: " << e.what()
                 << ". Switching to CPU execution target.";
    throw QueryMustRunOnCpu();
  }
  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";

  // JIT the PTX to a cubin via the CUDA driver (helper elided above)
  auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr);
  // ...
  auto func_name = wrapper_func->getName().str();
  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
       ++device_id) {
    gpu_compilation_context->addDeviceCode(
        std::make_unique<GpuDeviceCompilationContext>(cubin_result.cubin,
                                                      // ...
                                                      gpu_target.cuda_mgr,
                                                      // ...
                                                      device_id));
  }
  // ...
  return gpu_compilation_context;
}
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    std::unordered_set<llvm::Function*>& live_funcs,
    const bool no_inline,
    const CudaMgr_Namespace::CudaMgr* cuda_mgr,
    const bool is_gpu_smem_used,
    const CompilationOptions& co) {
  // ...
  if (cgen_state_->filter_func_) {
    // ...
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    // ...
  }

  bool row_func_not_inlined = false;
  if (no_inline) {
    for (auto it = llvm::inst_begin(cgen_state_->row_func_),
              e = llvm::inst_end(cgen_state_->row_func_);
         it != e;
         ++it) {
      if (llvm::isa<llvm::CallInst>(*it)) {
        auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
        auto const func_name = CodegenUtil::getCalledFunctionName(get_gv_call);
        if (func_name &&
            (*func_name == "array_size" || *func_name == "linear_probabilistic_count")) {
          row_func_not_inlined = true;
          break;
        }
      }
    }
  }

  initializeNVPTXBackend();
  CodeGenerator::GPUTarget gpu_target{
      nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
  std::shared_ptr<GpuCompilationContext> compilation_context;

  try {
    compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                               query_func,
                                                               multifrag_query_func,
                                                               live_funcs,
                                                               is_gpu_smem_used,
                                                               co,
                                                               gpu_target);
  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
    if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
      // thrown if memory could not be allocated on the GPU;
      // retry once after evicting a portion of the code cache
      auto const num_entries_to_evict =
          code_cache_accessor->computeNumEntriesToEvict(g_fraction_code_cache_to_evict);
      code_cache_accessor->evictEntries(num_entries_to_evict);
      compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                                 query_func,
                                                                 multifrag_query_func,
                                                                 live_funcs,
                                                                 is_gpu_smem_used,
                                                                 co,
                                                                 gpu_target);
    } else {
      throw;
    }
  }
  // ...
std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
                                       llvm::TargetMachine* nvptx_target_machine,
                                       llvm::LLVMContext& context) {
  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);

  llvm::SMDiagnostic parse_error;

  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
  if (!llvm_module) {
    LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
    // ...
  }

  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream formatted_os(code_str);
  CHECK(nvptx_target_machine);

  // build up all of the passes that we want to run on the module
  llvm::legacy::PassManager ptxgen_pm;
  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());

#if LLVM_VERSION_MAJOR >= 10
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
#else
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  ptxgen_pm.run(*llvm_module);

#if LLVM_VERSION_MAJOR >= 11
  return std::string(code_str);
#else
  return code_str.str();
#endif
}
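// The PTX produced here is plain assembly text; it is handed to the CUDA
// driver to be JIT-compiled into a cubin and loaded per device (see the
// GpuDeviceCompilationContext construction in the GPU code generation path).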
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();

  std::string err;
  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
  if (!target) {
    LOG(FATAL) << err;
  }
  return std::unique_ptr<llvm::TargetMachine>(
      target->createTargetMachine("nvptx64-nvidia-cuda",
                                  // ...
                                  llvm::TargetOptions(),
                                  llvm::Reloc::Static));
}

  return CodeGenerator::generatePTX(
      cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
}

  if (nvptx_target_machine_) {
    return;
  }
  const auto arch = cudaMgr()->getDeviceArch();
  {"query_stub_hoisted_literals",
   "multifrag_query_hoisted_literals",
   // ...
   "fixed_width_int_decode",
   "fixed_width_unsigned_decode",
   "diff_fixed_width_int_decode",
   "fixed_width_double_decode",
   "fixed_width_float_decode",
   "fixed_width_small_date_decode",
   "record_error_code",
   // ...
   "group_buff_idx_impl",
   // ...
   "init_shared_mem_nop",
   // ...
  };
  auto const candidate_func_name = func->getName().str();
  return std::any_of(/* range over the list above */,
                     [candidate_func_name](std::string_view func_name) {
                       return candidate_func_name == func_name;
                     });
    const std::string& bc_filename,
    llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  CHECK(owner->get());
  return std::move(owner.get());
}
    const std::string& udf_ir_filename,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
  // ...
  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(WARNING)
          << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
          << gpu_triple.str() << ". Disabling the NVVM IR module.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
    const std::string& udf_ir_string,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
                                                     "Runtime UDF/UDTF LLVM/NVVM IR");

  auto owner = llvm::parseIR(*buf, parse_error, ctx);
  if (!owner) {
    LOG(IR) << "read_llvm_module_from_ir_string:\n"
            << udf_ir_string << "\nEnd of LLVM/NVVM IR";
    // ...
  }
  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(IR) << "read_llvm_module_from_ir_string:\n"
              << udf_ir_string << "\nEnd of NVVM IR";
      LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
                   << gpu_triple.str()
                   << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
void bind_pos_placeholders(const std::string& pos_fn_name,
                           const bool use_resume_param,
                           llvm::Function* query_func,
                           llvm::Module* llvm_module) {
  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& pos_call = llvm::cast<llvm::CallInst>(*it);
    auto const func_name = CodegenUtil::getCalledFunctionName(pos_call);
    if (func_name && *func_name == pos_fn_name) {
      if (use_resume_param) {
        auto* const row_index_resume = get_arg_by_name(query_func, "row_index_resume");
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
                                   row_index_resume));
      } else {
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
      }
      break;
    }
  }
}
void set_row_func_argnames(llvm::Function* row_func,
                           const size_t in_col_count,
                           const size_t agg_col_count,
                           const bool hoist_literals) {
  auto arg_it = row_func->arg_begin();

  if (agg_col_count) {
    for (size_t i = 0; i < agg_col_count; ++i) {
      arg_it->setName("out");
      ++arg_it;
    }
  } else {
    arg_it->setName("group_by_buff");
    ++arg_it;
    arg_it->setName("varlen_output_buff");
    ++arg_it;
    arg_it->setName("crt_matched");
    ++arg_it;
    arg_it->setName("total_matched");
    ++arg_it;
    arg_it->setName("old_total_matched");
    ++arg_it;
    arg_it->setName("max_matched");
    ++arg_it;
  }

  arg_it->setName("agg_init_val");
  ++arg_it;

  arg_it->setName("pos");
  ++arg_it;

  arg_it->setName("frag_row_off");
  ++arg_it;

  arg_it->setName("num_rows_per_scan");
  ++arg_it;

  if (hoist_literals) {
    arg_it->setName("literals");
    ++arg_it;
  }

  for (size_t i = 0; i < in_col_count; ++i) {
    arg_it->setName("col_buf" + std::to_string(i));
    ++arg_it;
  }

  arg_it->setName("join_hash_tables");
  ++arg_it;
  arg_it->setName("row_func_mgr");
}
llvm::Function* create_row_function(const size_t in_col_count,
                                    const size_t agg_col_count,
                                    const bool hoist_literals,
                                    llvm::Module* llvm_module,
                                    llvm::LLVMContext& context) {
  std::vector<llvm::Type*> row_process_arg_types;

  if (agg_col_count) {
    // output (aggregate) arguments
    for (size_t i = 0; i < agg_col_count; ++i) {
      row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    }
  } else {
    // group by buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // varlen output buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // current match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // total match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // old total match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // max match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
  }
  // aggregate init values
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // position argument
  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
  // fragment row offset
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // number of rows for each scan
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // literals buffer
  if (hoist_literals) {
    row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
  }
  // column buffer arguments
  for (size_t i = 0; i < in_col_count; ++i) {
    row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
  }
  // join hash tables
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // row function manager
  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));

  // generate the function
  auto ft =
      llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
  auto row_func = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);

  // set the row function argument names
  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);

  return row_func;
}
void bind_query(llvm::Function* query_func,
                const std::string& query_fname,
                llvm::Function* multifrag_query_func,
                llvm::Module* llvm_module) {
  std::vector<llvm::CallInst*> query_stubs;
  for (auto it = llvm::inst_begin(multifrag_query_func),
            e = llvm::inst_end(multifrag_query_func);
       it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& query_call = llvm::cast<llvm::CallInst>(*it);
    auto const call_func_name = CodegenUtil::getCalledFunctionName(query_call);
    if (call_func_name && *call_func_name == query_fname) {
      query_stubs.push_back(&query_call);
    }
  }
  for (auto& S : query_stubs) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
      args.push_back(S->getArgOperand(i));
    }
    llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
  }
}
std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
                                        const bool is_group_by) {
  std::vector<std::string> result;
  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
       ++target_idx, ++agg_col_idx) {
    const auto target_expr = target_exprs[target_idx];
    CHECK(target_expr);
    const auto target_type_info = target_expr->get_type_info();
    const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
    const bool is_varlen =
        (target_type_info.is_string() &&
         target_type_info.get_compression() == kENCODING_NONE) ||
        target_type_info.is_array();
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
      if (is_varlen) {
        result.emplace_back("agg_id");
      }
      if (target_type_info.is_geometry()) {
        result.emplace_back("agg_id");
        for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
          result.emplace_back("agg_id");
        }
      }
      continue;
    }
    const auto agg_type = agg_expr->get_aggtype();
    SQLTypeInfo agg_type_info;
    switch (agg_type) {
      case kCOUNT:
      case kCOUNT_IF:
        agg_type_info = target_type_info;
        break;
      default:
        agg_type_info = agg_expr->get_arg()->get_type_info();
        break;
    }
    switch (agg_type) {
      case kAVG: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("AVG is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_count"
                                : "agg_count_double");
        break;
      }
      case kMIN: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MIN on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_min"
                                : "agg_min_double");
        break;
      }
      case kMAX: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MAX on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_max"
                                : "agg_max_double");
        break;
      }
      case kSUM:
      case kSUM_IF: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error(
              "SUM and SUM_IF is only valid on integer and floating point");
        }
        std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
                                    ? "agg_sum"
                                    : "agg_sum_double";
        if (agg_type == kSUM_IF) {
          func_name += "_if";
        }
        result.emplace_back(func_name);
        break;
      }
      case kCOUNT:
        result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
                                                        : "agg_count");
        break;
      case kCOUNT_IF:
        result.emplace_back("agg_count_if");
        break;
      case kSINGLE_VALUE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kSAMPLE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kAPPROX_COUNT_DISTINCT:
        result.emplace_back("agg_approximate_count_distinct");
        break;
      case kAPPROX_QUANTILE:
        result.emplace_back("agg_approx_quantile");
        break;
      case kMODE:
        result.emplace_back("agg_mode_func");
        break;
      default:
        UNREACHABLE() << "Unsupported agg_type: " << agg_type;
    }
  }
  return result;
}
    const bool is_cuda_ir) {
  // ...
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots,
    const std::vector<llvm::Function*>& leaves) {
  std::unordered_set<llvm::Function*> live_funcs;
  live_funcs.insert(roots.begin(), roots.end());
  live_funcs.insert(leaves.begin(), leaves.end());

  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
    live_funcs.insert(F);
  }
  if (auto F = llvm_module.getFunction("write_back_nop")) {
    live_funcs.insert(F);
  }

  for (const llvm::Function* F : roots) {
    for (const llvm::BasicBlock& BB : *F) {
      for (const llvm::Instruction& I : BB) {
        if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
          live_funcs.insert(CI->getCalledFunction());
        }
      }
    }
  }

  for (llvm::Function& F : llvm_module) {
    if (!live_funcs.count(&F) && !F.isDeclaration()) {
      F.setLinkage(llvm::GlobalValue::InternalLinkage);
    }
  }

  return live_funcs;
}
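// Demoting everything outside the live set to internal linkage lets the later
// global optimizer and dead-code passes discard unused runtime functions
// instead of preserving them as externally visible symbols.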
template <typename InstType>
llvm::Value* find_variable_in_basic_block(llvm::Function* func,
                                          std::string bb_name,
                                          std::string variable_name) {
  llvm::Value* result = nullptr;
  if (func == nullptr || variable_name.empty()) {
    return result;
  }
  bool is_found = false;
  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
    if (!bb_name.empty() && bb_it->getName() != bb_name) {
      continue;
    }
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
      if (llvm::isa<InstType>(*inst_it)) {
        if (inst_it->getName() == variable_name) {
          result = &*inst_it;
          is_found = true;
          break;
        }
      }
    }
  }
  return result;
}
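// The function below rewrites the query's control flow for error handling: it
// splits each basic block around the "row_process" call, compares the
// returned error code, and branches failures to a ".error_exit" block that
// records the error before returning. For GPU execution it additionally
// injects dynamic-watchdog or runtime-interrupt polling ahead of that check.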
    llvm::Function* query_func,
    bool run_with_dynamic_watchdog,
    bool run_with_allowing_runtime_interrupt,
    const std::vector<JoinLoop>& join_loops,
    ExecutorDeviceType device_type,
    const std::vector<InputTableInfo>& input_table_infos) {
  // ...

  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
    // when both the dynamic watchdog and the runtime interrupt are enabled,
    // use the dynamic watchdog
    run_with_allowing_runtime_interrupt = false;
  }

  {
    // disable injecting the query interrupt checker if the session is invalid
    heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
        executor_session_mutex_);
    if (current_query_session_.empty()) {
      run_with_allowing_runtime_interrupt = false;
    }
  }

  llvm::Value* row_count = nullptr;
  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
      device_type == ExecutorDeviceType::GPU) {
    row_count =
        find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
  }

  bool done_splitting = false;
  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
       ++bb_it) {
    llvm::Value* pos = nullptr;
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
          llvm::isa<llvm::PHINode>(*inst_it)) {
        if (inst_it->getName() == "pos") {
          pos = &*inst_it;
        }
        continue;
      }
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
      if (row_func_name && *row_func_name == "row_process") {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        llvm::Value* err_lv_returned_from_row_func = nullptr;
        if (run_with_dynamic_watchdog) {
          CHECK(pos);
          llvm::Value* call_watchdog_lv = nullptr;
          if (device_type == ExecutorDeviceType::GPU) {
            // In order to make sure all threads within a block see the same barrier,
            // only those blocks whose none of their threads have experienced the
            // critical edge will go through the dynamic watchdog computation
            CHECK(row_count);
            auto crit_edge_rem =
                (blockSize() & (blockSize() - 1))
                    ? ir_builder.CreateSRem(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize())))
                    : ir_builder.CreateAnd(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
            auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
            crit_edge_threshold->setName("crit_edge_threshold");

            call_watchdog_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
          } else {
            // CPU path: run the watchdog for every 64th row
            auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_watchdog_lv = ir_builder.CreateICmp(
                llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
          }
          CHECK(call_watchdog_lv);
          auto error_check_bb = bb_it->splitBasicBlock(
              llvm::BasicBlock::iterator(br_instr), ".error_check");
          auto& watchdog_br_instr = bb_it->back();

          auto watchdog_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
          llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
          auto detected_timeout = watchdog_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("dynamic_watchdog"), {});
          auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
              detected_timeout,
              cgen_state_->llInt(int32_t(ErrorCode::OUT_OF_TIME)),
              err_lv);
          watchdog_ir_builder.CreateBr(error_check_bb);

          llvm::ReplaceInstWithInst(
              &watchdog_br_instr,
              llvm::BranchInst::Create(
                  watchdog_check_bb, error_check_bb, call_watchdog_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        } else if (run_with_allowing_runtime_interrupt) {
          CHECK(pos);
          llvm::Value* call_check_interrupt_lv{nullptr};
          llvm::Value* interrupt_err_lv{nullptr};
          llvm::BasicBlock* error_check_bb{nullptr};
          llvm::BasicBlock* interrupt_check_bb{nullptr};
          llvm::Instruction* check_interrupt_br_instr{nullptr};

          auto const has_loop_join = std::any_of(
              join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
                return join_loop.isNestedLoopJoin();
              });
          auto codegen_interrupt_checker = [&]() {
            error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
                                                    ".error_check");
            check_interrupt_br_instr = &bb_it->back();

            interrupt_check_bb = llvm::BasicBlock::Create(
                cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
            llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
            auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
                cgen_state_->module_->getFunction("check_interrupt"), {});
            interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
                detected_interrupt,
                cgen_state_->llInt(int32_t(ErrorCode::INTERRUPTED)),
                err_lv);
            interrupt_checker_ir_builder.CreateBr(error_check_bb);
          };
          if (has_loop_join) {
            codegen_interrupt_checker();
            CHECK(interrupt_check_bb);
            CHECK(check_interrupt_br_instr);
            llvm::ReplaceInstWithInst(check_interrupt_br_instr,
                                      llvm::BranchInst::Create(interrupt_check_bb));
            ir_builder.SetInsertPoint(&br_instr);
            err_lv = interrupt_err_lv;
          } else {
            if (device_type == ExecutorDeviceType::GPU) {
              // each thread advances pos by gridSize() * blockSize() * 2 per
              // iteration, so shift pos down to count iterations per thread
              auto num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
              auto num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
              int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
              uint64_t interrupt_checking_freq = 32;
              auto freq_control_knob = g_running_query_interrupt_freq;
              if (freq_control_knob > 0 && freq_control_knob <= 1.0) {
                if (!input_table_infos.empty()) {
                  const auto& outer_table_info = *input_table_infos.begin();
                  auto num_outer_table_tuples =
                      outer_table_info.info.getFragmentNumTuplesUpperBound();
                  if (num_outer_table_tuples > 0) {
                    // # tuples (of fragment) / pos_step --> maximum # increments (K);
                    // scale K by the control knob to derive the checking frequency
                    auto max_inc = uint64_t(
                        floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
                    auto calibrated_inc =
                        uint64_t(floor(max_inc * (1 - freq_control_knob)));
                    interrupt_checking_freq =
                        uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
                    // if the freq exceeds K, some threads can never branch to the
                    // interrupt checker, so clamp it below the max increment
                    if (interrupt_checking_freq > max_inc) {
                      interrupt_checking_freq = max_inc / 2;
                    }
                    if (interrupt_checking_freq < 8) {
                      // too small a freq incurs overly frequent interrupt checking,
                      // so fix the minimum freq value at 8
                      interrupt_checking_freq = 8;
                    }
                  }
                }
              }
              VLOG(1) << "Set the running query interrupt checking frequency: "
                      << interrupt_checking_freq;
              llvm::Value* pos_shifted_per_iteration =
                  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
              auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
                                                              interrupt_checking_freq);
              call_check_interrupt_lv =
                  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                        interrupt_predicate,
                                        cgen_state_->llInt(int64_t(0LL)));
            } else {
              // CPU path: run the interrupt checker for every 64th row
              auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
              call_check_interrupt_lv =
                  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                        interrupt_predicate,
                                        cgen_state_->llInt(int64_t(0LL)));
            }
            codegen_interrupt_checker();
            CHECK(call_check_interrupt_lv);
            CHECK(interrupt_err_lv);
            CHECK(interrupt_check_bb);
            CHECK(error_check_bb);
            CHECK(check_interrupt_br_instr);
            llvm::ReplaceInstWithInst(
                check_interrupt_br_instr,
                llvm::BranchInst::Create(
                    interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
            ir_builder.SetInsertPoint(&br_instr);
            auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

            unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
            unified_err_lv->addIncoming(err_lv, &*bb_it);
            err_lv = unified_err_lv;
          }
        }
        if (!err_lv_returned_from_row_func) {
          err_lv_returned_from_row_func = err_lv;
        }
        if (device_type == ExecutorDeviceType::GPU && run_with_dynamic_watchdog) {
          // let the kernel finish as expected regardless of the observed error,
          // unless it is a watchdog timeout, where all threads within the block
          // return together
          err_lv =
              ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                    err_lv,
                                    cgen_state_->llInt(int32_t(ErrorCode::OUT_OF_TIME)));
        } else {
          err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
                                         err_lv,
                                         cgen_state_->llInt(static_cast<int32_t>(0)));
        }
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        const auto error_code_arg = get_arg_by_name(query_func, "error_code");
        llvm::CallInst::Create(
            cgen_state_->module_->getFunction("record_error_code"),
            std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
            "",
            error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        done_splitting = true;
        break;
      }
    }
  }
  CHECK(done_splitting);
}
}

// inserts a call to `register_buffer_with_executor_rsm` right after each call to
// `allocate_varlen_buffer`, when the module requests buffer tracking via metadata
void Executor::AutoTrackBuffersInRuntimeIR() {
  llvm::Module* M = cgen_state_->module_;
  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
    return;
  }

  // read the module-level metadata flag
  bool should_track = false;
  auto* flag = M->getModuleFlag("manage_memory_buffer");
  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
    if (cnt->getZExtValue() == 1) {
      should_track = true;
    }
  }
  if (!should_track) {
    return;  // metadata not present
  }

  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;

  for (llvm::Function& F : *M) {
    for (llvm::BasicBlock& BB : F) {
      for (llvm::Instruction& I : BB) {
        if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
          // collect the calls to the varlen allocator
          auto const called_func_name = getCalledFunctionName(*CI);
          if (called_func_name && *called_func_name == "allocate_varlen_buffer") {
            calls_to_analyze.push_back(CI);
          }
        }
      }
    }
  }

  // for each allocation that is not already registered, add a registration call
  // immediately after it
  llvm::IRBuilder<> Builder(cgen_state_->context_);
  auto i64 = get_int_type(64, cgen_state_->context_);
  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
  llvm::FunctionCallee register_buffer_fn =
      M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});

  int64_t executor_addr = reinterpret_cast<int64_t>(this);
  for (llvm::CallInst* CI : calls_to_analyze) {
    bool already_registered = false;
    for (llvm::User* U : CI->users()) {
      if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
        auto const func_name = getCalledFunctionName(*call);
        if (func_name && *func_name == "register_buffer_with_executor_rsm") {
          already_registered = true;
          break;
        }
      }
    }
    if (!already_registered) {
      Builder.SetInsertPoint(CI->getNextNode());
      Builder.CreateCall(register_buffer_fn,
                         {ll_int(executor_addr, cgen_state_->context_), CI});
    }
  }
}
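// A sketch of AutoTrackBuffersInRuntimeIR's effect on the IR (assuming the usual
// runtime signature of the allocator):
//   %buf = call i8* @allocate_varlen_buffer(i64 %elem_count, i64 %elem_size)
//   call void @register_buffer_with_executor_rsm(i64 <executor address>, i8* %buf)
// The users() scan above keeps the pass idempotent, so re-running it over an
// already instrumented module does not stack duplicate registration calls.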

std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
  std::vector<llvm::Value*> hoisted_literals;

  // row_func_ uses literals whose loads were hoisted up to query_func_; extend
  // the row function's signature so those values can be passed in as arguments
  std::vector<llvm::Type*> row_process_arg_types;

  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_process_arg_types.push_back(I->getType());
  }

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    for (auto value : element.second) {
      row_process_arg_types.push_back(value->getType());
    }
  }

  // declare the new row function with the extended signature
  auto ft = llvm::FunctionType::get(
      get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
  auto row_func_with_hoisted_literals =
      llvm::Function::Create(ft,
                             llvm::Function::ExternalLinkage,
                             "row_func_hoisted_literals",
                             cgen_state_->row_func_->getParent());

  // carry the argument names over from the old row function
  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_func_arg_it->setName(I->getName());
    ++row_func_arg_it;
  }

  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
  if (cgen_state_->filter_func_) {
    // the filter function needs the same signature extension
    std::vector<llvm::Type*> filter_func_arg_types;

    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_types.push_back(I->getType());
    }

    for (auto& element : cgen_state_->query_func_literal_loads_) {
      for (auto value : element.second) {
        filter_func_arg_types.push_back(value->getType());
      }
    }

    auto ft2 = llvm::FunctionType::get(
        get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
    filter_func_with_hoisted_literals =
        llvm::Function::Create(ft2,
                               llvm::Function::ExternalLinkage,
                               "filter_func_hoisted_literals",
                               cgen_state_->filter_func_->getParent());

    filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_it->setName(I->getName());
      ++filter_func_arg_it;
    }
  }

  // map each literal load to the function argument that will replace it
  std::unordered_map<int, std::vector<llvm::Value*>>
      query_func_literal_loads_function_arguments,
      query_func_literal_loads_function_arguments2;

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    std::vector<llvm::Value*> argument_values, argument_values2;

    for (auto value : element.second) {
      hoisted_literals.push_back(value);
      argument_values.push_back(&*row_func_arg_it);
      if (cgen_state_->filter_func_) {
        argument_values2.push_back(&*filter_func_arg_it);
        cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
      }
      if (value->hasName()) {
        row_func_arg_it->setName("arg_" + value->getName());
        if (cgen_state_->filter_func_) {
          filter_func_arg_it->setName("arg_" + value->getName());
        }
      }
      ++row_func_arg_it;
      if (cgen_state_->filter_func_) {
        ++filter_func_arg_it;
      }
    }

    query_func_literal_loads_function_arguments[element.first] = argument_values;
    query_func_literal_loads_function_arguments2[element.first] = argument_values2;
  }

  // move the row function body over to the new function
  row_func_with_hoisted_literals->getBasicBlockList().splice(
      row_func_with_hoisted_literals->begin(),
      cgen_state_->row_func_->getBasicBlockList());

  // also replace the old argument uses with the new ones
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end(),
                                    I2 = row_func_with_hoisted_literals->arg_begin();
       I != E;
       ++I, ++I2) {
    I->replaceAllUsesWith(&*I2);
    cgen_state_->filter_func_args_.replace(&*I, &*I2);
  }
  cgen_state_->row_func_ = row_func_with_hoisted_literals;

  // and finally replace the literal placeholders
  std::vector<llvm::Instruction*> placeholders;
  std::string prefix("__placeholder__literal_");
  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
            e = llvm::inst_end(row_func_with_hoisted_literals);
       it != e;
       ++it) {
    if (it->hasName() && it->getName().startswith(prefix)) {
      auto offset_and_index_entry =
          cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
      CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

      int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
      int lit_idx = offset_and_index_entry->second.index_of_literal_load;

      it->replaceAllUsesWith(
          query_func_literal_loads_function_arguments[lit_off][lit_idx]);
      placeholders.push_back(&*it);
    }
  }
  for (auto placeholder : placeholders) {
    placeholder->removeFromParent();
  }

  if (cgen_state_->filter_func_) {
    // move the filter function body over to the new function
    filter_func_with_hoisted_literals->getBasicBlockList().splice(
        filter_func_with_hoisted_literals->begin(),
        cgen_state_->filter_func_->getBasicBlockList());

    // also replace the old argument uses with the new ones
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end(),
                                      I2 = filter_func_with_hoisted_literals->arg_begin();
         I != E;
         ++I, ++I2) {
      I->replaceAllUsesWith(&*I2);
    }
    cgen_state_->filter_func_ = filter_func_with_hoisted_literals;

    // and finally replace the literal placeholders
    std::vector<llvm::Instruction*> placeholders;
    std::string prefix("__placeholder__literal_");
    for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
              e = llvm::inst_end(filter_func_with_hoisted_literals);
         it != e;
         ++it) {
      if (it->hasName() && it->getName().startswith(prefix)) {
        auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
            llvm::dyn_cast<llvm::Value>(&*it));
        CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

        int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
        int lit_idx = offset_and_index_entry->second.index_of_literal_load;

        it->replaceAllUsesWith(
            query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
        placeholders.push_back(&*it);
      }
    }
    for (auto placeholder : placeholders) {
      placeholder->removeFromParent();
    }
  }

  return hoisted_literals;
}
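// Before inlineHoistedLiterals() runs, the row function reaches hoisted literals
// through named placeholder instructions ("__placeholder__literal_*"); afterwards
// each literal arrives as a trailing function argument instead. This is what lets
// one compiled kernel be reused across queries that differ only in literal values.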

size_t get_shared_memory_size(const bool shared_mem_used,
                              const QueryMemoryDescriptor* query_mem_desc_ptr) {
  return shared_mem_used
             ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
             : 0;
}

bool has_count_expr(RelAlgExecutionUnit const& ra_exe_unit) {
  for (auto const expr : ra_exe_unit.target_exprs) {
    if (auto const agg_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
      if (shared::is_any<SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF>(agg_expr->get_aggtype())) {
        return true;
      }
    }
  }
  return false;
}

class CaseExprDetector final : public ScalarExprVisitor<bool> {
 public:
  CaseExprDetector() : detect_case_expr_(false) {}

  bool detectCaseExpr(const Analyzer::Expr* expr) const {
    visit(expr);
    return detect_case_expr_;
  }

 protected:
  bool visitCaseExpr(const Analyzer::CaseExpr* /*case_expr*/) const override {
    detect_case_expr_ = true;
    return true;
  }

 private:
  mutable bool detect_case_expr_;
};

bool has_case_expr_within_groupby_expr(RelAlgExecutionUnit const& ra_exe_unit) {
  if (ra_exe_unit.groupby_exprs.empty() || !ra_exe_unit.groupby_exprs.front()) {
    return false;
  }
  CaseExprDetector detector;
  for (auto expr : ra_exe_unit.groupby_exprs) {
    if (detector.detectCaseExpr(expr.get())) {
      return true;
    }
  }
  return false;
}
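// CaseExprDetector is a ScalarExprVisitor that latches a flag the first time it
// visits a CASE node, so callers only learn whether a CASE exists, not where.
// Minimal usage, as in has_case_expr_within_groupby_expr() above:
//   CaseExprDetector detector;
//   if (detector.detectCaseExpr(expr)) { /* group-by key contains a CASE */ }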

bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
                                 const RelAlgExecutionUnit& ra_exe_unit,
                                 const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                                 const ExecutorDeviceType device_type,
                                 const unsigned cuda_blocksize,
                                 const unsigned num_blocks_per_mp) {
  if (device_type == ExecutorDeviceType::CPU) {
    return false;
  }
  CHECK(query_mem_desc_ptr);
  // ... (further preconditions elided: configuration flags, CUDA availability)

  if (query_mem_desc_ptr->getQueryDescriptionType() ==
      QueryDescriptionType::NonGroupedAggregate) {
    // non-grouped aggregates, gated on g_enable_smem_non_grouped_agg:
    // every entry must be owned by a thread of a single block
    if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{SQLAgg::kCOUNT};
    // ... (the exact supported set is configuration-dependent)
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }

  if (query_mem_desc_ptr->getQueryDescriptionType() ==
      QueryDescriptionType::GroupByPerfectHash) {
    // perfect-hash group-by, gated on g_enable_smem_group_by plus layout
    // checks (keyless hash, etc.); elided here
    if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // the output buffer must fit the smallest per-block shared memory budget
    // across all devices, capped by g_gpu_smem_threshold
    const size_t shared_memory_threshold_bytes = std::min(
        g_gpu_smem_threshold, cuda_mgr->getMinSharedMemoryPerBlockForAllDevices());
    const auto output_buffer_size =
        query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
    if (output_buffer_size > shared_memory_threshold_bytes) {
      return false;
    }
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{SQLAgg::kCOUNT};
    if (g_enable_smem_grouped_non_count_agg) {
      // ... (widened to the remaining aggregate kinds)
    }
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }
  return false;
}
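// is_gpu_shared_mem_supported() in short: the shared-memory fast path is only
// taken when every output slot fits the launch (entry count <= CUDA block size),
// the whole output buffer fits the smallest per-block shared memory budget across
// devices, and no target is a varlen or otherwise unsupported aggregate.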

// collect the metadata nodes referenced by the generated functions so they can
// be appended as "footnotes" to the serialized IR
std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
                                              CgenState* cgen_state) {
  std::string llvm_ir;
  std::unordered_set<llvm::MDNode*> md;

  // metadata attached to the query function's instructions
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // metadata attached to the row function's instructions
  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
       ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // metadata attached to the filter function's instructions, if present
  if (cgen_state->filter_func_) {
    for (auto bb_it = cgen_state->filter_func_->begin();
         bb_it != cgen_state->filter_func_->end();
         ++bb_it) {
      for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
        instr_it->getAllMetadata(imd);
        for (auto [kind, node] : imd) {
          md.insert(node);
        }
      }
    }
  }

  // sort the metadata by its assigned number ("!N") and serialize it
  std::map<size_t, std::string> sorted_strings;
  for (auto p : md) {
    std::string str;
    llvm::raw_string_ostream os(str);
    p->print(os, cgen_state->module_, true);
    os.flush();
    auto fields = split(str, {}, 1);
    if (fields.empty() || fields[0].empty()) {
      continue;
    }
    sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
  }
  for (auto [id, text] : sorted_strings) {
    llvm_ir += text;
    llvm_ir += "\n";
  }

  return llvm_ir;
}
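// Rationale for serialize_llvm_metadata_footnotes(): when functions are serialized
// one at a time (as the EXPLAIN paths in compileWorkUnit() do), the numbered
// metadata nodes they reference would otherwise be lost; this helper re-emits each
// distinct "!N = ..." definition exactly once, in numeric order, keeping the
// concatenated dump self-contained.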

std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
                          const PlanState::DeletedColumnsMap& deleted_cols_map,
                          const RelAlgExecutionUnit& ra_exe_unit,
                          const CompilationOptions& co,
                          const ExecutionOptions& eo,
                          const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                          const bool allow_lazy_fetch,
                          std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                          const size_t max_groups_buffer_entry_guess,
                          const int8_t crt_min_byte_width,
                          const bool has_cardinality_estimation,
                          ColumnCacheMap& column_cache,
                          RenderInfo* render_info) {
  auto timer = DEBUG_TIMER(__func__);

  static std::uint64_t counter = 0;
  ++counter;
  VLOG(1) << "CODEGEN #" << counter << ":";
  LOG(IR) << "CODEGEN #" << counter << ":";
  LOG(PTX) << "CODEGEN #" << counter << ":";
  LOG(ASM) << "CODEGEN #" << counter << ":";
  // ... (RAII CgenStateManager setup elided)
  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);

  GroupByAndAggregate group_by_and_aggregate(
      this,
      co.device_type,
      ra_exe_unit,
      query_infos,
      row_set_mem_owner,
      has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
                                 : std::nullopt);
  auto query_mem_desc =
      group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
                                                       max_groups_buffer_entry_guess,
                                                       crt_min_byte_width,
                                                       render_info,
                                                       eo.output_columnar_hint);

  // a baseline-hash group-by without a cardinality estimate cannot size its
  // output buffer reliably; ask the caller to run an estimator first
  if (!has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
      query_mem_desc->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByBaselineHash) {
    const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
    // ... (throws CardinalityEstimationRequired)
  }

  const bool output_columnar = query_mem_desc->didOutputColumnar();
  const bool gpu_shared_mem_optimization =
      is_gpu_shared_mem_supported(query_mem_desc.get(),
                                  ra_exe_unit,
                                  cuda_mgr,
                                  co.device_type,
                                  cuda_mgr ? this->blockSize() : 1,
                                  cuda_mgr ? this->numBlocksPerMP() : 1);
  if (gpu_shared_mem_optimization) {
    // ...
    LOG(DEBUG1) << "GPU shared memory is used for the " /* ... query type elided */;
  }
  const GpuSharedMemoryContext gpu_smem_context(
      get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
  // ... (GPU-only restrictions follow)
  const size_t num_count_distinct_descs =
      query_mem_desc->getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto& count_distinct_descriptor =
        query_mem_desc->getCountDistinctDescriptor(i);
    // ... (a hash-set based COUNT DISTINCT forces CPU execution)
  }

  // SAMPLE on a varlen column of a multi-fragment table is not supported on
  // more than one GPU
  for (const auto target_expr : ra_exe_unit.target_exprs) {
    if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
      bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
      if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus /* && ... */) {
        std::set<const Analyzer::ColumnVar*,
                 bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
            colvar_set(Analyzer::ColumnVar::colvar_comp);
        gby_expr->collect_column_var(colvar_set, true);
        for (const auto cv : colvar_set) {
          if (cv->get_type_info().is_varlen()) {
            const auto tbl_key = cv->getTableKey();
            std::for_each(query_infos.begin(),
                          query_infos.end(),
                          [&tbl_key](const InputTableInfo& input_table_info) {
                            if (input_table_info.table_key == tbl_key &&
                                input_table_info.info.fragments.size() > 1) {
                              throw QueryMustRunOnCpu();
                            }
                          });
          }
        }
      }
    }
  }
  // make a shallow copy of the runtime module the codegen target
  CHECK(cgen_state_->module_ == nullptr);
  cgen_state_->set_module_shallow_copy(get_rt_module(), true);

  const bool is_gpu = co.device_type == ExecutorDeviceType::GPU;
  if (has_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }
  if (has_rt_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_rt_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }

  // ...
  const bool is_group_by = query_mem_desc->isGroupBy();
  auto agg_fnames =
      get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();

  auto [query_func, row_func_call] =
      is_group_by ? query_group_by_template(cgen_state_->module_,
                                            co.hoist_literals,
                                            *query_mem_desc,
                                            co.device_type,
                                            ra_exe_unit.scan_limit,
                                            gpu_smem_context)
                  : query_template(cgen_state_->module_,
                                   agg_slot_count,
                                   co.hoist_literals,
                                   !!ra_exe_unit.estimator,
                                   gpu_smem_context);
  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);

  cgen_state_->query_func_ = query_func;
  cgen_state_->row_func_call_ = row_func_call;
  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
      &query_func->getEntryBlock().front());

  // load the column payload pointers at the top of the query function
  auto& fetch_bb = query_func->front();
  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
                                              query_func->args().begin(),
                                              fetch_ir_builder,
                                              cgen_state_->context_);

  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
                                               is_group_by ? 0 : agg_slot_count,
                                               co.hoist_literals,
                                               cgen_state_->module_,
                                               cgen_state_->context_);
  CHECK(cgen_state_->row_func_);
  cgen_state_->row_func_bb_ =
      llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);

  if (g_enable_filter_function) {
    auto filter_func_ft =
        llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
    cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
                                                       llvm::Function::ExternalLinkage,
                                                       "filter_func",
                                                       cgen_state_->module_);
    CHECK(cgen_state_->filter_func_);
    cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
        cgen_state_->context_, "entry", cgen_state_->filter_func_);
  }

  cgen_state_->current_func_ = cgen_state_->row_func_;
  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);

  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
  const auto join_loops =
      buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);

  for (auto& simple_qual : ra_exe_unit.simple_quals) {
    plan_state_->addSimpleQual(simple_qual);
  }
  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
  if (is_not_deleted_bb) {
    cgen_state_->row_func_bb_ = is_not_deleted_bb;
  }
  if (!join_loops.empty()) {
    codegenJoinLoops(join_loops,
                     body_execution_unit,
                     group_by_and_aggregate,
                     query_func,
                     cgen_state_->row_func_bb_,
                     *query_mem_desc,
                     co,
                     eo);
  } else {
    const bool can_return_error = compileBody(
        ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
    if (can_return_error || cgen_state_->needs_error_check_ ||
        eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
      createErrorCheckControlFlow(query_func,
                                  eo.with_dynamic_watchdog,
                                  eo.allow_runtime_query_interrupt,
                                  join_loops,
                                  co.device_type,
                                  group_by_and_aggregate.query_infos_);
    }
  }
  // hoist literals to be passed into the row function as extra arguments
  std::vector<llvm::Value*> hoisted_literals;

  if (co.hoist_literals) {
    VLOG(1) << "number of hoisted literals: "
            << cgen_state_->query_func_literal_loads_.size()
            << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
            << " bytes";
  }
  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
    hoisted_literals = inlineHoistedLiterals();
  }

  // replace the row function placeholder call with a call to the real row function
  std::vector<llvm::Value*> row_func_args;
  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
    row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
  }
  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
  // push the hoisted literals as the trailing arguments, if any
  row_func_args.insert(
      row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
  llvm::ReplaceInstWithInst(
      cgen_state_->row_func_call_,
      llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));

  // likewise for the filter function placeholder call
  if (cgen_state_->filter_func_) {
    std::vector<llvm::Value*> filter_func_args;
    for (auto arg_it = cgen_state_->filter_func_args_.begin();
         arg_it != cgen_state_->filter_func_args_.end();
         ++arg_it) {
      filter_func_args.push_back(arg_it->first);
    }
    llvm::ReplaceInstWithInst(
        cgen_state_->filter_func_call_,
        llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
  }

  // ...
  plan_state_->init_agg_vals_ =
      init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);

  // if GPU shared memory is used, generate the reduction and initialization
  // functions for the aggregates and inject them into the query function
  if (gpu_smem_context.isSharedMemoryUsed()) {
    GpuSharedMemCodeBuilder gpu_smem_code(
        cgen_state_->module_,
        cgen_state_->context_,
        *query_mem_desc,
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
        plan_state_->init_agg_vals_,
        executor_id_);
    gpu_smem_code.codegen();
    gpu_smem_code.injectFunctionsInto(query_func);

    // helper functions are cached alongside the generated code
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
    LOG(IR) << gpu_smem_code.toString();
  }
  auto multifrag_query_func = cgen_state_->module_->getFunction(
      "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
  CHECK(multifrag_query_func);

  // ...
  insertErrorCodeChecker(multifrag_query_func,
                         get_index_by_name(query_func, "error_code"),
                         co.hoist_literals,
                         eo.allow_runtime_query_interrupt);

  bind_query(query_func,
             "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
             multifrag_query_func,
             cgen_state_->module_);

  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
  if (cgen_state_->filter_func_) {
    root_funcs.push_back(cgen_state_->filter_func_);
  }
  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
      *cgen_state_->module_, root_funcs, {multifrag_query_func});

  // always inline the row function and the filter function; a call in the inner
  // loop costs more than the larger code size
  mark_function_always_inline(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    mark_function_always_inline(cgen_state_->filter_func_);
  }

  auto const device_str = co.device_type == ExecutorDeviceType::CPU ? "CPU:" : "GPU:";
  std::string llvm_ir =
      serialize_llvm_object(query_func) + serialize_llvm_object(cgen_state_->row_func_) +
      (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
                                 : "");
  VLOG(3) << "Unoptimized IR for the " << device_str << "\n" << llvm_ir << "\nEnd of IR";

  if (co.explain_type == ExecutorExplainType::Optimized) {
#ifdef WITH_JIT_DEBUG
    throw std::runtime_error(
        "Explain optimized not available when JIT runtime debug symbols are enabled");
#else
    llvm::legacy::PassManager pass_manager;
    optimize_ir(query_func,
                cgen_state_->module_,
                pass_manager,
                live_funcs,
                gpu_smem_context.isSharedMemoryUsed(),
                co);
#endif  // WITH_JIT_DEBUG
    // ... (re-serialize the optimized IR for the EXPLAIN output)
  }

  LOG(IR) << "IR for the " << device_str;
  // ... (LOG(IR) dumps of the generated functions, including the metadata
  //      footnotes collected by serialize_llvm_metadata_footnotes())

  // insert registration calls for runtime-managed buffers, if requested
  AutoTrackBuffersInRuntimeIR();

  // run the verifier before generating native code
  verify_function_ir(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    verify_function_ir(cgen_state_->filter_func_);
  }

  return std::make_tuple(
      CompilationResult{
          co.device_type == ExecutorDeviceType::CPU
              ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
              : optimizeAndCodegenGPU(query_func,
                                      multifrag_query_func,
                                      live_funcs,
                                      is_group_by || ra_exe_unit.estimator,
                                      cuda_mgr,
                                      gpu_smem_context.isSharedMemoryUsed(),
                                      co),
          cgen_state_->getLiterals(),
          output_columnar,
          llvm_ir,
          std::move(gpu_smem_context)},
      std::move(query_mem_desc));
}
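// compileWorkUnit() is the single entry point that turns a RelAlgExecutionUnit
// into executable code: it sizes the output buffer (QueryMemoryDescriptor),
// stitches the query template, row function and optional filter function
// together, splices in watchdog/interrupt/error checks, and finally hands the
// module to optimizeAndCodegenCPU/GPU. The returned CompilationResult carries
// the native code plus everything the executor needs to launch it.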

void Executor::insertErrorCodeChecker(llvm::Function* query_func,
                                      unsigned const error_code_idx,
                                      bool hoist_literals,
                                      bool allow_runtime_query_interrupt) {
  auto query_stub_func_name =
      "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      auto const row_func_name = getCalledFunctionName(row_func_call);
      if (row_func_name && *row_func_name == query_stub_func_name) {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        auto error_check_bb =
            bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
        // query_func's arguments are unnamed at this point, so look up by index
        llvm::Value* const error_code_arg =
            get_arg_by_index(query_func, error_code_idx);
        CHECK(error_code_arg) << error_code_idx << '/' << query_func->arg_size();
        llvm::Value* err_code = nullptr;
        if (allow_runtime_query_interrupt) {
          // decide the final error code with the interrupt status taken into account
          auto& check_interrupt_br_instr = bb_it->back();
          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto detected_error = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("get_error_code"),
              std::vector<llvm::Value*>{error_code_arg});
          err_code = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt,
              cgen_state_->llInt(int32_t(ErrorCode::INTERRUPTED)),
              detected_error);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);
          llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
                                    llvm::BranchInst::Create(interrupt_check_bb));
          ir_builder.SetInsertPoint(&br_instr);
        } else {
          // use the error code stored in query_func's frame
          ir_builder.SetInsertPoint(&br_instr);
          err_code =
              ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
                                    std::vector<llvm::Value*>{error_code_arg});
        }
        err_lv = ir_builder.CreateICmp(
            llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
                               std::vector<llvm::Value*>{err_code, error_code_arg},
                               "",
                               error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        break;
      }
    }
  }
}
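// The control flow insertErrorCodeChecker() builds around the query stub call
// looks roughly like this (a sketch, not verbatim IR):
//   %err = call i32 @query_stub...(...)
//   br label %.interrupt_check          ; only with runtime interrupts enabled
// .interrupt_check:
//   %flag = call i1 @check_interrupt()
//   %code = select i1 %flag, i32 INTERRUPTED, i32 %from_get_error_code
//   br label %.error_check
// .error_check:
//   %nonzero = icmp ne i32 %code, 0
//   br i1 %nonzero, label %.error_exit, label %<original successor>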

llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
    const RelAlgExecutionUnit& ra_exe_unit,
    const CompilationOptions& co) {
  if (!co.filter_on_deleted_column) {
    return nullptr;
  }
  CHECK(!ra_exe_unit.input_descs.empty());
  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
  // ... (only physical input tables carry a delete column)
  const auto& table_key = outer_input_desc.getTableKey();
  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
  if (!deleted_cd) {
    return nullptr;
  }
  CHECK(deleted_cd->columnType.is_boolean());
  const auto deleted_expr =
      makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
                                    shared::ColumnKey{table_key, deleted_cd->columnId},
                                    outer_input_desc.getNestLevel());
  CodeGenerator code_generator(this);
  const auto is_deleted =
      code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
  const auto is_deleted_bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
  cgen_state_->ir_builder_.SetInsertPoint(bb);
  return bb;
}
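// Rows whose hidden delete flag is set simply return 0 from the row function, so
// deleted tuples never reach the filter or aggregate code paths. The returned
// "is_not_deleted" block becomes the new insertion point for the row function body.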

bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
                           GroupByAndAggregate& group_by_and_aggregate,
                           QueryMemoryDescriptor& query_mem_desc,
                           const CompilationOptions& co,
                           const GpuSharedMemoryContext& gpu_smem_context) {
  // generate the code for the filter and the aggregates
  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
  llvm::Value* loop_done{nullptr};
  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
      cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
                                              row_func_entry_bb->begin());
      loop_done = cgen_state_->ir_builder_.CreateAlloca(
          get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
      cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
    }
    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
    cgen_state_->current_func_ = cgen_state_->filter_func_;
    fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
  }

  // generate the code for the filter
  std::vector<Analyzer::Expr*> primary_quals;
  std::vector<Analyzer::Expr*> deferred_quals;
  const auto short_circuited = CodeGenerator::prioritizeQuals(
      ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
  if (short_circuited) {
    VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
            << "short-circuited and deferred " << std::to_string(deferred_quals.size())
            << " quals";
  }
  llvm::Value* filter_lv = cgen_state_->llBool(true);
  CodeGenerator code_generator(this);
  for (auto expr : primary_quals) {
    // the filter condition for the primary quals
    auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
    filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  llvm::BasicBlock* sc_false{nullptr};
  if (!deferred_quals.empty()) {
    auto sc_true = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_true", cgen_state_->current_func_);
    sc_false = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_false", cgen_state_->current_func_);
    cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
    cgen_state_->ir_builder_.SetInsertPoint(sc_false);
    if (ra_exe_unit.join_quals.empty()) {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
    }
    cgen_state_->ir_builder_.SetInsertPoint(sc_true);
    filter_lv = cgen_state_->llBool(true);
  }
  for (auto expr : deferred_quals) {
    filter_lv = cgen_state_->ir_builder_.CreateAnd(
        filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));

  auto ret = group_by_and_aggregate.codegen(
      filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);

  // if a separate filter function is in use, move the insert point back into
  // the row function and emit the placeholder call to the filter function
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    }

    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
    cgen_state_->current_func_ = cgen_state_->row_func_;
    cgen_state_->filter_func_call_ =
        cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});

    // create the real filter function declaration after the placeholder call
    // is emitted
    redeclareFilterFunction();

    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto loop_done_true = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
      auto loop_done_false = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
      auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
          loop_done->getType()->getPointerElementType(), loop_done);
      cgen_state_->ir_builder_.CreateCondBr(
          loop_done_flag, loop_done_true, loop_done_false);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
    } else {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
    }
  }
  return ret;
}
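// Note the asymmetry at the end of compileBody(): when the body sits inside a join
// "loop_body" block, only the loop_done_true path returns the filter function's
// result; loop_done_false deliberately leaves the insert point open so the
// enclosing join-loop codegen can continue emitting the loop there.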

std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
                                                     llvm::Value* byte_stream_arg,
                                                     llvm::IRBuilder<>& ir_builder,
                                                     llvm::LLVMContext& ctx) {
  CHECK(byte_stream_arg);
  const auto max_col_local_id = num_columns - 1;

  std::vector<llvm::Value*> col_heads;
  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
    auto* gep = ir_builder.CreateGEP(
        byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
        byte_stream_arg,
        llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
    auto* load_gep = ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
    load_gep->setName(byte_stream_arg->getName() + "_" + std::to_string(col_id) +
                      "_ptr");
    col_heads.emplace_back(load_gep);
  }
  return col_heads;
}
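// Each column head is a pointer loaded from the i8** byte_stream argument, named
// "<byte_stream>_<col_id>_ptr" to keep the IR readable. The explicit element types
// passed to CreateGEP/CreateLoad via getPointerElementType() are the typed-pointer
// idiom; opaque-pointer LLVM releases deprecate getPointerElementType() in favor of
// carrying the element type separately.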