OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
NvidiaKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "NvidiaKernel.h"
18 #include "Logger/Logger.h"
20 
21 #include <boost/filesystem/operations.hpp>
22 
23 #include <sstream>
24 
25 #ifdef HAVE_CUDA
26 
    : cubin(nullptr), link_state(CUlinkState{}), cubin_size(0u), jit_wall_time_idx(0u) {
  // Pre-size the JIT info/error log buffers and build the parallel
  // option_keys/option_values arrays that are later handed to cuLinkCreate().
  constexpr size_t JIT_LOG_SIZE = 8192u;
  static_assert(0u < JIT_LOG_SIZE);
  info_log.resize(JIT_LOG_SIZE - 1u);  // minus 1 for null terminator
  error_log.resize(JIT_LOG_SIZE - 1u);
  // Option/value pairs are declared together here so they cannot drift out of
  // sync, then split into the two parallel vectors the driver API expects.
  std::pair<CUjit_option, void*> options[] = {
      {CU_JIT_LOG_VERBOSE, reinterpret_cast<void*>(1)},
      // fix the minimum # threads per block to the hardware-limit maximum num threads to
      // avoid recompiling jit module even if we manipulate it via query hint (and allowed
      // `CU_JIT_THREADS_PER_BLOCK` range is between 1 and 1024 by query hint)
      {CU_JIT_THREADS_PER_BLOCK, reinterpret_cast<void*>(1024)},
      {CU_JIT_WALL_TIME, nullptr},  // input not read, only output
      {CU_JIT_INFO_LOG_BUFFER, reinterpret_cast<void*>(&info_log[0])},
      {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, reinterpret_cast<void*>(JIT_LOG_SIZE)},
      {CU_JIT_ERROR_LOG_BUFFER, reinterpret_cast<void*>(&error_log[0])},
      {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, reinterpret_cast<void*>(JIT_LOG_SIZE)}};
  constexpr size_t n_options = sizeof(options) / sizeof(*options);
  option_keys.reserve(n_options);
  option_values.reserve(n_options);
  for (size_t i = 0; i < n_options; ++i) {
    option_keys.push_back(options[i].first);
    option_values.push_back(options[i].second);
    // Remember where the wall-time output slot lives; the driver writes the
    // elapsed JIT time into option_values[jit_wall_time_idx] during linking
    // (presumably read back via jitWallTime() — declared outside this view).
    if (options[i].first == CU_JIT_WALL_TIME) {
      jit_wall_time_idx = i;
    }
  }
  CHECK_EQ(CU_JIT_WALL_TIME, option_keys[jit_wall_time_idx]) << jit_wall_time_idx;
}
56 
57 namespace {
58 
59 boost::filesystem::path get_gpu_rt_path() {
60  boost::filesystem::path gpu_rt_path{heavyai::get_root_abs_path()};
61  gpu_rt_path /= "QueryEngine";
62  gpu_rt_path /= "cuda_mapd_rt.fatbin";
63  if (!boost::filesystem::exists(gpu_rt_path)) {
64  throw std::runtime_error("HeavyDB GPU runtime library not found at " +
65  gpu_rt_path.string());
66  }
67  return gpu_rt_path;
68 }
69 
70 boost::filesystem::path get_cuda_table_functions_path() {
71  boost::filesystem::path cuda_table_functions_path{heavyai::get_root_abs_path()};
72  cuda_table_functions_path /= "QueryEngine";
73  cuda_table_functions_path /= "CudaTableFunctions.a";
74  if (!boost::filesystem::exists(cuda_table_functions_path)) {
75  throw std::runtime_error("HeavyDB GPU table functions module not found at " +
76  cuda_table_functions_path.string());
77  }
78 
79  return cuda_table_functions_path;
80 }
81 
82 } // namespace
83 
84 void nvidia_jit_warmup() {
85  CubinResult cubin_result{};
86  CHECK_EQ(cubin_result.option_values.size(), cubin_result.option_keys.size());
87  unsigned const num_options = cubin_result.option_keys.size();
88  checkCudaErrors(cuLinkCreate(num_options,
89  cubin_result.option_keys.data(),
90  cubin_result.option_values.data(),
91  &cubin_result.link_state))
92  << ": " << cubin_result.error_log.c_str();
93  VLOG(1) << "CUDA JIT time to create link: " << cubin_result.jitWallTime();
94  boost::filesystem::path gpu_rt_path = get_gpu_rt_path();
95  boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path();
96  CHECK(!gpu_rt_path.empty());
97  CHECK(!cuda_table_functions_path.empty());
98  checkCudaErrors(cuLinkAddFile(cubin_result.link_state,
99  CU_JIT_INPUT_FATBINARY,
100  gpu_rt_path.c_str(),
101  0,
102  nullptr,
103  nullptr))
104  << ": " << cubin_result.error_log.c_str();
105  VLOG(1) << "CUDA JIT time to add RT fatbinary: " << cubin_result.jitWallTime();
106  checkCudaErrors(cuLinkAddFile(cubin_result.link_state,
107  CU_JIT_INPUT_LIBRARY,
108  cuda_table_functions_path.c_str(),
109  0,
110  nullptr,
111  nullptr))
112  << ": " << cubin_result.error_log.c_str();
113  VLOG(1) << "CUDA JIT time to add GPU table functions library: "
114  << cubin_result.jitWallTime();
115  checkCudaErrors(cuLinkDestroy(cubin_result.link_state))
116  << ": " << cubin_result.error_log.c_str();
117 }
118 
// Returns `text` with every line prefixed by its 1-based line number
// ("N: line\n"), used to annotate generated PTX in JIT error messages so
// driver-reported line numbers can be matched against the listing.
//
// Fix: the previous `while (iss.good())` loop is the classic stream
// anti-pattern — it appended a spurious empty numbered line whenever the
// input ended with '\n' (generated PTX always does) and produced "1: \n"
// for empty input. Looping on std::getline's result iterates exactly once
// per real line.
std::string add_line_numbers(const std::string& text) {
  std::istringstream iss(text);
  std::string result;
  std::string line;
  size_t count = 1;
  while (std::getline(iss, line, '\n')) {
    result += std::to_string(count) + ": " + line + "\n";
    ++count;
  }
  return result;
}
131 
// JIT-links the generated PTX for a query against the precompiled GPU
// runtime (fatbinary) and the GPU table functions library, producing a
// device-loadable cubin. The returned CubinResult carries the cubin pointer,
// its size, the open CUlinkState, and the captured JIT info/error logs.
// Throws (via the path helpers) if either on-disk module is missing; driver
// failures abort through checkCudaErrors with the JIT error log attached.
CubinResult ptx_to_cubin(const std::string& ptx,
                         const CudaMgr_Namespace::CudaMgr* cuda_mgr) {
  auto timer = DEBUG_TIMER(__func__);
  CHECK(!ptx.empty());
  CHECK(cuda_mgr && cuda_mgr->getDeviceCount() > 0);
  // Linking requires an active CUDA context; device 0 is used for compilation.
  cuda_mgr->setContext(0);
  CubinResult cubin_result{};
  CHECK_EQ(cubin_result.option_values.size(), cubin_result.option_keys.size());
  checkCudaErrors(cuLinkCreate(cubin_result.option_keys.size(),
                               cubin_result.option_keys.data(),
                               cubin_result.option_values.data(),
                               &cubin_result.link_state))
      << ": " << cubin_result.error_log.c_str();
  VLOG(1) << "CUDA JIT time to create link: " << cubin_result.jitWallTime();

  boost::filesystem::path gpu_rt_path = get_gpu_rt_path();
  boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path();
  CHECK(!gpu_rt_path.empty());
  CHECK(!cuda_table_functions_path.empty());
  // How to create a static CUDA library:
  // 1. nvcc -std=c++11 -arch=sm_35 --device-link -c [list of .cu files]
  // 2. nvcc -std=c++11 -arch=sm_35 -lib [list of .o files generated by step 1] -o
  // [library_name.a]
  checkCudaErrors(cuLinkAddFile(cubin_result.link_state,
                                CU_JIT_INPUT_FATBINARY,
                                gpu_rt_path.c_str(),
                                0,
                                nullptr,
                                nullptr))
      << ": " << cubin_result.error_log.c_str();
  VLOG(1) << "CUDA JIT time to add RT fatbinary: " << cubin_result.jitWallTime();
  checkCudaErrors(cuLinkAddFile(cubin_result.link_state,
                                CU_JIT_INPUT_LIBRARY,
                                cuda_table_functions_path.c_str(),
                                0,
                                nullptr,
                                nullptr))
      << ": " << cubin_result.error_log.c_str();
  VLOG(1) << "CUDA JIT time to add GPU table functions library: "
          << cubin_result.jitWallTime();
  // The ptx.length() + 1 follows the example in
  // https://developer.nvidia.com/blog/discovering-new-features-in-cuda-11-4/
  checkCudaErrors(cuLinkAddData(cubin_result.link_state,
                                CU_JIT_INPUT_PTX,
                                static_cast<void*>(const_cast<char*>(ptx.c_str())),
                                ptx.length() + 1,
                                0,
                                0,
                                nullptr,
                                nullptr))
      << ": " << cubin_result.error_log.c_str() << "\nPTX:\n"
      << add_line_numbers(ptx) << "\nEOF PTX";
  VLOG(1) << "CUDA JIT time to add generated code: " << cubin_result.jitWallTime();
  // cuLinkComplete yields a pointer into the linker-owned image; the caller
  // must consume it before the link state is destroyed.
  checkCudaErrors(cuLinkComplete(
      cubin_result.link_state, &cubin_result.cubin, &cubin_result.cubin_size))
      << ": " << cubin_result.error_log.c_str();
  VLOG(1) << "CUDA Linker completed: " << cubin_result.info_log.c_str();
  CHECK(cubin_result.cubin);
  CHECK_LT(0u, cubin_result.cubin_size);
  VLOG(1) << "Generated GPU binary code size: " << cubin_result.cubin_size << " bytes";
  return cubin_result;
}
194 
                                                         const size_t module_size,
                                                         const std::string& kernel_name,
                                                         const int device_id,
                                                         const void* cuda_mgr,
                                                         unsigned int num_options,
                                                         CUjit_option* options,
                                                         void** option_vals)
    : module_(nullptr)
    , module_size_(module_size)
    , kernel_(nullptr)
    , kernel_name_(kernel_name)
    , device_id_(device_id)
    , cuda_mgr_(static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr)) {
  // A CUDA manager is mandatory — module loading goes through it below, so
  // fail fast rather than dereference null later.
  LOG_IF(FATAL, cuda_mgr_ == nullptr)
      << "Unable to initialize GPU compilation context without CUDA manager";
  // Load the compiled module image onto device_id_ using the caller-supplied
  // JIT options, then resolve the kernel entry point by name so kernel_ is
  // ready to launch.
  cuda_mgr_->loadGpuModuleData(
      &module_, image, num_options, options, option_vals, device_id_);
  CHECK(module_);
  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name_.c_str()));
}
216 #endif // HAVE_CUDA
217 
#ifdef HAVE_CUDA
  // Destructor body: unload the module that the constructor loaded onto
  // device_id_. Without CUDA there is nothing to release.
  CHECK(cuda_mgr_);
  cuda_mgr_->unloadGpuModuleData(&module_, device_id_);
#endif
}
#define CHECK_EQ(x, y)
Definition: Logger.h:301
int CUjit_option
Definition: nocuda.h:26
std::string get_root_abs_path()
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void nvidia_jit_warmup()
void setContext(const int device_num) const
Definition: CudaMgr.cpp:511
std::string to_string(char const *&&v)
#define LOG_IF(severity, condition)
Definition: Logger.h:384
int getDeviceCount() const
Definition: CudaMgr.h:90
CubinResult ptx_to_cubin(const std::string &ptx, const CudaMgr_Namespace::CudaMgr *cuda_mgr)
GpuDeviceCompilationContext(const void *image, const size_t module_size, const std::string &kernel_name, const int device_id, const void *cuda_mgr, unsigned int num_options, CUjit_option *options, void **option_vals)
int CUlinkState
Definition: nocuda.h:27
#define CHECK_LT(x, y)
Definition: Logger.h:303
tuple line
Definition: parse_ast.py:10
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
#define VLOG(n)
Definition: Logger.h:388