OmniSciDB  a5dc49c757
GpuMemUtils.cpp File Reference


Namespaces

 anonymous_namespace{GpuMemUtils.cpp}
 

Functions

void copy_to_nvidia_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
 
size_t anonymous_namespace{GpuMemUtils.cpp}::coalesced_size (const QueryMemoryDescriptor &query_mem_desc, const size_t group_by_one_buffer_size, const unsigned grid_size_x)
 
GpuGroupByBuffers create_dev_group_by_buffers (DeviceAllocator *device_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, const bool has_varlen_output, Allocator *insitu_allocator)
 
void copy_group_by_buffers_from_gpu (DeviceAllocator &device_allocator, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const int8_t *group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer, const bool has_varlen_output)
 
size_t get_num_allocated_rows_from_gpu (DeviceAllocator &device_allocator, int8_t *projection_size_gpu, const int device_id)
 
void copy_projection_buffer_from_gpu_columnar (Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
 

Variables

size_t g_max_memory_allocation_size
 
size_t g_min_memory_allocation_size
 
double g_bump_allocator_step_reduction
 

Function Documentation

void copy_group_by_buffers_from_gpu ( DeviceAllocator &  device_allocator,
const std::vector< int64_t * > &  group_by_buffers,
const size_t  groups_buffer_size,
const int8_t *  group_by_dev_buffers_mem,
const QueryMemoryDescriptor &  query_mem_desc,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer,
const bool  has_varlen_output 
)

Definition at line 228 of file GpuMemUtils.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), CHECK_EQ, CHECK_LT, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyFromDevice(), and QueryMemoryDescriptor::getEntryCount().

Referenced by QueryMemoryInitializer::copyGroupByBuffersFromGpu(), and ResultSet::radixSortOnGpu().

{
  if (group_by_buffers.empty()) {
    return;
  }
  const size_t first_group_buffer_idx = has_varlen_output ? 1 : 0;

  const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
  if (block_buffer_count == 1 && !prepend_index_buffer) {
    CHECK_EQ(coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count),
             groups_buffer_size);
    device_allocator.copyFromDevice(group_by_buffers[first_group_buffer_idx],
                                    group_by_dev_buffers_mem,
                                    groups_buffer_size);
    return;
  }
  const size_t index_buffer_sz{
      prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
  std::vector<int8_t> buff_from_gpu(
      coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
      index_buffer_sz);
  device_allocator.copyFromDevice(&buff_from_gpu[0],
                                  group_by_dev_buffers_mem - index_buffer_sz,
                                  buff_from_gpu.size());
  auto buff_from_gpu_ptr = &buff_from_gpu[0];
  for (size_t i = 0; i < block_buffer_count; ++i) {
    const size_t buffer_idx = (i * block_size_x) + first_group_buffer_idx;
    CHECK_LT(buffer_idx, group_by_buffers.size());
    memcpy(group_by_buffers[buffer_idx],
           buff_from_gpu_ptr,
           groups_buffer_size + index_buffer_sz);
    buff_from_gpu_ptr += groups_buffer_size;
  }
}
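
The scatter loop above can be exercised without any engine types. Below is a minimal host-only sketch, assuming made-up values for groups_buffer_size, block_buffer_count, and block_size_x, with plain host memory standing in for the flat device image; it mirrors the (i * block_size_x) stride used to place each block's buffer back into the per-thread host buffer list.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const size_t groups_buffer_size = 64;   // bytes per group-by buffer (illustrative)
  const unsigned block_buffer_count = 4;  // one device buffer per block when blocks do not share
  const unsigned block_size_x = 2;        // threads per block, i.e. the stride between host buffers

  // Flat image as it would arrive from the device: buffers laid out back to back,
  // each filled with its own block index so the scatter is easy to verify.
  std::vector<int8_t> buff_from_gpu(groups_buffer_size * block_buffer_count);
  for (size_t i = 0; i < buff_from_gpu.size(); ++i) {
    buff_from_gpu[i] = static_cast<int8_t>(i / groups_buffer_size);
  }

  // Host-side buffers: one per thread, but only every block_size_x-th entry receives
  // a copy, matching the (i * block_size_x) indexing in copy_group_by_buffers_from_gpu.
  std::vector<std::vector<int8_t>> group_by_buffers(
      block_buffer_count * block_size_x, std::vector<int8_t>(groups_buffer_size));
  const int8_t* src_ptr = buff_from_gpu.data();
  for (size_t i = 0; i < block_buffer_count; ++i) {
    const size_t buffer_idx = i * block_size_x;
    std::memcpy(group_by_buffers[buffer_idx].data(), src_ptr, groups_buffer_size);
    src_ptr += groups_buffer_size;
  }

  std::cout << "buffer 0 holds block " << int(group_by_buffers[0][0]) << "\n"
            << "buffer " << block_size_x << " holds block "
            << int(group_by_buffers[block_size_x][0]) << "\n";
  return 0;
}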


void copy_projection_buffer_from_gpu_columnar ( Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  projection_buffer,
const size_t  projection_count,
const int  device_id 
)

For projection queries, only as many elements as necessary are copied back, rather than the whole output buffer. The goal is to build a compact ResultSet, which is particularly useful for columnar outputs.

NOTE: Saman: we should revisit this function when we have a bump allocator

Definition at line 293 of file GpuMemUtils.cpp.

References align_to_int64(), CHECK, GpuGroupByBuffers::data, QueryMemoryDescriptor::didOutputColumnar(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), getQueryEngineCudaStreamForDevice(), QueryMemoryDescriptor::getSlotCount(), and heavyai::Projection.

Referenced by QueryMemoryInitializer::compactProjectionBuffersGpu().

{
#ifdef HAVE_CUDA
  CHECK(query_mem_desc.didOutputColumnar());
  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
  constexpr size_t row_index_width = sizeof(int64_t);

  auto allocator = std::make_unique<CudaAllocator>(
      data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
  // copy all the row indices back to the host
  allocator->copyFromDevice(
      projection_buffer, gpu_group_by_buffers.data, projection_count * row_index_width);
  size_t buffer_offset_cpu{projection_count * row_index_width};
  // other columns are actual non-lazy columns for the projection:
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      const auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      allocator->copyFromDevice(
          projection_buffer + buffer_offset_cpu,
          gpu_group_by_buffers.data + query_mem_desc.getColOffInBytes(i),
          column_proj_size);
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
#else
  CHECK(false);
#endif  // HAVE_CUDA
}
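
The offset arithmetic that builds the compact host buffer can be worked through in isolation. The sketch below is not the library code: align_to_int64 here is a local stand-in for the helper referenced above, and the slot widths and projection_count are invented for illustration.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Local stand-in for the align_to_int64 helper referenced in the listing above.
static size_t align_to_int64(size_t n) {
  return (n + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1);
}

int main() {
  const size_t projection_count = 1000;            // rows that actually matched
  const size_t row_index_width = sizeof(int64_t);  // row indices are copied first
  const std::vector<int> padded_slot_widths{8, 4, 0, 2};  // zero-width slots are skipped

  size_t buffer_offset_cpu = projection_count * row_index_width;
  for (size_t i = 0; i < padded_slot_widths.size(); ++i) {
    if (padded_slot_widths[i] > 0) {
      const size_t column_proj_size = projection_count * padded_slot_widths[i];
      std::cout << "slot " << i << ": host offset " << buffer_offset_cpu
                << ", bytes copied " << column_proj_size << "\n";
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
  std::cout << "compact projection buffer size: " << buffer_offset_cpu << " bytes\n";
  return 0;
}

With these numbers the row indices occupy the first 8000 bytes, the 8-, 4-, and 2-byte columns follow at offsets 8000, 16000, and 20000, and each column start stays 8-byte aligned because every column's size is rounded up with align_to_int64.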


void copy_to_nvidia_gpu ( Data_Namespace::DataMgr *  data_mgr,
CUdeviceptr  dst,
const void *  src,
const size_t  num_bytes,
const int  device_id 
)

Definition at line 35 of file GpuMemUtils.cpp.

References CHECK, checkCudaErrors(), Data_Namespace::DataMgr::getCudaMgr(), and getQueryEngineCudaStreamForDevice().

Referenced by TreeModelPredictionMgr::createKernelBuffers(), StringDictionaryTranslationMgr::createKernelBuffers(), and anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr().

{
#ifdef HAVE_CUDA
  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
  if (!data_mgr) {  // only for unit tests
    checkCudaErrors(cuMemcpyHtoDAsync(dst, src, num_bytes, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
    return;
  }
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                             static_cast<const int8_t*>(src),
                             num_bytes,
                             device_id,
                             qe_cuda_stream);
#else
  CHECK(false);
#endif  // HAVE_CUDA
}
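
For reference, the !data_mgr branch above reduces to an asynchronous host-to-device copy on a dedicated stream followed by a synchronize. The standalone sketch below reproduces that pattern with the plain CUDA driver API only; it does not use the engine's checkCudaErrors or getQueryEngineCudaStreamForDevice, and the local check() helper and buffer contents are assumptions made for the example.

#include <cuda.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void check(CUresult err, const char* what) {
  if (err != CUDA_SUCCESS) {
    std::fprintf(stderr, "%s failed with CUresult %d\n", what, static_cast<int>(err));
    std::exit(1);
  }
}

int main() {
  check(cuInit(0), "cuInit");
  CUdevice dev;
  check(cuDeviceGet(&dev, 0), "cuDeviceGet");
  CUcontext ctx;
  check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");
  CUstream stream;
  check(cuStreamCreate(&stream, 0), "cuStreamCreate");

  std::vector<int64_t> host(1024, 42);
  const size_t num_bytes = host.size() * sizeof(int64_t);
  CUdeviceptr dst;
  check(cuMemAlloc(&dst, num_bytes), "cuMemAlloc");

  // Same two calls the unit-test branch issues through checkCudaErrors().
  check(cuMemcpyHtoDAsync(dst, host.data(), num_bytes, stream), "cuMemcpyHtoDAsync");
  check(cuStreamSynchronize(stream), "cuStreamSynchronize");

  check(cuMemFree(dst), "cuMemFree");
  check(cuStreamDestroy(stream), "cuStreamDestroy");
  check(cuCtxDestroy(ctx), "cuCtxDestroy");
  return 0;
}

Build with something like nvcc example.cpp -lcuda (or a host compiler plus the CUDA include path and -lcuda); the DataMgr path in the function above instead delegates the copy to CudaMgr::copyHostToDevice for the given device_id.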


GpuGroupByBuffers create_dev_group_by_buffers ( DeviceAllocator *  device_allocator,
const std::vector< int64_t * > &  group_by_buffers,
const QueryMemoryDescriptor &  query_mem_desc,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const int64_t  num_input_rows,
const bool  prepend_index_buffer,
const bool  always_init_group_by_on_host,
const bool  use_bump_allocator,
const bool  has_varlen_output,
Allocator *  insitu_allocator
)

Definition at line 70 of file GpuMemUtils.cpp.

References align_to_int64(), Allocator::alloc(), QueryMemoryDescriptor::blocksShareMemory(), CHECK, CHECK_GT, CHECK_LE, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyToDevice(), g_bump_allocator_step_reduction, g_max_memory_allocation_size, g_min_memory_allocation_size, QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, logger::INFO, KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), LOG, QueryMemoryDescriptor::threadsShareMemory(), to_string(), QueryMemoryDescriptor::varlenOutputBufferElemSize(), and logger::WARNING.

Referenced by QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(), and ResultSet::radixSortOnGpu().

{
  if (group_by_buffers.empty() && !insitu_allocator) {
    return {0, 0, 0, 0};
  }
  CHECK(device_allocator);

  size_t groups_buffer_size{0};
  int8_t* group_by_dev_buffers_mem{nullptr};
  size_t mem_size{0};
  size_t entry_count{0};

  if (use_bump_allocator) {
    CHECK(!prepend_index_buffer);
    CHECK(!insitu_allocator);

    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      // Allocate an output buffer equal to the size of the number of rows in the
      // fragment. The kernel per fragment path is only used for projections with lazy
      // fetched outputs. Therefore, the resulting output buffer should be relatively
      // narrow compared to the width of an input row, offsetting the larger allocation.

      CHECK_GT(num_input_rows, int64_t(0));
      entry_count = num_input_rows;
      groups_buffer_size =
          query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
      mem_size = coalesced_size(query_mem_desc,
                                groups_buffer_size,
                                query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
      // TODO(adb): render allocator support
      group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
    } else {
      // Attempt to allocate increasingly small buffers until we have less than 256B of
      // memory remaining on the device. This may have the side effect of evicting
      // memory allocated for previous queries. However, at current maximum slab sizes
      // (2GB) we expect these effects to be minimal.
      size_t max_memory_size{g_max_memory_allocation_size};
      while (true) {
        entry_count = max_memory_size / query_mem_desc.getRowSize();
        groups_buffer_size =
            query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);

        try {
          mem_size = coalesced_size(query_mem_desc,
                                    groups_buffer_size,
                                    query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
          CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());

          // TODO(adb): render allocator support
          group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
        } catch (const OutOfMemory& e) {
          LOG(WARNING) << e.what();
          max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
          if (max_memory_size < g_min_memory_allocation_size) {
            throw;
          }

          LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                       << std::to_string(max_memory_size) << " bytes";

          continue;
        }
        break;
      }
    }
    LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
              << " bytes allocated (max entry count " << entry_count << ")";
  } else {
    entry_count = query_mem_desc.getEntryCount();
    CHECK_GT(entry_count, size_t(0));
    groups_buffer_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
    mem_size = coalesced_size(query_mem_desc,
                              groups_buffer_size,
                              query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
    const size_t prepended_buff_size{
        prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};

    int8_t* group_by_dev_buffers_allocation{nullptr};
    if (insitu_allocator) {
      group_by_dev_buffers_allocation =
          insitu_allocator->alloc(mem_size + prepended_buff_size);
    } else {
      group_by_dev_buffers_allocation =
          device_allocator->alloc(mem_size + prepended_buff_size);
    }
    CHECK(group_by_dev_buffers_allocation);

    group_by_dev_buffers_mem = group_by_dev_buffers_allocation + prepended_buff_size;
  }
  CHECK_GT(groups_buffer_size, size_t(0));
  CHECK(group_by_dev_buffers_mem);

  CHECK(query_mem_desc.threadsShareMemory());
  const size_t step{block_size_x};

  if (!insitu_allocator && (always_init_group_by_on_host ||
                            !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
    std::vector<int8_t> buff_to_gpu(mem_size);
    auto buff_to_gpu_ptr = buff_to_gpu.data();

    const size_t start = has_varlen_output ? 1 : 0;
    for (size_t i = start; i < group_by_buffers.size(); i += step) {
      memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
      buff_to_gpu_ptr += groups_buffer_size;
    }
    device_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                   buff_to_gpu.data(),
                                   buff_to_gpu.size());
  }

  auto group_by_dev_buffer = group_by_dev_buffers_mem;

  const size_t num_ptrs =
      (block_size_x * grid_size_x) + (has_varlen_output ? size_t(1) : size_t(0));

  std::vector<int8_t*> group_by_dev_buffers(num_ptrs);

  const size_t start_index = has_varlen_output ? 1 : 0;
  for (size_t i = start_index; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      group_by_dev_buffers[i + j] = group_by_dev_buffer;
    }
    if (!query_mem_desc.blocksShareMemory()) {
      group_by_dev_buffer += groups_buffer_size;
    }
  }

  int8_t* varlen_output_buffer{nullptr};
  if (has_varlen_output) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax

    group_by_dev_buffers[0] = device_allocator->alloc(
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value());
    varlen_output_buffer = group_by_dev_buffers[0];
  }

  auto group_by_dev_ptr = device_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
  device_allocator->copyToDevice(group_by_dev_ptr,
                                 reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                                 num_ptrs * sizeof(CUdeviceptr));

  return {group_by_dev_ptr, group_by_dev_buffers_mem, entry_count, varlen_output_buffer};
}
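
The bump-allocator retry loop above can be simulated on the host to see how g_max_memory_allocation_size, g_min_memory_allocation_size, and g_bump_allocator_step_reduction interact. In the sketch below every concrete value (free device memory, row size, starting request, 0.5 step factor, 1 MB floor) is an assumption chosen for illustration rather than the engine's configured defaults, and a simple size comparison stands in for coalesced_size() plus a failing alloc().

#include <cstddef>
#include <iostream>
#include <stdexcept>

int main() {
  const size_t free_device_mem = 1'500'000'000;  // pretend free GPU memory
  const size_t row_size = 32;                    // bytes per output row (illustrative)
  size_t max_memory_size = 2'000'000'000;        // plays g_max_memory_allocation_size
  const size_t min_allocation_size = 1 << 20;    // plays g_min_memory_allocation_size
  const double step_reduction = 0.5;             // plays g_bump_allocator_step_reduction

  while (true) {
    const size_t entry_count = max_memory_size / row_size;
    const size_t mem_size = entry_count * row_size;  // stand-in for coalesced_size(...)
    if (mem_size <= free_device_mem) {               // stand-in for alloc() succeeding
      std::cout << "allocated " << mem_size << " bytes (entry count " << entry_count << ")\n";
      break;
    }
    // Allocation "failed": shrink the request and either retry or give up.
    max_memory_size = static_cast<size_t>(max_memory_size * step_reduction);
    if (max_memory_size < min_allocation_size) {
      throw std::runtime_error("out of memory for projection query output");
    }
    std::cout << "retrying with " << max_memory_size << " bytes\n";
  }
  return 0;
}

With these numbers the initial 2 GB request fails, is halved to 1 GB, and then succeeds; in the real loop the shrinking continues until the request drops below g_min_memory_allocation_size, at which point the original OutOfMemory is rethrown.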


size_t get_num_allocated_rows_from_gpu ( DeviceAllocator &  device_allocator,
int8_t *  projection_size_gpu,
const int  device_id 
)

Returns the total number of allocated rows per device (i.e., the number of matched elements in projections).

TODO(Saman): revisit this for bump allocators

Definition at line 277 of file GpuMemUtils.cpp.

References CHECK, and DeviceAllocator::copyFromDevice().

Referenced by QueryExecutionContext::launchGpuCode().

{
  int32_t num_rows{0};
  device_allocator.copyFromDevice(&num_rows, projection_size_gpu, sizeof(num_rows));
  CHECK(num_rows >= 0);
  return static_cast<size_t>(num_rows);
}
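
A self-contained illustration of this read-back: the device keeps a single int32 counter of matched rows, and the host copies those four bytes back and widens them to size_t. MockAllocator below is a stand-in for the DeviceAllocator interface, introduced only so the sketch compiles and runs on its own; the counter value is made up.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

struct MockAllocator {
  // Mimics the shape of DeviceAllocator::copyFromDevice(host_dst, device_src, num_bytes);
  // here "device" memory is just host memory so the example needs no GPU.
  void copyFromDevice(void* host_dst, const void* device_src, size_t num_bytes) const {
    std::memcpy(host_dst, device_src, num_bytes);
  }
};

size_t get_num_allocated_rows(const MockAllocator& allocator,
                              const int8_t* projection_size_gpu) {
  int32_t num_rows{0};
  allocator.copyFromDevice(&num_rows, projection_size_gpu, sizeof(num_rows));
  assert(num_rows >= 0);  // plays the role of CHECK(num_rows >= 0)
  return static_cast<size_t>(num_rows);
}

int main() {
  int32_t device_counter = 1234;  // value a projection kernel would have written
  const size_t rows = get_num_allocated_rows(
      MockAllocator{}, reinterpret_cast<const int8_t*>(&device_counter));
  std::cout << "matched rows: " << rows << "\n";
  return 0;
}

The returned count tells the caller (for example, QueryExecutionContext::launchGpuCode) how many entries actually need to be copied back, as in copy_projection_buffer_from_gpu_columnar above.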


Variable Documentation

double g_bump_allocator_step_reduction

Definition at line 133 of file Execute.cpp.

size_t g_max_memory_allocation_size

Definition at line 128 of file Execute.cpp.

size_t g_min_memory_allocation_size

Definition at line 129 of file Execute.cpp.