#include "GpuMemUtils.h"

#include "../CudaMgr/CudaMgr.h"
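// Copies num_bytes from host memory at src to device memory at dst on the given
// device. Without a DataMgr the copy goes through the query engine's CUDA stream;
// otherwise it is delegated to the CudaMgr owned by the DataMgr.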
void copy_to_nvidia_gpu(Data_Namespace::DataMgr* data_mgr,
                        CUdeviceptr dst,
                        const void* src,
                        const size_t num_bytes,
                        const int device_id) {
#ifdef HAVE_CUDA
  if (!data_mgr) {  // no DataMgr: copy directly on the query engine's CUDA stream
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuMemcpyHtoDAsync(dst, src, num_bytes, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
    return;
  }
#endif  // HAVE_CUDA
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                             static_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}
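// Size of a single coalesced device allocation backing the per-block output buffers:
// one buffer of group_by_one_buffer_size bytes per block in the grid (e.g. grid_size_x
// = 4 with a 1 MB per-block buffer coalesces into a 4 MB allocation).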
size_t coalesced_size(const QueryMemoryDescriptor& query_mem_desc,
                      const size_t group_by_one_buffer_size,
                      const unsigned grid_size_x) {
  CHECK(query_mem_desc.threadsShareMemory());
  return grid_size_x * group_by_one_buffer_size;
}
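// Allocates the device-side group-by output buffers for a kernel launch and, unless
// the buffers are initialized lazily on the device, seeds them from the host-side
// buffers. Returns the device array of per-thread buffer pointers, the start of the
// coalesced buffer memory, the entry count actually allocated, and the optional
// varlen output buffer.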
GpuGroupByBuffers create_dev_group_by_buffers(
    DeviceAllocator* device_allocator,
    const std::vector<int64_t*>& group_by_buffers,
    const QueryMemoryDescriptor& query_mem_desc,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const int64_t num_input_rows,
    const bool prepend_index_buffer,
    const bool always_init_group_by_on_host,
    const bool use_bump_allocator,
    const bool has_varlen_output,
    Allocator* insitu_allocator) {
  if (group_by_buffers.empty() && !insitu_allocator) {
    return {nullptr, nullptr, 0, nullptr};
  }
  CHECK(device_allocator);

  size_t groups_buffer_size{0};
  int8_t* group_by_dev_buffers_mem{nullptr};
  size_t mem_size{0};
  size_t entry_count{0};
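  // The bump allocator path sizes the output by a memory budget rather than by the
  // query's entry count; it is used for projection query outputs whose final row
  // count is only known at runtime.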
  if (use_bump_allocator) {
    CHECK(!prepend_index_buffer);
    CHECK(!insitu_allocator);

    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      // One kernel per fragment: size the output buffer by the fragment's row count.
      CHECK_GT(num_input_rows, int64_t(0));
      entry_count = num_input_rows;
      groups_buffer_size =
          query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
      mem_size = coalesced_size(query_mem_desc,
                                groups_buffer_size,
                                query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
      group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
    } else {
      // Start from the largest allowed allocation and shrink it until the allocation
      // succeeds or the minimum allocation size is reached.
      size_t max_memory_size{g_max_memory_allocation_size};
      while (true) {
        entry_count = max_memory_size / query_mem_desc.getRowSize();
        CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());
        groups_buffer_size =
            query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
        mem_size = coalesced_size(query_mem_desc,
                                  groups_buffer_size,
                                  query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
        try {
          group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
          break;
        } catch (const std::runtime_error&) {
          // Allocation failed: reduce the requested size and retry, unless we have
          // already dropped below the minimum allocation size.
          max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
          if (max_memory_size < g_min_memory_allocation_size) {
            throw;
          }
          LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                       << max_memory_size << " bytes";
        }
      }
      LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
                << " bytes allocated (max entry count " << entry_count << ")";
    }
  } else {
    entry_count = query_mem_desc.getEntryCount();
    CHECK_GT(entry_count, size_t(0));
    groups_buffer_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
    mem_size = coalesced_size(query_mem_desc,
                              groups_buffer_size,
                              query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
    const size_t prepended_buff_size{
        prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};

    int8_t* group_by_dev_buffers_allocation{nullptr};
    if (insitu_allocator) {
      group_by_dev_buffers_allocation =
          insitu_allocator->alloc(mem_size + prepended_buff_size);
    } else {
      group_by_dev_buffers_allocation =
          device_allocator->alloc(mem_size + prepended_buff_size);
    }
    CHECK(group_by_dev_buffers_allocation);

    group_by_dev_buffers_mem = group_by_dev_buffers_allocation + prepended_buff_size;
  }
  CHECK_GT(groups_buffer_size, size_t(0));
  CHECK(group_by_dev_buffers_mem);

  CHECK(query_mem_desc.threadsShareMemory());
  const size_t step{block_size_x};
  if (!insitu_allocator && (always_init_group_by_on_host ||
                            !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
    std::vector<int8_t> buff_to_gpu(mem_size);
    auto buff_to_gpu_ptr = buff_to_gpu.data();

    // Coalesce the host-initialized buffers (one per block) into a staging buffer and
    // push them to the device in a single transfer.
    const size_t start = has_varlen_output ? 1 : 0;
    for (size_t i = start; i < group_by_buffers.size(); i += step) {
      memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
      buff_to_gpu_ptr += groups_buffer_size;
    }
    device_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                   buff_to_gpu.data(),
                                   buff_to_gpu.size());
  }
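  // Build the per-thread array of output buffer pointers: every thread of a block
  // points at its block's buffer, and slot 0 is reserved for the varlen output buffer
  // when one is present.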
  auto group_by_dev_buffer = group_by_dev_buffers_mem;

  const size_t num_ptrs =
      (block_size_x * grid_size_x) + (has_varlen_output ? size_t(1) : size_t(0));

  std::vector<int8_t*> group_by_dev_buffers(num_ptrs);

  const size_t start_index = has_varlen_output ? 1 : 0;
  for (size_t i = start_index; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      group_by_dev_buffers[i + j] = group_by_dev_buffer;
    }
    if (!query_mem_desc.blocksShareMemory()) {
      // Each block gets its own buffer only when blocks do not share memory.
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  int8_t* varlen_output_buffer{nullptr};
  if (has_varlen_output) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);

    group_by_dev_buffers[0] = device_allocator->alloc(
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value());
    varlen_output_buffer = group_by_dev_buffers[0];
  }
  auto group_by_dev_ptr = device_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
  device_allocator->copyToDevice(group_by_dev_ptr,
                                 reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                                 num_ptrs * sizeof(CUdeviceptr));

  return {group_by_dev_ptr, group_by_dev_buffers_mem, entry_count, varlen_output_buffer};
}
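// Copies the device-side group-by buffers back into the host-side buffers: either a
// single transfer when there is one shared block buffer and no prepended index
// buffer, or a staged copy of the whole coalesced allocation that is then split
// across the per-block host buffers.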
void copy_group_by_buffers_from_gpu(DeviceAllocator& device_allocator,
                                    const std::vector<int64_t*>& group_by_buffers,
                                    const size_t groups_buffer_size,
                                    const int8_t* group_by_dev_buffers_mem,
                                    const QueryMemoryDescriptor& query_mem_desc,
                                    const unsigned block_size_x,
                                    const unsigned grid_size_x,
                                    const int device_id,
                                    const bool prepend_index_buffer,
                                    const bool has_varlen_output) {
  if (group_by_buffers.empty()) {
    return;
  }
  const size_t first_group_buffer_idx = has_varlen_output ? 1 : 0;

  const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
  if (block_buffer_count == 1 && !prepend_index_buffer) {
    // Single shared block buffer and no index buffer: copy it back directly.
    device_allocator.copyFromDevice(group_by_buffers[first_group_buffer_idx],
                                    group_by_dev_buffers_mem,
                                    groups_buffer_size);
    return;
  }
  const size_t index_buffer_sz{
      prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
  std::vector<int8_t> buff_from_gpu(
      coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
      index_buffer_sz);
  // Stage the whole coalesced allocation (including any prepended index buffer) on
  // the host, then split it across the per-block host buffers.
  device_allocator.copyFromDevice(&buff_from_gpu[0],
                                  group_by_dev_buffers_mem - index_buffer_sz,
                                  buff_from_gpu.size());
  auto buff_from_gpu_ptr = &buff_from_gpu[0];
  for (size_t i = 0; i < block_buffer_count; ++i) {
    const size_t buffer_idx = (i * block_size_x) + first_group_buffer_idx;
    CHECK_LT(buffer_idx, group_by_buffers.size());
    memcpy(group_by_buffers[buffer_idx],
           buff_from_gpu_ptr,
           groups_buffer_size + index_buffer_sz);
    buff_from_gpu_ptr += groups_buffer_size;
  }
}
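// Returns the total number of rows allocated on the device for a projection (i.e. the
// number of matched rows), read back from a counter maintained by the device code.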
size_t get_num_allocated_rows_from_gpu(DeviceAllocator& device_allocator,
                                       int8_t* projection_size_gpu,
                                       const int device_id) {
  int32_t num_rows{0};
  device_allocator.copyFromDevice(&num_rows, projection_size_gpu, sizeof(num_rows));
  CHECK(num_rows >= 0);
  return static_cast<size_t>(num_rows);
}
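// Copies a columnar projection result back from the device, transferring only the
// first projection_count entries of the row-index column and of each non-empty slot
// instead of the whole output buffer.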
void copy_projection_buffer_from_gpu_columnar(
    Data_Namespace::DataMgr* data_mgr,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count,
    const int device_id) {
  CHECK(query_mem_desc.didOutputColumnar());
  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
  constexpr size_t row_index_width = sizeof(int64_t);

  auto allocator = std::make_unique<CudaAllocator>(
      data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
  // Copy the row indices (the first column of the columnar projection output).
  allocator->copyFromDevice(
      projection_buffer, gpu_group_by_buffers.data, projection_count * row_index_width);
  size_t buffer_offset_cpu{projection_count * row_index_width};
  // The remaining slots hold the projected (non-lazy) columns; copy back only the
  // first projection_count entries of each non-empty slot.
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      const auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      allocator->copyFromDevice(
          projection_buffer + buffer_offset_cpu,
          gpu_group_by_buffers.data + query_mem_desc.getColOffInBytes(i),
          column_proj_size);
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
}