OmniSciDB
a5dc49c757
|
#include "CompilationOptions.h"
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "../Shared/nocuda.h"
Go to the source code of this file.
Classes | |
struct | GpuGroupByBuffers |
Namespaces | |
CudaMgr_Namespace | |
Data_Namespace | |
Functions | |
void | copy_to_nvidia_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id) |
GpuGroupByBuffers | create_dev_group_by_buffers (DeviceAllocator *device_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, const bool has_varlen_output, Allocator *insitu_allocator) |
void | copy_group_by_buffers_from_gpu (DeviceAllocator &device_allocator, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const int8_t *group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer, const bool has_varlen_output) |
size_t | get_num_allocated_rows_from_gpu (DeviceAllocator &device_allocator, int8_t *projection_size_gpu, const int device_id) |
void | copy_projection_buffer_from_gpu_columnar (Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_query_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id) |
void copy_group_by_buffers_from_gpu | ( | DeviceAllocator & | device_allocator, |
const std::vector< int64_t * > & | group_by_buffers, | ||
const size_t | groups_buffer_size, | ||
const int8_t * | group_by_dev_buffers_mem, | ||
const QueryMemoryDescriptor & | query_mem_desc, | ||
const unsigned | block_size_x, | ||
const unsigned | grid_size_x, | ||
const int | device_id, | ||
const bool | prepend_index_buffer, | ||
const bool | has_varlen_output | ||
) |
Definition at line 228 of file GpuMemUtils.cpp.
References QueryMemoryDescriptor::blocksShareMemory(), CHECK_EQ, CHECK_LT, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyFromDevice(), and QueryMemoryDescriptor::getEntryCount().
Referenced by QueryMemoryInitializer::copyGroupByBuffersFromGpu(), and ResultSet::radixSortOnGpu().
void copy_projection_buffer_from_gpu_columnar | ( | Data_Namespace::DataMgr * | data_mgr, |
const GpuGroupByBuffers & | gpu_group_by_buffers, | ||
const QueryMemoryDescriptor & | query_mem_desc, | ||
int8_t * | projection_buffer, | ||
const size_t | projection_count, | ||
const int | device_id | ||
) |
For projection queries, we only copy back as many elements as necessary, not the whole output buffer. The goal is to be able to build a compact ResultSet, which is particularly useful for columnar outputs.
NOTE(Saman): we should revisit this function when we have a bump allocator.
Definition at line 293 of file GpuMemUtils.cpp.
References align_to_int64(), CHECK, GpuGroupByBuffers::data, QueryMemoryDescriptor::didOutputColumnar(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), getQueryEngineCudaStreamForDevice(), QueryMemoryDescriptor::getSlotCount(), and heavyai::Projection.
Referenced by QueryMemoryInitializer::compactProjectionBuffersGpu().
void copy_to_nvidia_gpu | ( | Data_Namespace::DataMgr * | data_mgr, |
CUdeviceptr | dst, | ||
const void * | src, | ||
const size_t | num_bytes, | ||
const int | device_id | ||
) |
Definition at line 35 of file GpuMemUtils.cpp.
References CHECK, checkCudaErrors(), Data_Namespace::DataMgr::getCudaMgr(), and getQueryEngineCudaStreamForDevice().
Referenced by TreeModelPredictionMgr::createKernelBuffers(), StringDictionaryTranslationMgr::createKernelBuffers(), and anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr().
GpuGroupByBuffers create_dev_group_by_buffers | ( | DeviceAllocator * | device_allocator, |
const std::vector< int64_t * > & | group_by_buffers, | ||
const QueryMemoryDescriptor & | , | ||
const unsigned | block_size_x, | ||
const unsigned | grid_size_x, | ||
const int | device_id, | ||
const ExecutorDispatchMode | dispatch_mode, | ||
const int64_t | num_input_rows, | ||
const bool | prepend_index_buffer, | ||
const bool | always_init_group_by_on_host, | ||
const bool | use_bump_allocator, | ||
const bool | has_varlen_output, | ||
Allocator * | insitu_allocator | ||
) |
Definition at line 70 of file GpuMemUtils.cpp.
References align_to_int64(), Allocator::alloc(), QueryMemoryDescriptor::blocksShareMemory(), CHECK, CHECK_GT, CHECK_LE, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyToDevice(), g_bump_allocator_step_reduction, g_max_memory_allocation_size, g_min_memory_allocation_size, QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, logger::INFO, KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), LOG, QueryMemoryDescriptor::threadsShareMemory(), to_string(), QueryMemoryDescriptor::varlenOutputBufferElemSize(), and logger::WARNING.
Referenced by QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(), and ResultSet::radixSortOnGpu().
size_t get_num_allocated_rows_from_gpu | ( | DeviceAllocator & | device_allocator, |
int8_t * | projection_size_gpu, | ||
const int | device_id | ||
) |
Returns the total number of allocated rows per device (i.e., the number of matched elements in projections).
TODO(Saman): revisit this for bump allocators
Definition at line 277 of file GpuMemUtils.cpp.
References CHECK, and DeviceAllocator::copyFromDevice().
Referenced by QueryExecutionContext::launchGpuCode().