35 namespace CudaMgr_Namespace {
49 class CudaErrorException :
public std::runtime_error {
53 CUresult getStatus()
const {
return status_; }
86 CudaMgr(
const int num_gpus,
const int start_gpu = 0);
97 const int8_t* host_ptr,
98 const size_t num_bytes,
102 const int8_t* device_ptr,
103 const size_t num_bytes,
107 const size_t num_bytes,
108 const int dest_device_num,
109 const int src_device_num,
114 const int device_num,
115 const bool is_slab =
false);
119 const size_t num_bytes,
120 const int device_num,
123 const unsigned char uc,
124 const size_t num_bytes,
125 const int device_num,
143 throw std::runtime_error(
"Specified device number " +
std::to_string(device_num) +
144 " is out of range of number of devices (" +
178 LOG(
WARNING) <<
"Unrecognized Nvidia device architecture, falling back to "
179 "Kepler-compatibility.";
189 switch (device_properties.computeMajor) {
197 if (device_properties.computeMinor < 5) {
218 void logDeviceProperties()
const;
220 const std::vector<CUcontext>& getDeviceContexts()
const {
223 const int getGpuDriverVersion()
const {
224 return gpu_driver_version_;
227 void loadGpuModuleData(
CUmodule* module,
229 unsigned int num_options,
231 void** option_values,
232 const int device_id)
const;
233 void unloadGpuModuleData(
CUmodule* module,
const int device_id)
const;
235 struct CudaMemoryUsage {
240 std::vector<CudaMgr::CudaMemoryUsage> getCudaMemoryUsage();
242 std::string getCudaMemoryUsageInString();
244 DeviceMemoryAllocationMap& getDeviceMemoryAllocationMap();
245 int exportHandle(
const uint64_t handle)
const;
251 void fillDeviceProperties();
252 void initDeviceGroup();
253 void createDeviceContexts();
254 size_t computeMinSharedMemoryPerBlockForAllDevices()
const;
255 size_t computeMinNumMPsForAllDevices()
const;
256 void checkError(
CUresult cu_result)
const;
258 int gpu_driver_version_;
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t min_num_mps_for_all_devices
std::string get_cuda_libdevice_dir(void)
heavyai::DeviceGroup device_group_
std::vector< DeviceIdentifier > DeviceGroup
size_t getGranularity(const int device_num) const
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
int8_t * allocatePinnedHostMem(const size_t num_bytes)
void setContext(const int device_num) const
bool isArchPascalOrLater() const
size_t min_shared_memory_per_block_for_all_devices
size_t getMinNumMPsForAllDevices() const
std::vector< CUcontext > device_contexts_
std::string get_cuda_home(void)
void freeDeviceMem(int8_t *device_ptr)
std::string errorMessage(CUresult const status)
int getDeviceCount() const
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
size_t getMinSharedMemoryPerBlockForAllDevices() const
size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
size_t allocationGranularity
bool isArchMaxwellOrLaterForAll() const
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
std::vector< DeviceProperties > device_properties_
void freePinnedHostMem(int8_t *host_ptr)
virtual int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num, const bool is_slab=false)
void synchronizeDevices() const
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
const DeviceProperties * getDeviceProperties(const size_t device_num) const
bool isArchMaxwell() const
const heavyai::DeviceGroup & getDeviceGroup() const
bool isArchPascal() const
CudaMgr(const int num_gpus, const int start_gpu=0)
std::unique_ptr< DeviceMemoryAllocationMap > DeviceMemoryAllocationMapUqPtr
const std::vector< DeviceProperties > & getAllDeviceProperties() const
bool isArchVoltaOrGreaterForAll() const
NvidiaDeviceArch getDeviceArch() const
bool isArchMaxwellOrLater() const