The declarations summarized below come from CudaMgr.h and live inside namespace CudaMgr_Namespace; the public members of class CudaMgr are listed with their full signatures.
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
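A minimal round-trip sketch pairing copyHostToDevice with copyDeviceToHost (listed further down), assuming the header is reachable as "CudaMgr/CudaMgr.h" and that the CudaMgr instance manages at least one GPU; allocateDeviceMem and freeDeviceMem are the allocation members listed below.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Upload a host buffer to GPU 0 and read it back through the same manager.
void round_trip(CudaMgr_Namespace::CudaMgr& cuda_mgr) {
  const size_t num_bytes = 1024;
  std::vector<int8_t> host_src(num_bytes, 7);
  std::vector<int8_t> host_dst(num_bytes, 0);

  // Device allocation owned by the manager; is_slab is left at its default.
  int8_t* device_ptr = cuda_mgr.allocateDeviceMem(num_bytes, /*device_num=*/0);

  // CUstream defaults to 0 (the legacy default stream) in both calls.
  cuda_mgr.copyHostToDevice(device_ptr, host_src.data(), num_bytes, /*device_num=*/0);
  cuda_mgr.copyDeviceToHost(host_dst.data(), device_ptr, num_bytes);

  cuda_mgr.freeDeviceMem(device_ptr);
}
```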
size_t getGranularity(const int device_num) const
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
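A sketch of filling device memory through setDeviceMem and clearing it again with zeroDeviceMem (listed below); the buffer is assumed to have been obtained from allocateDeviceMem on device 0.

```cpp
#include <cstddef>
#include <cstdint>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Fill an existing device buffer with 0xFF, then zero it again.
void fill_and_clear(CudaMgr_Namespace::CudaMgr& cuda_mgr,
                    int8_t* device_ptr,
                    const size_t num_bytes) {
  cuda_mgr.setDeviceMem(device_ptr, 0xFF, num_bytes, /*device_num=*/0);
  cuda_mgr.zeroDeviceMem(device_ptr, num_bytes, /*device_num=*/0);
}
```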
int8_t *allocatePinnedHostMem(const size_t num_bytes)
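A sketch of staging data in pinned (page-locked) host memory obtained from allocatePinnedHostMem before an upload, released afterwards with freePinnedHostMem (listed below).

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Stage data in pinned host memory, upload it to device 0, then release the staging buffer.
void upload_via_pinned(CudaMgr_Namespace::CudaMgr& cuda_mgr,
                       const int8_t* src,
                       int8_t* device_ptr,
                       const size_t num_bytes) {
  int8_t* pinned = cuda_mgr.allocatePinnedHostMem(num_bytes);
  std::memcpy(pinned, src, num_bytes);
  cuda_mgr.copyHostToDevice(device_ptr, pinned, num_bytes, /*device_num=*/0);
  cuda_mgr.freePinnedHostMem(pinned);
}
```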
void setContext(const int device_num) const
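setContext makes the given device's CUDA context current on the calling thread; a sketch of calling it before a raw driver-API query (cuMemGetInfo) on each managed device. The device count is passed in because this listing does not show a getter for it.

```cpp
#include <cstdio>

#include <cuda.h>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Report free/total memory on each device after making its context current.
void report_memory(const CudaMgr_Namespace::CudaMgr& cuda_mgr, const int device_count) {
  for (int device_num = 0; device_num < device_count; ++device_num) {
    cuda_mgr.setContext(device_num);  // const member: only switches the current context
    size_t free_bytes = 0;
    size_t total_bytes = 0;
    cuMemGetInfo(&free_bytes, &total_bytes);
    std::printf("device %d: %zu of %zu bytes free\n", device_num, free_bytes, total_bytes);
  }
}
```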
void freeDeviceMem(int8_t *device_ptr)
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
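Because copyDeviceToDevice takes both a destination and a source device number, it can move data between two GPUs managed by the same CudaMgr; a sketch assuming buffers already allocated on devices 0 and 1.

```cpp
#include <cstddef>
#include <cstdint>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Copy num_bytes from a buffer on device 0 to a buffer on device 1.
// Both pointers are assumed to come from allocateDeviceMem on their respective devices.
void copy_gpu0_to_gpu1(CudaMgr_Namespace::CudaMgr& cuda_mgr,
                       int8_t* dest_on_gpu1,
                       int8_t* src_on_gpu0,
                       const size_t num_bytes) {
  cuda_mgr.copyDeviceToDevice(dest_on_gpu1, src_on_gpu0, num_bytes,
                              /*dest_device_num=*/1, /*src_device_num=*/0);
}
```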
size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const
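getGranularity (listed above) reports the device's allocation granularity, and computePaddedBufferSize rounds a requested size up to a multiple of it; a sketch combining the two before allocating.

```cpp
#include <cstddef>
#include <cstdint>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Round a requested size up to the device's allocation granularity, then allocate.
int8_t* allocate_padded(CudaMgr_Namespace::CudaMgr& cuda_mgr,
                        const size_t requested_bytes,
                        const int device_num) {
  const size_t granularity = cuda_mgr.getGranularity(device_num);
  const size_t padded_bytes = cuda_mgr.computePaddedBufferSize(requested_bytes, granularity);
  return cuda_mgr.allocateDeviceMem(padded_bytes, device_num);
}
```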
bool isArchMaxwellOrLaterForAll() const
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
void freePinnedHostMem(int8_t *host_ptr)
virtual int8_t *allocateDeviceMem(const size_t num_bytes, const int device_num, const bool is_slab=false)
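allocateDeviceMem is virtual and accepts an is_slab flag; its exact effect is not spelled out in this listing, so the sketch below only shows the call shape for a large, slab-style allocation paired with freeDeviceMem.

```cpp
#include <cstddef>
#include <cstdint>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Allocate a large buffer on device 0, flagged as a slab, and release it again.
void slab_example(CudaMgr_Namespace::CudaMgr& cuda_mgr) {
  const size_t slab_bytes = size_t(256) << 20;  // 256 MiB, an arbitrary illustrative size
  int8_t* slab = cuda_mgr.allocateDeviceMem(slab_bytes, /*device_num=*/0, /*is_slab=*/true);
  // ... use the slab ...
  cuda_mgr.freeDeviceMem(slab);
}
```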
void synchronizeDevices() const
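synchronizeDevices waits for outstanding work on every managed device; a sketch that issues a copy on a user-created stream and then blocks until all devices are idle. The stream handling is plain CUDA driver API and only illustrative.

```cpp
#include <cstddef>
#include <cstdint>

#include <cuda.h>

#include "CudaMgr/CudaMgr.h"  // assumed include path

// Issue a copy on a user-created stream, then wait for every managed device to go idle.
void copy_then_sync(CudaMgr_Namespace::CudaMgr& cuda_mgr,
                    int8_t* device_ptr,
                    const int8_t* host_ptr,
                    const size_t num_bytes) {
  cuda_mgr.setContext(0);  // make device 0's context current before creating the stream
  CUstream stream;
  cuStreamCreate(&stream, CU_STREAM_DEFAULT);

  cuda_mgr.copyHostToDevice(device_ptr, host_ptr, num_bytes, /*device_num=*/0, stream);
  cuda_mgr.synchronizeDevices();  // waits on all managed devices, not just this stream

  cuStreamDestroy(stream);
}
```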
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
CudaMgr(const int num_gpus, const int start_gpu=0)
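A construction sketch: num_gpus is taken here to be the number of GPUs to manage and start_gpu the first device ordinal; the concrete values are illustrative, and the semantics of passing 0 for num_gpus are not documented in this listing.

```cpp
#include <memory>

#include "CudaMgr/CudaMgr.h"  // assumed include path

int main() {
  // Manage a single GPU, starting at device ordinal 0 (values chosen for illustration).
  auto cuda_mgr =
      std::make_unique<CudaMgr_Namespace::CudaMgr>(/*num_gpus=*/1, /*start_gpu=*/0);

  cuda_mgr->synchronizeDevices();  // simple sanity check that the device is reachable
  return 0;
}
```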
bool isArchVoltaOrGreaterForAll() const
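The two architecture predicates report whether every managed GPU is at least Maxwell-class or at least Volta-class, respectively; a sketch using them to choose a code path that requires a minimum architecture across all devices.

```cpp
#include "CudaMgr/CudaMgr.h"  // assumed include path

// Pick a code path based on the weakest architecture among all managed GPUs.
void choose_code_path(const CudaMgr_Namespace::CudaMgr& cuda_mgr) {
  if (cuda_mgr.isArchVoltaOrGreaterForAll()) {
    // every device is Volta or newer: enable the most feature-rich path
  } else if (cuda_mgr.isArchMaxwellOrLaterForAll()) {
    // every device is at least Maxwell: use a more conservative path
  } else {
    // at least one managed GPU predates Maxwell
  }
}
```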