#include <boost/filesystem.hpp>

namespace CudaMgr_Namespace {
CudaErrorException::CudaErrorException(CUresult status)
    : std::runtime_error(errorMessage(status)), status_(status) {
  // CUDA can already be de-initialized during system shutdown; skip logging the
  // stacktrace in that case to avoid failing during a normal teardown.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    VLOG(1) << boost::stacktrace::stacktrace();
  }
}
std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  return errorString
             ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
             : "CUDA Driver API error code " + std::to_string(status);
}
CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu)
    , min_shared_memory_per_block_for_all_devices(0)
    , min_num_mps_for_all_devices(0)
    , device_memory_allocation_map_{std::make_unique<DeviceMemoryAllocationMap>()} {
  checkError(cuInit(0));
  // ... (device count discovery elided)
  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  logDeviceProperties();

  // Warm up the GPU JIT compiler so the first query does not pay that cost.
  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
  // ... (warm-up call elided)
  LOG(INFO) << "GPU JIT Compiler initialized.";
}
void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    // ... (registration of each device into device_group_ elided)
  }
}
CudaMgr::~CudaMgr() {
  try {
    // The allocation map must already be empty before teardown.
    CHECK(getDeviceMemoryAllocationMap().mapEmpty());
    device_memory_allocation_map_ = nullptr;
    // ... (per-device context destruction elided)
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // The driver is already de-initialized; nothing left to clean up.
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
size_t CudaMgr::computePaddedBufferSize(size_t buf_size, size_t granularity) const {
  // Round buf_size up to the nearest multiple of granularity.
  return (((buf_size + (granularity - 1)) / granularity) * granularity);
}
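// Worked example (illustrative values, not from this file): with buf_size = 100
// and granularity = 64, the integer arithmetic gives
// ((100 + 63) / 64) * 64 = (163 / 64) * 64 = 2 * 64 = 128,
// i.e. the smallest multiple of 64 that can hold 100 bytes.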
size_t CudaMgr::getGranularity(const int device_num) const {
  CUmemAllocationProp allocation_prop{};
  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  allocation_prop.location.id = device_num;
  size_t granularity{};
  checkError(cuMemGetAllocationGranularity(
      &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}
void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}
void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num,
                               CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(
        cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
  } else {
    checkError(cuMemcpyHtoDAsync(
        reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
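// Usage sketch (hypothetical buffer, not from this file): a null stream takes
// the synchronous cuMemcpyHtoD path, while a non-null stream uses the async
// copy followed by cuStreamSynchronize, so both variants block until done.
//
//   std::vector<int8_t> host_buf(1024, 0);
//   int8_t* dev_buf = cuda_mgr->allocateDeviceMem(host_buf.size(), /*device_num=*/0);
//   cuda_mgr->copyHostToDevice(dev_buf, host_buf.data(), host_buf.size(),
//                              /*device_num=*/0);  // default stream, synchronous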
void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               CUstream cuda_stream) {
  // Look up the owning allocation to find the device and bounds-check the copy.
  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
  auto const [allocation_base, allocation] =
      getDeviceMemoryAllocationMap().getAllocation(cu_device_ptr);
  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation.size);
  setContext(allocation.device_num);
  if (!cuda_stream) {
    checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
  } else {
    checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num,
                                 CUstream cuda_stream) {
  // Device numbers are relative to start_gpu_ (real_device_num - start_gpu_).
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    if (!cuda_stream) {
      checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                          reinterpret_cast<CUdeviceptr>(src_ptr),
                          num_bytes));
    } else {
      checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
                               reinterpret_cast<CUdeviceptr>(src_ptr),
                               num_bytes,
                               cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  } else {
    if (!cuda_stream) {
      checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                              device_contexts_[dest_device_num],
                              reinterpret_cast<CUdeviceptr>(src_ptr),
                              device_contexts_[src_device_num],
                              num_bytes));
    } else {
      checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
                                   device_contexts_[dest_device_num],
                                   reinterpret_cast<CUdeviceptr>(src_ptr),
                                   device_contexts_[src_device_num],
                                   num_bytes,
                                   cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  }
}
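// Note on the cross-device branch above: cuMemcpyPeer / cuMemcpyPeerAsync take
// the destination and source contexts explicitly, which is presumably why the
// per-device contexts created in createDeviceContexts() are passed alongside
// each pointer; same-device copies can rely on the current context instead.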
void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}
void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // If the CUDA driver has already shut down, ignore the resulting error.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
std::vector<CudaMgr::CudaMemoryUsage> CudaMgr::getCudaMemoryUsage() {
  std::vector<CudaMgr::CudaMemoryUsage> m;
  // Remember the current context so it can be restored after the query loop.
  CUcontext cnow;
  checkError(cuCtxGetCurrent(&cnow));
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    setContext(device_num);
    CudaMemoryUsage usage;
    cuMemGetInfo(&usage.free, &usage.total);
    m.push_back(usage);
  }
  cuCtxSetCurrent(cnow);
  return m;
}
std::string CudaMgr::getCudaMemoryUsageInString() {
  auto const device_mem_status = getCudaMemoryUsage();
  std::ostringstream oss;
  int device_id = 0;
  oss << "{ \"name\": \"GPU Memory Info\", ";
  for (auto& info : device_mem_status) {
    oss << "{\"device_id\": " << device_id++ << ", \"freeMB\": " << info.free / 1048576.0
        << ", \"totalMB\": " << info.total / 1048576.0 << "} ";
  }
  oss << "}";
  return oss.str();
}
void CudaMgr::fillDeviceProperties() {
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    // One cuDeviceGetAttribute call per property; the destination members are
    // elided here, but the attributes queried for each device are:
    //   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
    //   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
    //   CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
    //   CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
    //   CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
    //   CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
    //   CU_DEVICE_ATTRIBUTE_WARP_SIZE,
    //   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
    //   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
    //   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
    //   CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
    //   CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
    //   CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
    //   CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
    //   CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH
  }
  min_shared_memory_per_block_for_all_devices =
      computeMinSharedMemoryPerBlockForAllDevices();
  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
}
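// Illustrative shape of one such query (the member name computeMajor is an
// assumption, not taken from this excerpt):
//
//   checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
//                                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
//                                   device_properties_[device_num].device));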
int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}
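// CU_MEMHOSTALLOC_PORTABLE makes the pinned allocation usable from every CUDA
// context, not just the current one; page-locked host memory is also what lets
// the driver DMA host<->device transfers without an intermediate staging copy.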
int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes,
                                   const int device_num,
                                   const bool is_slab) {
  setContext(device_num);
  CUdeviceptr device_ptr{};
  CUmemGenericAllocationHandle handle{};
  // Reserve a virtual address range padded to the allocation granularity.
  auto const granularity = getGranularity(device_num);
  auto const padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);

  if (status == CUDA_SUCCESS) {
    // Create the physical allocation on the target device.
    CUmemAllocationProp allocation_prop{};
    allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    allocation_prop.location.id = device_num + start_gpu_;
    status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);

    if (status == CUDA_SUCCESS) {
      // Map the physical allocation into the reserved address range.
      status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);

      if (status == CUDA_SUCCESS) {
        // Grant the owning device read/write access to the mapping.
        CUmemAccessDesc access_desc{};
        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access_desc.location.id = device_num + start_gpu_;
        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
      }
    }
  }

  if (status != CUDA_SUCCESS) {
    // Unwind whatever succeeded, in reverse order, then rethrow.
    if (device_ptr && handle) {
      cuMemUnmap(device_ptr, padded_num_bytes);
    }
    if (handle) {
      cuMemRelease(handle);
    }
    if (device_ptr) {
      cuMemAddressFree(device_ptr, padded_num_bytes);
    }
    throw CudaErrorException(status);
  }
  auto const& device_uuid = getDeviceProperties(device_num)->uuid;
  getDeviceMemoryAllocationMap().addAllocation(
      device_ptr, padded_num_bytes, handle, device_uuid, device_num, is_slab);
  getDeviceMemoryAllocationMap().notifyMapChanged(device_uuid, is_slab);
  return reinterpret_cast<int8_t*>(device_ptr);
}
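// The allocation above follows the CUDA virtual memory management (VMM)
// sequence: cuMemAddressReserve (virtual range) -> cuMemCreate (physical
// memory) -> cuMemMap (bind the two) -> cuMemSetAccess (permissions).
// Requesting CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR at creation time is what
// later allows exportHandle() to share the allocation across processes.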
void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
  auto allocation = getDeviceMemoryAllocationMap().removeAllocation(cu_device_ptr);
  // Attempt all three teardown operations, even if an earlier one fails...
  auto status_unmap = cuMemUnmap(cu_device_ptr, allocation.size);
  auto status_release = cuMemRelease(allocation.handle);
  auto status_free = cuMemAddressFree(cu_device_ptr, allocation.size);
  // ...then report any errors in order.
  checkError(status_unmap);
  checkError(status_release);
  checkError(status_free);
  getDeviceMemoryAllocationMap().notifyMapChanged(allocation.device_uuid,
                                                  allocation.is_slab);
}
void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num,
                            CUstream cuda_stream) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
}
void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num,
                           CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
  } else {
    checkError(cuMemsetD8Async(
        reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int device_num = 1; device_num < device_count_; device_num++) {
    shared_mem_size =
        std::min(shared_mem_size, device_properties_[device_num].sharedMemPerBlock);
  }
  return shared_mem_size;
}
size_t CudaMgr::computeMinNumMPsForAllDevices() const {
  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
  for (int device_num = 1; device_num < device_count_; device_num++) {
    num_mps = std::min(num_mps, device_properties_[device_num].numMPs);
  }
  return num_mps;
}
void CudaMgr::createDeviceContexts() {
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // Destroy every context created so far before reporting the failure.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ")";
        }
      }
      // checkError translates the status into an exception and throws.
      checkError(status);
    }
  }
}
void CudaMgr::setContext(const int device_num) const {
  // device_num is relative to start_gpu_ (real_device_num - start_gpu_).
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}
int CudaMgr::getContext() const {
  CUcontext cnow;
  checkError(cuCtxGetCurrent(&cnow));
  if (cnow == NULL) {
    throw std::runtime_error("no cuda device context");
  }
  // Find which device owns the current context.
  int device_num{0};
  for (auto& c : device_contexts_) {
    if (c == cnow) {
      return device_num;
    }
    ++device_num;
  }
  throw std::runtime_error("invalid cuda device context");
}
void CudaMgr::logDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " Gpus.";
  for (int d = 0; d < device_count_; ++d) {
    // ... (additional per-device property VLOGs elided)
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP << " bytes";
  }
}
void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}
DeviceMemoryAllocationMap& CudaMgr::getDeviceMemoryAllocationMap() {
  CHECK(device_memory_allocation_map_);
  return *device_memory_allocation_map_;
}
int CudaMgr::exportHandle(const uint64_t handle) const {
  int fd;
  checkError(cuMemExportToShareableHandle(
      &fd, handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
  return fd;
}
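// The returned POSIX file descriptor can be handed to another process (for
// example over a Unix domain socket), which can then presumably map the same
// physical allocation via cuMemImportFromShareableHandle. This only works
// because allocateDeviceMem() requested
// CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR when creating the allocation.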
}  // namespace CudaMgr_Namespace

std::string get_cuda_home(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // Fall back to the default CUDA directory if it exists.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Check that the CUDA directory is sensible.
  auto cuda_include_dir = env + std::string("/include");
  auto cuda_h_file = cuda_include_dir + "/cuda.h";
  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
    LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
                 << env << "` as CUDA installation path.";
    return "";
  }

  return std::string(env);
}
std::string get_cuda_libdevice_dir(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // Fall back to the default CUDA directory if it exists.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Check that libdevice exists under the CUDA directory.
  auto libdevice_dir = env + std::string("/nvvm/libdevice");
  auto libdevice_bc_file = libdevice_dir + "/libdevice.10.bc";
  if (!boost::filesystem::exists(boost::filesystem::path(libdevice_bc_file))) {
    LOG(WARNING) << "`" << libdevice_bc_file << "` does not exist. Discarding `" << env
                 << "` as CUDA installation path with libdevice.";
    return "";
  }

  return libdevice_dir;
}
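// Note: libdevice.10.bc is NVIDIA's LLVM bitcode library of device math
// functions; a JIT that compiles to PTX through LLVM/NVVM is expected to link
// it into generated modules, which is presumably why its directory is located
// here alongside the CUDA home lookup above.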