28 cuEventCreate(&start_, 0);
29 cuEventCreate(&stop_, 0);
31 void start()
override { cuEventRecord(start_, 0); }
33 cuEventRecord(stop_, 0);
34 cuEventSynchronize(stop_);
36 cuEventElapsedTime(&ms, start_, stop_);
41 CUevent start_, stop_;
49 name_ = cuda_ctx->name(device_id);
50 const auto native_code = cuda_ctx->getNativeCode(device_id);
51 function_ptr =
static_cast<CUfunction>(native_code.first);
52 module_ptr =
static_cast<CUmodule>(native_code.second);
55 void launch(
unsigned int gridDimX,
56 unsigned int gridDimY,
57 unsigned int gridDimZ,
58 unsigned int blockDimX,
59 unsigned int blockDimY,
60 unsigned int blockDimZ,
61 unsigned int sharedMemBytes,
63 bool optimize_block_and_grid_sizes)
override {
65 if (optimize_block_and_grid_sizes) {
66 int recommended_block_size;
67 int recommended_grid_size;
68 std::ostringstream oss;
69 checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&recommended_grid_size,
70 &recommended_block_size,
75 if (static_cast<unsigned int>(recommended_block_size) != blockDimX) {
76 VLOG(1) <<
"Apply a recommended CUDA block size: " << recommended_block_size
77 <<
" (current: " << blockDimX <<
")";
78 blockDimX = recommended_block_size;
80 if (static_cast<unsigned int>(recommended_grid_size) != gridDimX) {
81 VLOG(1) <<
"Apply a recommended CUDA grid size: " << recommended_grid_size
82 <<
" (current: " << gridDimX <<
")";
83 gridDimX = recommended_grid_size;
86 VLOG(1) <<
"Launch GPU kernel compiled with the following block and grid sizes: "
87 << blockDimX <<
" and " << gridDimX;
105 cuEventCreate(&start, 0);
106 cuEventCreate(&stop, 0);
107 cuEventRecord(start, 0);
110 size_t dw_cycle_budget_size;
112 if (device_id == 0) {
113 LOG(
INFO) <<
"Dynamic Watchdog budget: GPU: "
119 &dw_cycle_budget, &dw_cycle_budget_size, module_ptr,
"dw_cycle_budget"));
120 CHECK_EQ(dw_cycle_budget_size,
sizeof(uint64_t));
122 reinterpret_cast<void*>(&cycle_budget),
128 size_t dw_sm_cycle_start_size;
130 &dw_sm_cycle_start, &dw_sm_cycle_start_size, module_ptr,
"dw_sm_cycle_start"));
131 CHECK_EQ(dw_sm_cycle_start_size, 128 *
sizeof(uint64_t));
132 checkCudaErrors(cuMemsetD32Async(dw_sm_cycle_start, 0, 128 * 2, qe_cuda_stream));
135 if (!could_interrupt) {
139 size_t dw_abort_size;
141 cuModuleGetGlobal(&dw_abort, &dw_abort_size, module_ptr,
"dw_abort"));
142 CHECK_EQ(dw_abort_size,
sizeof(uint32_t));
147 cuEventRecord(stop, 0);
148 cuEventSynchronize(stop);
149 float milliseconds = 0;
150 cuEventElapsedTime(&milliseconds, start, stop);
152 <<
": launchGpuCode: dynamic watchdog init: " <<
std::to_string(milliseconds)
159 cuEventCreate(&start, 0);
160 cuEventCreate(&stop, 0);
161 cuEventRecord(start, 0);
164 size_t runtime_interrupt_flag_size;
166 &runtime_interrupt_flag_size,
168 "runtime_interrupt_flag"));
169 CHECK_EQ(runtime_interrupt_flag_size,
sizeof(uint32_t));
171 checkCudaErrors(cuMemsetD32Async(runtime_interrupt_flag, 0, 1, qe_cuda_stream));
174 cuEventRecord(stop, 0);
175 cuEventSynchronize(stop);
176 float milliseconds = 0;
177 cuEventElapsedTime(&milliseconds, start, stop);
179 <<
": launchGpuCode: runtime query interrupter init: "
188 std::unique_ptr<DeviceClock>
make_clock()
override {
189 return std::make_unique<CudaEventClock>();
192 char const*
name()
const override {
return name_.c_str(); }
205 return std::make_unique<NvidiaKernel>(ctx, device_id);
virtual char const * name() const =0
static void registerActiveModule(void *module, const int device_id)
__device__ int64_t dw_sm_cycle_start[128]
void checkCudaErrors(CUresult err)
unsigned long long CUdeviceptr
__device__ int64_t dw_cycle_budget
virtual void initializeRuntimeInterrupter(const int device_id)=0
virtual std::unique_ptr< DeviceClock > make_clock()=0
virtual void launch(unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, void **kernelParams, bool optimize_block_and_grid_sizes)=0
static void unregisterActiveModule(const int device_id)
__device__ int32_t runtime_interrupt_flag
virtual void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget)=0
std::unique_ptr< DeviceKernel > create_device_kernel(const CompilationContext *ctx, int device_id)
CUstream getQueryEngineCudaStreamForDevice(int device_num)
virtual void resetRuntimeInterrupter(const int device_id)=0
unsigned g_dynamic_watchdog_time_limit
__device__ int32_t dw_abort