22 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
24 gpu_active_modules_device_mask_ |= (1 << device_id);
25 gpu_active_modules_[device_id] = module;
26 VLOG(1) <<
"Registered module " << module <<
" on device " <<
std::to_string(device_id);
32 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
34 if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
37 gpu_active_modules_device_mask_ ^= (1 << device_id);
43 const std::string& interrupt_session) {
44 const auto allow_interrupt =
46 if (allow_interrupt) {
47 bool is_running_query =
false;
52 executor_session_mutex_);
53 if (!checkIsQuerySessionEnrolled(query_session, session_read_lock)) {
54 VLOG(1) <<
"Skip the interrupt request (no query has been submitted from the "
55 "given query session)";
58 if (checkIsQuerySessionInterrupted(query_session, session_read_lock)) {
59 VLOG(1) <<
"Skip the interrupt request (already interrupted query session)";
64 is_running_query = checkCurrentQuerySession(query_session, session_read_lock);
74 executor_session_mutex_);
75 setQuerySessionAsInterrupted(query_session, session_write_lock);
77 if (!is_running_query) {
81 interrupted_.store(
true);
89 bool CPU_execution_mode =
true;
101 CHECK_GE(cuda_mgr->getDeviceCount(), 1);
102 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
105 for (
int device_id = 0; device_id < max_gpu_count; device_id++) {
106 if (gpu_active_modules_device_mask_ & (1 << device_id)) {
107 void* llvm_module = gpu_active_modules_[device_id];
108 auto cu_module =
static_cast<CUmodule>(llvm_module);
112 VLOG(1) <<
"Try to interrupt the running query on GPU assigned to Executor "
114 CPU_execution_mode =
false;
116 cuda_mgr->setContext(device_id);
121 cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));
124 cuEventCreate(&start, 0);
125 cuEventCreate(&stop, 0);
126 cuEventRecord(start, cu_stream1);
130 size_t dw_abort_size;
131 if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module,
"dw_abort") ==
133 CHECK_EQ(dw_abort_size,
sizeof(uint32_t));
134 int32_t abort_val = 1;
136 reinterpret_cast<void*>(&abort_val),
140 if (device_id == 0) {
141 VLOG(1) <<
"GPU: Async Abort submitted to Device "
147 if (allow_interrupt) {
149 size_t runtime_interrupt_flag_size;
150 auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
151 &runtime_interrupt_flag_size,
153 "runtime_interrupt_flag");
154 if (status == CUDA_SUCCESS) {
155 VLOG(1) <<
"Executor " << executor_id_
156 <<
" retrieves interrupt status from GPU " << device_id;
157 CHECK_EQ(runtime_interrupt_flag_size,
sizeof(uint32_t));
158 int32_t abort_val = 1;
160 reinterpret_cast<void*>(&abort_val),
163 if (device_id == 0) {
164 VLOG(1) <<
"GPU: send interrupt signal from Executor " << executor_id_
167 }
else if (status == CUDA_ERROR_NOT_FOUND) {
169 "Runtime query interrupt on Executor " +
std::to_string(executor_id_) +
170 " has failed: an interrupt flag on the GPU could "
171 "not be initialized (CUDA_ERROR_CODE: CUDA_ERROR_NOT_FOUND)");
177 const char* error_ret_str =
nullptr;
178 cuGetErrorName(status, &error_ret_str);
179 if (!error_ret_str) {
180 error_ret_str =
"UNKNOWN";
182 std::string error_str(error_ret_str);
187 "(CUDA_ERROR_CODE: " +
191 cuEventRecord(stop, cu_stream1);
192 cuEventSynchronize(stop);
193 float milliseconds = 0;
194 cuEventElapsedTime(&milliseconds, start, stop);
196 <<
": submitted async interrupt request from Executor " << executor_id_
209 if (allow_interrupt && CPU_execution_mode) {
211 VLOG(1) <<
"Try to interrupt the running query on CPU from Executor " << executor_id_;
217 const auto allow_interrupt =
221 }
else if (allow_interrupt) {
223 for (
int device_id = 0; device_id < max_gpu_count; device_id++) {
227 VLOG(1) <<
"Reset interrupt flag for CPU execution kernel on Executor "
232 if (interrupted_.load()) {
233 VLOG(1) <<
"RESET Executor " << executor_id_
234 <<
" that had previously been interrupted";
235 interrupted_.store(
false);
static void registerActiveModule(void *module, const int device_id)
void checkCudaErrors(CUresult err)
unsigned long long CUdeviceptr
bool g_enable_dynamic_watchdog
bool g_enable_non_kernel_time_query_interrupt
std::shared_lock< T > shared_lock
static void unregisterActiveModule(const int device_id)
std::unique_lock< T > unique_lock
__device__ int32_t runtime_interrupt_flag
RUNTIME_EXPORT uint64_t dynamic_watchdog_init(unsigned ms_budget)
void interrupt(const QuerySessionId &query_session="", const QuerySessionId &interrupt_session="")
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
__device__ int32_t dw_abort
bool g_enable_runtime_query_interrupt