From 45b48fb9876220bbd728ddf2559a7ec38b5d247b Mon Sep 17 00:00:00 2001 From: lancesix <98881381+lancesix@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:32:04 +0100 Subject: [PATCH] SWDEV-555043 - Do not wait on signal if gpu in error state (#1023) During a process tear-down we wait on all signals before releasing them: VirtualGPU::HwQueueTracker::~HwQueueTracker() { for (auto& signal : signal_list_) { CpuWaitForSignal(signal); signal->release(); } [...] } In the case where we exit the process after a GPU error that did not cause an abort (ulimit -c == 0), waiting for the signal can be skipped. With the device in the error state, no progress is made, and the signal is probably never going to be modified again: inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yield = false) { [...] if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) { ClPrint(amd::LOG_ERROR, amd::LOG_SIG, "Device not Stable, while waiting for Signal =" "(0x%lx) for %d ns", signal.handle, kTimeout4Secs); return true; } [...] } However, after calling CpuWaitForSignal, when calling "release", we can end up in a signal dtor which also tries to wait on the signal. Because the GPU is in the error state, we never receive the signal, and hang the process during tear-down. This happens with the ProfilingSignal dtor: ProfilingSignal::~ProfilingSignal() { if (signal_.handle != 0) { if (hsa_signal_load_relaxed(signal_) > 0) { LogError("Runtime shouldn't destroy a signal that is still busy!"); if (hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) { } } hsa_signal_destroy(signal_); } } This dtor should check that the GPU is not in the error state before trying to wait, which is what this patch implements. 
Bug: SWDEV-555043 Bug: SWDEV-553435 Bug: SWDEV-553679 Bug: SWDEV-555119 --- projects/clr/rocclr/device/rocm/rocdevice.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 55708d8a31..7114688604 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -3513,7 +3513,8 @@ void Device::RemoveKernel(Kernel& gpuKernel) const { // ================================================================================================ ProfilingSignal::~ProfilingSignal() { if (signal_.handle != 0) { - if (hsa_signal_load_relaxed(signal_) > 0) { + if (hsa_signal_load_relaxed(signal_) > 0 + && !(HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError())) { LogError("Runtime shouldn't destroy a signal that is still busy!"); if (hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {