SWDEV-257787 - Reset active signal if ROCR call failed
- If ROCR fails the call for some reason, the signal becomes invalid and a wait on it can hang. The logic resets the active signal in such cases. Change-Id: Ia131420200f1bbd7c9a162b8f1b06db8cecf41c6
Этот коммит содержится в:
@@ -450,6 +450,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
|
||||
&srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("DMA buffer failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
@@ -473,6 +474,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
size[0], num_wait_events, wait_event, active);
|
||||
gpu().setLastCommandSDMA(true) ;
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("DMA buffer failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
@@ -656,6 +658,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().addSystemScope();
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
}
|
||||
|
||||
@@ -702,6 +705,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
srcAgent, size, 0, nullptr, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -205,6 +205,15 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Undoes the selection of the current signal slot: stores 0 into the signal at
//! current_id_ and then moves current_id_ back to the preceding slot of signal_list_.
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
  // Write 0 into the signal that was picked for the failed submission.
  // NOTE(review): per the HSA runtime spec the "silent" store does not necessarily
  // notify agents waiting on the signal - confirm that is the intent here.
  auto* const active = signal_list_[current_id_];
  hsa_signal_silent_store_relaxed(active->signal_, 0);

  // Step back to the previous slot; slot 0 wraps around to the last entry of the pool.
  if (current_id_ != 0) {
    --current_id_;
  } else {
    current_id_ = signal_list_.size() - 1;
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params,
|
||||
size_t& ldsAddress, bool cooperativeGroups) {
|
||||
Kernel& hsaKernel = const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
|
||||
@@ -1200,6 +1209,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
|
||||
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
|
||||
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
|
||||
Barriers().ResetCurrentSignal();
|
||||
LogError("hsa_amd_svm_prefetch_async failed");
|
||||
cmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -258,6 +258,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
return hsa_signal_t{};
|
||||
}
|
||||
|
||||
//! Resets the current signal back to the previous one. This is necessary when a ROCr call fails.
|
||||
void ResetCurrentSignal();
|
||||
|
||||
private:
|
||||
//! Wait for the next active signal
|
||||
void WaitNext() {
|
||||
@@ -282,6 +285,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
|
||||
size_t current_id_ = 0; //!< Last submitted signal
|
||||
hsa_agent_t agent_; //!< HSA device agent
|
||||
|
||||
Ссылка в новой задаче
Block a user