SWDEV-257787 - Reset active signal if ROCR call failed

- If ROCR fails the call for some reason, the signal becomes
invalid and a wait on it can hang. Reset the active signal in
such cases.

Change-Id: Ia131420200f1bbd7c9a162b8f1b06db8cecf41c6
Этот коммит содержится в:
German Andryeyev
2021-01-21 17:29:34 -05:00
родитель de3e8b092c
Коммит ce2e5eba6b
3 изменённых файлов: 18 добавлений и 0 удалений
+4
Просмотреть файл
@@ -450,6 +450,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
&srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
return false;
}
@@ -473,6 +474,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
size[0], num_wait_events, wait_event, active);
gpu().setLastCommandSDMA(true) ;
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
return false;
}
@@ -656,6 +658,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
if (status == HSA_STATUS_SUCCESS) {
gpu().addSystemScope();
} else {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from host to device failed with code %d", status);
}
@@ -702,6 +705,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
srcAgent, size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
}
+10
Просмотреть файл
@@ -205,6 +205,15 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
}
}
// ================================================================================================
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
  // Write 0 into the HSA signal of the currently selected entry.
  // NOTE(review): assumes signal_list_ is non-empty and current_id_ is in range -- confirm.
  hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, 0);

  // Step the index back by one slot, wrapping from the first slot to the last one.
  // The zero case is handled explicitly rather than by decrementing past it.
  if (current_id_ == 0) {
    current_id_ = signal_list_.size() - 1;
  } else {
    current_id_ = current_id_ - 1;
  }
}
// ================================================================================================
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params,
size_t& ldsAddress, bool cooperativeGroups) {
Kernel& hsaKernel = const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
@@ -1200,6 +1209,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
Barriers().ResetCurrentSignal();
LogError("hsa_amd_svm_prefetch_async failed");
cmd.setStatus(CL_INVALID_OPERATION);
}
+4
Просмотреть файл
@@ -258,6 +258,9 @@ class VirtualGPU : public device::VirtualDevice {
return hsa_signal_t{};
}
//! Resets the current signal and falls back to the previous one. Needed when a ROCr call fails.
void ResetCurrentSignal();
private:
//! Wait for the next active signal
void WaitNext() {
@@ -282,6 +285,7 @@ class VirtualGPU : public device::VirtualDevice {
}
return true;
}
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
size_t current_id_ = 0; //!< Last submitted signal
hsa_agent_t agent_; //!< HSA device agent