diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 4db1dc77cc..9373749772 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -359,14 +359,9 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { } bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false; auto hip_stream = hip::getStream(stream, wait); - bool wait_for_cpu = false; - // Force blocking wait if requested. That allows to avoid a build up of unreleased CPU commands - if (DEBUG_HIP_BLOCK_SYNC != 0) { - static std::atomic flush = 0; - wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false; - } + // Wait for the current host queue - hip_stream->finish(wait_for_cpu); + hip_stream->finish(); // Release freed memory for all memory pools on the device hip_stream->GetDevice()->ReleaseFreedMemory(); return hipSuccess; diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index 99ba26d4a9..933ef7cd49 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1332,6 +1332,9 @@ class VirtualDevice : public amd::HeapObject { const std::string& kernelName, amd::AccumulateCommand* vcmd = nullptr) = 0; + //! Returns the number of outstanding HSA async handlers + std::atomic& QueuedAsyncHandlers() const { return queued_async_handlers_; } + private: //! 
Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); @@ -1347,6 +1350,7 @@ class VirtualDevice : public amd::HeapObject { amd::Monitor execution_; //!< Lock to serialise access to all device objects uint index_; //!< The virtual device unique index + mutable std::atomic queued_async_handlers_ = 0; //!< Outstanding HSA async handlers }; } // namespace amd::device diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 23a2fd22d9..edbabd0ca2 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -232,11 +232,14 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { // Save callback signal hsa_signal_t callback_signal = ts->GetCallbackSignal(); + auto gpu = ts->gpu(); + gpu->QueuedAsyncHandlers()--; + // Reset last used SDMA engine mask - ts->gpu()->setLastUsedSdmaEngine(0); + gpu->setLastUsedSdmaEngine(0); // Update the batch, since signal is complete - ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); + gpu->updateCommandsState(ts->command().GetBatchHead()); // Reset API callback signal. It will release AQL queue and start commands processing if (callback_signal.handle != 0) { @@ -474,6 +477,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( hsa_signal_add_relaxed(prof_signal->signal_, 1); init_value += 1; } + gpu_.QueuedAsyncHandlers()++; hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_, HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts); if (HSA_STATUS_SUCCESS != result) { diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 49f3029c36..bcdb649c3b 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -141,6 +141,11 @@ void HostQueue::finish(bool cpu_wait) { "Can't claim the queue is finished with the active batch!"); return; } + // Force blocking wait if requested. 
This avoids a build-up of unreleased CPU commands + if ((DEBUG_HIP_BLOCK_SYNC > 0) && + (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) { + cpu_wait = true; + } } // Force marker if the batch wasn't sent for CPU update or fence is dirty if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {