diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index 4500c16036..e34eaa8785 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -750,7 +750,7 @@ bool Buffer::create() { "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed"); return false; } - volatile hsa_signal_value_t* signalValuePtr; + volatile hsa_signal_value_t* signalValuePtr = nullptr; if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) { ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed"); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index c2a988d1fa..2ff4c3483b 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -146,15 +146,13 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { return false; } + amd::ScopedLock sl(ts->gpu()->execution()); + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n", + static_cast(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0); + // Update the batch, since signal is complete - if (ts->Signals().size() > 0) { - amd::ScopedLock sl(ts->gpu()->execution()); - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n", - static_cast(value), arg, ts->Signals()[0]->signal_.handle); - ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); - } else { - LogError("Error: ROCr handler was called for untracked signal!"); - } + ts->gpu()->updateCommandsState(ts->command().GetBatchHead()); + // Return false, so the callback will not be called again for this signal return false; } @@ -331,6 +329,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->done_ = false; prof_signal->engine_ = engine_; if (ts != 0) { + // Save HSA signal earlier to make sure the possible callback will have a valid + // value for processing + prof_signal->ts_ = ts; + ts->AddProfilingSignal(prof_signal); // If direct dispatch is enabled and the batch head isn't null, then it's a marker and // requires the batch update upon HSA signal completion if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) { @@ -347,8 +349,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( hsa_amd_profiling_async_copy_enable(true); sdma_profiling_ = true; } - prof_signal->ts_ = ts; - ts->AddProfilingSignal(prof_signal); } return prof_signal->signal_; } @@ -1127,7 +1127,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { } // ================================================================================================ -void VirtualGPU::updateCommandsState(amd::Command* list) { +void VirtualGPU::updateCommandsState(amd::Command* list) const { Timestamp* ts = nullptr; amd::Command* current = list; @@ -2802,6 +2802,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<getHostMem(), mem->getSize())) { - if (pinnedMems_.size() > 7) { - pinnedMems_.front()->release(); - pinnedMems_.erase(pinnedMems_.begin()); - } + if (!AMD_DIRECT_DISPATCH) { + if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.erase(pinnedMems_.begin()); + } - // Delay destruction - pinnedMems_.push_back(mem); + // Delay destruction + pinnedMems_.push_back(mem); + } + } else { + mem->release(); } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 74f3b5f0f3..d77390092c 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -36,7 +36,6 @@ class Device; class Memory; class Timestamp; - struct ProfilingSignal : public amd::HeapObject { hsa_signal_t signal_; //!< HSA signal to track profiling information Timestamp* ts_; //!< Timestamp object associated with the signal @@ -98,7 +97,7 @@ class Timestamp : public amd::HeapObject { , gpu_(gpu) , command_(command) {} - virtual ~Timestamp() {} + ~Timestamp() {} uint64_t getStart() { checkGpuTime(); @@ -236,7 +235,7 @@ class VirtualGPU : public device::VirtualDevice { void profilingBegin(amd::Command& command, bool drmProfiling = false); void profilingEnd(amd::Command& command); - void updateCommandsState(amd::Command* list); + void updateCommandsState(amd::Command* list) const; void submitReadMemory(amd::ReadMemoryCommand& cmd); void submitWriteMemory(amd::WriteMemoryCommand& cmd); diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 24764754dd..405383a621 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -213,13 +213,17 @@ bool Event::awaitCompletion() { // ================================================================================================ bool Event::notifyCmdQueue() { HostQueue* queue = command().queue(); - if ((status() > CL_COMPLETE) && - // Don't need to notify any marker with direct dispatch, - // because all markers are blocking + if ((status() > CL_COMPLETE) && (nullptr != queue) && (!AMD_DIRECT_DISPATCH || + // Don't need to notify any marker with direct dispatch, + // because all markers are blocking. ((command().type() != CL_COMMAND_MARKER) && - (command().type() != 0))) && - (nullptr != queue) && !notified_.test_and_set()) { + (command().type() != 0)) || + // Don't need to notify if the current batch is empty, + // because that means the command was processed and extra notification + // will cause a stall on the host. + (queue->GetSubmittionBatch() != nullptr)) && + !notified_.test_and_set()) { // Make sure the queue is draining the enqueued commands. amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); if (command == NULL) {