diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index e4ff536840..2f2f54f924 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -449,8 +449,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
engine = HwQueueEngine::SdmaRead;
}
- hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
- uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+ auto wait_events = gpu().Barriers().WaitingSignal(engine);
if (isSubwindowRectCopy ) {
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
@@ -458,10 +457,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
// Copy memory line by line
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"[%zx]!\t HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx",
- std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+ std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
- &srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active);
+ &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
@@ -480,12 +479,12 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
// Copy memory line by line
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx",
- std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+ std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
hsa_status_t status = hsa_amd_memory_async_copy(
(reinterpret_cast
(dst) + dstOffset), dstAgent,
(reinterpret_cast(src) + srcOffset), srcAgent,
- size[0], num_wait_events, wait_event, active);
+ size[0], wait_events.size(), wait_events.data(), active);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
@@ -662,18 +661,17 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
engine = HwQueueEngine::SdmaRead;
}
- hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
- uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
- hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+ auto wait_events = gpu().Barriers().WaitingSignal(engine);
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Use SDMA to transfer the data
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx",
- std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+ std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
- size[0], num_wait_events, wait_event, active);
+ size[0], wait_events.size(), wait_events.data(), active);
if (status == HSA_STATUS_SUCCESS) {
gpu().addSystemScope();
} else {
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 4e355e4759..fcfb7fd8a5 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -389,9 +389,11 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
}
// ================================================================================================
-hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+std::vector& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
bool explicit_wait = false;
- hsa_signal_t* signal = nullptr;
+ // Reset all current waiting signals
+ waiting_signals_.clear();
+
// Does runtime switch the active engine?
if (engine != engine_) {
// Yes, return the signla from the previous operation for a wait
@@ -412,30 +414,32 @@ hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
}
// Check if a wait is required
if (explicit_wait) {
- ProfilingSignal* prof_signal;
- // Check if there is an external signal
- if (external_signal_ != nullptr) {
- prof_signal = external_signal_;
- external_signal_ = nullptr;
- } else {
- prof_signal = signal_list_[current_id_];
- }
- // Early signal status check
- if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
- const Settings& settings = gpu_.dev().settings();
- // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU
- if (!WaitForSignal(prof_signal->signal_)) {
- if (settings.cpu_wait_for_signal_) {
- // Wait on CPU for completion if requested
- CpuWaitForSignal(prof_signal);
- } else {
- // Return HSA signal for tracking on GPU
- return &prof_signal->signal_;
+ ProfilingSignal** prof_signal;
+ // Add the oldest signal into the tracking for a wait
+ external_signals_.push_back(signal_list_[current_id_]);
+ prof_signal = &external_signals_[0];
+
+ // Validate all signals for the wait and skip already completed
+ for (uint32_t i = 0; i < external_signals_.size(); ++i) {
+ // Early signal status check
+ if (hsa_signal_load_relaxed(prof_signal[i]->signal_) > 0) {
+ const Settings& settings = gpu_.dev().settings();
+ // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU
+ if (!WaitForSignal(prof_signal[i]->signal_)) {
+ if (settings.cpu_wait_for_signal_) {
+ // Wait on CPU for completion if requested
+ CpuWaitForSignal(prof_signal[i]);
+ } else {
+ // Add HSA signal for tracking on GPU
+ waiting_signals_.push_back(prof_signal[i]->signal_);
+ }
}
}
}
+ external_signals_.clear();
}
- return signal;
+ // Return the array of waiting HSA signals
+ return waiting_signals_;
}
// ================================================================================================
@@ -790,18 +794,32 @@ bool VirtualGPU::dispatchGenericAqlPacket(
return true;
}
+// ================================================================================================
+//! Sends barrier packets for every HSA signal the queue still has to wait on
+void VirtualGPU::dispatchBlockingWait() {
+  // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
+  constexpr uint32_t kMaxDeps = 5;  // dep_signal slots filled per barrier packet
+  constexpr bool kSkipSignal = true;
+  auto wait_signals = Barriers().WaitingSignal();
+  const uint32_t count = static_cast<uint32_t>(wait_signals.size());
+  for (uint32_t i = 0; i < count; ++i) {
+    const uint32_t slot = i % kMaxDeps;
+    barrier_packet_.dep_signal[slot] = wait_signals[i];
+    // Keep filling until the packet is full or the last signal has been placed
+    if ((slot != (kMaxDeps - 1)) && ((i + 1) != count)) {
+      continue;
+    }
+    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
+    for (uint32_t k = 0; k < kMaxDeps; ++k) {
+      barrier_packet_.dep_signal[k] = hsa_signal_t{};  // clear slots for the next batch
+    }
+  }
+}
+
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(
hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
- hsa_signal_t* wait = Barriers().WaitingSignal();
- // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
- if (wait != nullptr) {
- barrier_packet_.dep_signal[0] = *wait;
- constexpr bool kSkipSignal = true;
- dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
- } else {
- barrier_packet_.dep_signal[0] = hsa_signal_t{};
- }
+ dispatchBlockingWait();
return dispatchGenericAqlPacket(packet, header, rest, blocking);
}
@@ -851,10 +869,10 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
packet->completion_signal.handle = 0;
if (!skipSignal) {
+ dispatchBlockingWait();
+
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
- hsa_signal_t* wait = Barriers().WaitingSignal();
- packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
@@ -1143,6 +1161,20 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
timestamp_ = new Timestamp(this, command);
timestamp_->start();
}
+
+ if (AMD_DIRECT_DISPATCH) {
+ for (auto it = command.eventWaitList().begin(); it < command.eventWaitList().end(); ++it) {
+ void* hw_event = ((*it)->NotifyEvent() != nullptr) ?
+ (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
+ if (hw_event != nullptr) {
+ Barriers().AddExternalSignal(reinterpret_cast(hw_event));
+ } else if (static_cast(*it)->queue() != command.queue()) {
+ LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
+ } else {
+ // Assume serialization on the same queue...
+ }
+ }
+ }
}
// ================================================================================================
@@ -1157,6 +1189,9 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
timestamp_->end();
}
command.setData(timestamp_);
+ if (AMD_DIRECT_DISPATCH) {
+ command.SetHwEvent(timestamp_->Signals().back());
+ }
timestamp_ = nullptr;
}
}
@@ -1467,9 +1502,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
if (dev().info().hmmSupported_) {
// Initialize signal for the barrier
- hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
- hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
- uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+ auto wait_events = Barriers().WaitingSignal(HwQueueEngine::Unknown);
+ hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
// Find the requested agent for the transfer
hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1478,7 +1512,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
// Initiate a prefetch command
hsa_status_t status = hsa_amd_svm_prefetch_async(
- const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
+ const_cast(cmd.dev_ptr()), cmd.count(), agent,
+ wait_events.size(), wait_events.data(), active);
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
@@ -2785,7 +2820,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
queue->profilingBegin(vcmd);
// Add a dependency into the device queue on the current queue
- queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
+ queue->Barriers().AddExternalSignal(Barriers().GetLastSignal());
if (vcmd.cooperativeGroups()) {
// Initialize GWS if it's cooperative groups launch
@@ -2812,7 +2847,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
queue->releaseGpuMemoryFence(kSkipCpuWait);
// Add a dependency into the current queue on the coop queue
- Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
+ Barriers().AddExternalSignal(queue->Barriers().GetLastSignal());
hasPendingDispatch_ = true;
queue->profilingEnd(vcmd);
@@ -2885,18 +2920,12 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
amd::Command* current = list;
assert(current != nullptr && "Empty batch for processing!");
- // HIP tests expect callbacks processed from another thread, hence force AQL barrier always, so
- // HSA signal callback will process HIP callback asynchronously
- if (list->Callback() != nullptr) {
- hasPendingDispatch_ = true;
- }
+ // Find the last command
while (current->getNext() != nullptr) {
current = current->getNext();
- if (current->Callback() != nullptr) {
- hasPendingDispatch_ = true;
- }
}
-
+ // Always insert a barrier. Some tests require async SDMA wait
+ hasPendingDispatch_ = true;
// Enable profiling, so runtime can track TS
profilingBegin(*current);
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index bfe3dceb39..6545cfbff3 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -206,18 +206,18 @@ class VirtualGPU : public device::VirtualDevice {
void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
//! Returns the last submitted signal for a wait
- hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
+ std::vector& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
void ResetCurrentSignal();
- //! Inserts an external signal(submission in another queue) for dependency tracking
- void SetExternalSignal(ProfilingSignal* signal) {
- external_signal_ = signal;
+ //! Adds an external signal(submission in another queue) for dependency tracking
+ void AddExternalSignal(ProfilingSignal* signal) {
+ external_signals_.push_back(signal);
engine_ = HwQueueEngine::External;
}
- //! Inserts an external signal(submission in another queue) for dependency tracking
+ //! Get the last active signal on the queue
ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
private:
@@ -235,10 +235,11 @@ class VirtualGPU : public device::VirtualDevice {
HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
std::vector signal_list_; //!< The pool of all signals for processing
- ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal
size_t current_id_ = 0; //!< Last submitted signal
bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
+ std::vector external_signals_; //!< External signals for a wait in this queue
+ std::vector waiting_signals_; //!< Current waiting signals in this queue
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -354,9 +355,12 @@ class VirtualGPU : public device::VirtualDevice {
void profilerAttach(bool enable = false) { profilerAttached_ = enable; }
- bool isProfilerAttached() { return profilerAttached_; }
+ bool isProfilerAttached() const { return profilerAttached_; }
// } roc OpenCL integration
private:
+ //! Dispatches a barrier with blocking HSA signals
+ void dispatchBlockingWait();
+
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
uint16_t rest, bool blocking = true);
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 4e0cad1caf..b8dfb244ae 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -42,16 +42,22 @@
namespace amd {
+// ================================================================================================
Event::Event(HostQueue& queue)
: callbacks_(NULL),
status_(CL_INT_MAX),
+ hw_event_(nullptr),
+ notify_event_(nullptr),
profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
Agent::shouldPostEventEvents()) {
notified_.clear();
}
-Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED) { notified_.clear(); }
+// ================================================================================================
+Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
+ hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
+// ================================================================================================
Event::~Event() {
CallBackEntry* callback = callbacks_;
while (callback != NULL) {
@@ -61,6 +67,7 @@ Event::~Event() {
}
}
+// ================================================================================================
uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
if (timeStamp == 0) {
timeStamp = Os::timeNanos();
@@ -88,7 +95,7 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
// Global epoch time since the first processed command
uint64_t epoch = 0;
-
+// ================================================================================================
bool Event::setStatus(int32_t status, uint64_t timeStamp) {
assert(status <= CL_QUEUED && "invalid status");
@@ -157,6 +164,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
return true;
}
+// ================================================================================================
bool Event::resetStatus(int32_t status) {
int32_t currentStatus = this->status();
if (currentStatus != CL_COMPLETE) {
@@ -171,6 +179,7 @@ bool Event::resetStatus(int32_t status) {
return true;
}
+// ================================================================================================
bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data) {
assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status");
@@ -193,7 +202,7 @@ bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void*
return true;
}
-
+// ================================================================================================
void Event::processCallbacks(int32_t status) const {
cl_event event = const_cast(as_cl(this));
const int32_t mask = (status > CL_COMPLETE) ? status : CL_COMPLETE;
@@ -212,6 +221,7 @@ void Event::processCallbacks(int32_t status) const {
}
}
+// ================================================================================================
bool Event::awaitCompletion() {
if (status() > CL_COMPLETE) {
// Notifies current command queue about waiting
@@ -219,7 +229,8 @@ bool Event::awaitCompletion() {
return false;
}
- ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status());
+ ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
+ this, status());
auto* queue = command().queue();
if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
while (status() > CL_COMPLETE) {
@@ -262,6 +273,8 @@ bool Event::notifyCmdQueue() {
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
command->enqueue();
command->release();
+ // Save notification for this event. NOTE(review): kept after release() w/o retain; confirm lifetime
+ notify_event_ = command;
}
return true;
}
@@ -306,10 +319,10 @@ void Command::enqueue() {
// update will occur later after flush() with a wait
if (AMD_DIRECT_DISPATCH) {
setStatus(CL_QUEUED);
- // The wait should be performed before the lock,
- // otherwise signal handler may have a deadlock, but awaitCompletion() is thread safe itself
+ // Notify all commands about the waiter. Barrier will be sent in order to obtain
+ // HSA signal for a wait on the current queue
std::for_each(eventWaitList().begin(), eventWaitList().end(),
- std::mem_fun(&Command::awaitCompletion));
+ std::mem_fun(&Command::notifyCmdQueue));
// The batch update must be lock protected to avoid a race condition
// when multiple threads submit/flush/update the batch at the same time
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 68c249b1a6..21bce75a09 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -93,6 +93,8 @@ class Event : public RuntimeObject {
std::atomic callbacks_; //!< linked list of callback entries.
std::atomic status_; //!< current execution status.
std::atomic_flag notified_; //!< Command queue was notified
+ void* hw_event_; //!< HW event ID associated with SW event
+ Event* notify_event_; //!< Notify event, which should contain HW signal
protected:
static const EventWaitList nullWaitList;
@@ -210,6 +212,15 @@ class Event : public RuntimeObject {
//! Returns the callback for this event
const CallBackEntry* Callback() const { return callbacks_; }
+
+ // Saves HW event, associated with the current command
+ void SetHwEvent(void* hw_event) { hw_event_ = hw_event; }
+
+ //! Returns HW event, associated with the current command
+ void* HwEvent() const { return hw_event_; }
+
+ //! Returns notify event associated with the current command
+ Event* NotifyEvent() const { return notify_event_; }
};
/*! \brief An operation that is submitted to a command queue.