diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index e16b40ec37..d25e8e2f4f 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
hsa_dim3_t offset = { 0, 0 ,0 };
- if ((srcRect.rowPitch_ % 4 != 0) ||
- (srcRect.slicePitch_ % 4 != 0) ||
- (dstRect.rowPitch_ % 4 != 0) ||
+ if ((srcRect.rowPitch_ % 4 != 0) ||
+ (srcRect.slicePitch_ % 4 != 0) ||
+ (dstRect.rowPitch_ % 4 != 0) ||
(dstRect.slicePitch_ % 4 != 0)) {
isSubwindowRectCopy = false;
}
+ HwQueueEngine engine = HwQueueEngine::Unknown;
+ if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+ (dstAgent.handle != dev().getCpuAgent().handle)) {
+ engine = HwQueueEngine::SdmaWrite;
+ } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+ (dstAgent.handle == dev().getCpuAgent().handle)) {
+ engine = HwQueueEngine::SdmaRead;
+ }
+
+ hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+ uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+
if (isSubwindowRectCopy ) {
- hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
- uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
- hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
// Copy memory line by line
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
@@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
} else {
// Fall to line by line copies
const hsa_signal_value_t kInitVal = size[2] * size[1];
- hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
- uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
- hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
@@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
(reinterpret_cast
(dst) + dstOffset), dstAgent,
(reinterpret_cast(src) + srcOffset), srcAgent,
size[0], num_wait_events, wait_event, active);
- gpu().setLastCommandSDMA(true) ;
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
return false;
+ } else {
+ gpu().setLastCommandSDMA(true);
}
}
}
}
}
- // Explicit wait for now, until runtime could distinguish compute and sdma operations
- gpu().Barriers().WaitCurrent();
+
return true;
}
@@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
srcAgent = dstAgent = dev().getBackendDevice();
}
- hsa_signal_t wait = gpu().Barriers().WaitSignal();
- hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
- uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
- hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+ HwQueueEngine engine = HwQueueEngine::Unknown;
+ if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+ (dstAgent.handle != dev().getCpuAgent().handle)) {
+ engine = HwQueueEngine::SdmaWrite;
+ } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+ (dstAgent.handle == dev().getCpuAgent().handle)) {
+ engine = HwQueueEngine::SdmaRead;
+ }
+
+ hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+ uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Use SDMA to transfer the data
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
size[0], num_wait_events, wait_event, active);
- gpu().setLastCommandSDMA(true);
- // Explicit wait for now, until runtime could distinguish compute and sdma operations
- gpu().Barriers().WaitCurrent();
if (status == HSA_STATUS_SUCCESS) {
+ gpu().setLastCommandSDMA(true);
gpu().addSystemScope();
} else {
gpu().Barriers().ResetCurrentSignal();
@@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, dev().settings().stagedXferSize_);
- hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Host to Device
if (hostToDev) {
@@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
const hsa_agent_t srcAgent =
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
+ HwQueueEngine engine = HwQueueEngine::Unknown;
+ if (srcAgent.handle == dev().getBackendDevice().handle) {
+ engine = HwQueueEngine::SdmaWrite;
+ }
+ gpu().Barriers().SetActiveEngine(engine);
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
memcpy(hsaBuffer, hostSrc + offset, size);
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
srcAgent, size, 0, nullptr, active);
- gpu().setLastCommandSDMA(true);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
+ } else {
+ gpu().setLastCommandSDMA(true);
}
gpu().Barriers().WaitCurrent();
totalSize -= size;
@@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
const hsa_agent_t dstAgent =
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
+ HwQueueEngine engine = HwQueueEngine::Unknown;
+ if (dstAgent.handle == dev().getBackendDevice().handle) {
+ engine = HwQueueEngine::SdmaRead;
+ }
+ gpu().Barriers().SetActiveEngine(engine);
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
// Copy data from Device to Host
status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
dev().getBackendDevice(), size, 0, nullptr, active);
- gpu().setLastCommandSDMA(true);
if (status == HSA_STATUS_SUCCESS) {
+ gpu().setLastCommandSDMA(true);
gpu().Barriers().WaitCurrent();
memcpy(hostDst + offset, hsaBuffer, size);
} else {
+ gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from device to host failed with code %d", status);
return false;
}
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index 5f344ff1af..a81480f21d 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -40,9 +40,6 @@ class Kernel;
class Memory;
class VirtualGPU;
-constexpr bool kSkipCpuWait = true;
-constexpr bool kIgnoreBarrier = false;
-
//! DMA Blit Manager
class DmaBlitManager : public device::HostBlitManager {
public:
diff --git a/projects/clr/rocclr/device/rocm/rocdefs.hpp b/projects/clr/rocclr/device/rocm/rocdefs.hpp
index 28284d76e0..44827de45d 100644
--- a/projects/clr/rocclr/device/rocm/rocdefs.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdefs.hpp
@@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32;
//! Set to match the number of pipes, which is 8.
static constexpr uint kMaxAsyncQueues = 8;
+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
enum HwQueueEngine : uint32_t {
Compute = 0,
SdmaRead = 1,
SdmaWrite = 2,
- Unknown = 3
+ Unknown = 3,
+ External = 4
};
} // namespace roc
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index 4222061abb..e2f62210f5 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -28,6 +28,7 @@
namespace roc {
+// ================================================================================================
Settings::Settings() {
// Initialize the HSA device default settings
@@ -91,8 +92,15 @@ Settings::Settings() {
rocr_backend_ = true;
barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
+
+ cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH;
+ cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
+ ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
+ system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
+ skip_copy_sync_ = ROC_SKIP_COPY_SYNC;
}
+// ================================================================================================
bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK,
bool coop_groups) {
customHostAllocator_ = false;
@@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
return true;
}
+// ================================================================================================
void Settings::override() {
// Limit reported workgroup size
if (GPU_MAX_WORKGROUP_SIZE != 0) {
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index e0e29c0d27..5abace8b85 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -52,7 +52,10 @@ class Settings : public device::Settings {
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint barrier_sync_ : 1; //!< Use AQL barrier command to sync with CPU
- uint reserved_ : 23;
+ uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU
+ uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system
+ uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality
+ uint reserved_ : 20;
};
uint value_;
};
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7340d99e70..a4a6257bb2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -35,10 +35,12 @@
#include "amd_hsa_kernel_code.h"
#include
-#include
-#include
#include
+#include
+#include
#include
+#include
+
/**
* HSA image object size in bytes (see HSAIL spec)
@@ -100,6 +102,37 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
return (v >> pos) & ((1 << width) - 1);
};
+// ================================================================================================
+void Timestamp::checkGpuTime() {
+  if (HwProfiling()) {  // Nothing to collect unless profiling signals were attached
+    uint64_t start = std::numeric_limits<uint64_t>::max();
+    uint64_t end = 0;
+
+    for (auto it : signals_) {
+      if (hsa_signal_load_relaxed(it->signal_) > 0) {  // Still pending, block until completion
+        WaitForSignal(it->signal_);
+      }
+      hsa_amd_profiling_dispatch_time_t time = {};
+      if (it->engine_ == HwQueueEngine::Compute) {
+        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
+      } else {  // Every non-compute engine is queried through the async copy interface
+        hsa_amd_profiling_async_copy_time_t time_sdma = {};
+        hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
+        time.start = time_sdma.start;
+        time.end = time_sdma.end;
+      }
+      start = std::min(time.start, start);  // Earliest start and latest end over all signals
+      end = std::max(time.end, end);
+      it->ts_ = nullptr;  // Detach the signal from this timestamp and mark it processed
+      it->done_ = true;
+    }
+    signals_.clear();
+    start_ = start * ticksToTime_;  // Scale raw ticks by the ticksToTime_ factor
+    end_ = end * ticksToTime_;
+  }
+}
+}
+
+// ================================================================================================
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
if (numMemObj > 0) {
// Allocate the array of memory objects for dependency tracking
@@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
return true;
}
+// ================================================================================================
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
bool flushL1Cache = false;
@@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
numMemObjectsInQueue_++;
}
+// ================================================================================================
void VirtualGPU::MemoryDependency::clear(bool all) {
if (numMemObjectsInQueue_ > 0) {
size_t i, j;
@@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
}
}
+// ================================================================================================
+VirtualGPU::HwQueueTracker::~HwQueueTracker() {
+  for (auto& signal: signal_list_) {
+    if ((signal != nullptr) && (signal->signal_.handle != 0)) {  // Null if Create() failed early
+      hsa_signal_destroy(signal->signal_);
+    }
+    delete signal;  // Deleting a null pointer is a no-op
+  }
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::Create() {
+  constexpr size_t kSignalListSize = 16;
+  signal_list_.reserve(kSignalListSize);  // Append only valid entries, so no slot is ever null
+
+  hsa_agent_t agent = gpu_.gpu_device();
+  const Settings& settings = gpu_.dev().settings();
+  hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;  // No consumer list
+  uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;  // for system scope signals
+
+  for (uint i = 0; i < kSignalListSize; ++i) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if ((signal == nullptr) ||
+        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+      return false;  // unique_ptr frees the tracking object; earlier entries stay in the list
+    }
+    signal_list_.push_back(signal.release());
+  }
+  return true;
+}
+
+// ================================================================================================
+hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
+    hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) {
+  // If queue size grows, then add more signals to avoid more frequent stalls
+  if (queue_size > signal_list_.size()) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if (signal != nullptr) {
+      hsa_agent_t agent = gpu_.gpu_device();
+      const Settings& settings = gpu_.dev().settings();
+      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+
+      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+        signal_list_.push_back(signal.release());
+      }
+    }
+  }
+  // Advance to the next slot in the ring of signals
+  current_id_ = (current_id_ + 1) % signal_list_.size();
+
+  // Make sure the previous operation on the current signal is done
+  WaitCurrent();
+
+  // Have to wait the next signal in the queue to avoid a race condition between
+  // a GPU waiter (which may not be triggered yet) and the CPU signal reset below
+  WaitNext();
+
+  // Reset the signal, tag it with the active engine and return
+  hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+  signal_list_[current_id_]->done_ = false;
+  signal_list_[current_id_]->engine_ = engine_;
+  if (ts != nullptr) {
+    if (!sdma_profiling_) {  // Enable async copy profiling once, on the first profiled request
+      hsa_amd_profiling_async_copy_enable(true);
+      sdma_profiling_ = true;
+    }
+    signal_list_[current_id_]->ts_ = ts;
+    ts->AddProfilingSignal(signal_list_[current_id_]);
+  }
+  return signal_list_[current_id_]->signal_;
+}
+
+// ================================================================================================
+hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+ bool explicit_wait = false;
+ hsa_signal_t* signal = nullptr;
+ // Does runtime switch the active engine?
+ if (engine != engine_) {
+ // Yes, return the signal from the previous operation for a wait
+ engine_ = engine;
+ explicit_wait = true;
+ } else {
+ // Unknown engine in use, hence return a wait signal always
+ if (engine == HwQueueEngine::Unknown) {
+ explicit_wait = true;
+ } else {
+ // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
+ // and ignore signal wait, relying on in-order engine execution
+ const Settings& settings = gpu_.dev().settings();
+ if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
+ explicit_wait = true;
+ }
+ }
+ }
+ // Check if a wait is required
+ if (explicit_wait) {
+ ProfilingSignal* prof_signal;
+ // An external signal takes priority over the last submitted one and is consumed only once
+ if (external_signal_ != nullptr) {
+ prof_signal = external_signal_;
+ external_signal_ = nullptr;
+ } else {
+ prof_signal = signal_list_[current_id_];
+ }
+ // Early signal status check
+ if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
+ const Settings& settings = gpu_.dev().settings();
+ // Wait on CPU if requested
+ if (settings.cpu_wait_for_signal_) {
+ CpuWaitForSignal(prof_signal);
+ } else {
+ return &prof_signal->signal_;
+ }
+ }
+ }
+ return signal; // Always nullptr here: there is no signal for the caller to wait on
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
+  // Wait for the provided signal, unless it was already marked as done
+  if (!signal->done_) {
+    // Update timestamp values if requested; checkGpuTime() waits and marks the signal as done
+    if (signal->ts_ != nullptr) {
+      signal->ts_->checkGpuTime();
+    } else {
+      if (!WaitForSignal(signal->signal_)) {
+        LogPrintfError("Failed signal [0x%lx] wait", signal->signal_.handle);
+        return false;
+      }
+      signal->done_ = true;
+    }
+  }
+  return true;
+}
+
// ================================================================================================
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
// Reset the signal and return
@@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(
hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
+ hsa_signal_t* wait = Barriers().WaitingSignal();
+ // AQL dispatch doesn't support dependent signals, so an extra barrier packet must be generated
+ if (wait != nullptr) {
+ barrier_packet_.dep_signal[0] = *wait;
+ constexpr bool kSkipSignal = true;
+ dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
+ } else {
+ barrier_packet_.dep_signal[0] = hsa_signal_t{}; // No dependency: clear the shared packet's slot
+ }
+
return dispatchGenericAqlPacket(packet, header, rest, blocking);
}
@@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
if (!skipSignal) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
+ hsa_signal_t* wait = Barriers().WaitingSignal();
+ packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
+
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
pool_size);
@@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerParam_(nullptr),
schedulerQueue_(nullptr),
schedulerSignal_({0}),
+ barriers_(*this),
cuMask_(cuMask),
priority_(priority),
copy_command_type_(0)
@@ -804,7 +990,7 @@ bool VirtualGPU::create() {
}
// Allocate signal tracker for ROCr copy queue
- if (!Barriers().Create(gpu_device())) {
+ if (!Barriers().Create()) {
LogError("Could not create signal for copy queue!");
return false;
}
@@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
return;
}
// Without barrier profiling will wait for each individual signal
- timestamp_ = new Timestamp();
+ timestamp_ = new Timestamp(dev().getBackendDevice());
timestamp_->start();
}
}
@@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
#if AMD_HMM_SUPPORT
profilingBegin(cmd);
// Initialize signal for the barrier
- hsa_signal_t wait = Barriers().WaitSignal();
- hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
- uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
- hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+ hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
+ hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
+ uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
// Find the requested agent for the transfer
hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
hsa_status_t status = hsa_amd_svm_prefetch_async(
const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
- // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+ // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
Barriers().ResetCurrentSignal();
LogError("hsa_amd_svm_prefetch_async failed");
@@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
return true;
}
+
/**
* @brief Api to dispatch a kernel for execution. The implementation
* parses the input object, an instance of virtual command to obtain
@@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
* It also parses the kernel arguments buffer to inject into Hsa Runtime
* the list of kernel parameters.
*/
+ // ================================================================================================
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
// Wait for the execution on the current queue, since the coop groups will use the device queue
- releaseGpuMemoryFence();
+ releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
// Get device queue for exclusive GPU access
VirtualGPU* queue = dev().xferQueue();
@@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
queue->profilingBegin(vcmd);
+ // Add a dependency into the device queue on the current queue
+ queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
+
if (vcmd.cooperativeGroups()) {
// Initialize GWS if it's cooperative groups launch
uint32_t workgroups = 0;
@@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
// Wait for the execution on the device queue. Keep the current queue in-order
- queue->releaseGpuMemoryFence();
+ queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
+ // Add a dependency into the current queue on the coop queue
+ Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
+ hasPendingDispatch_ = true;
queue->profilingEnd(vcmd);
} else {
@@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
}
}
+// ================================================================================================
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
// std::cout<<__FUNCTION__<<" not implemented"<<"*********"< 0) ? true : false; }
+ const bool HwProfiling() const { return !signals_.empty(); }
- void setAgent(hsa_agent_t agent) { agent_ = agent; }
-
- Timestamp()
+ Timestamp(hsa_agent_t agent)
: start_(std::numeric_limits::max())
- , end_(0) {
- agent_.handle = 0;
- }
+ , end_(0)
+ , agent_(agent) {}
~Timestamp() {}
//! Finds execution ticks on GPU
- void checkGpuTime() {
- if (HwProfiling()) {
- hsa_amd_profiling_dispatch_time_t time = {};
-
- uint64_t start = std::numeric_limits::max();
- uint64_t end = 0;
- for (auto it : signals_) {
- if (hsa_signal_load_relaxed(it->signal_) > 0) {
- WaitForSignal(it->signal_);
- }
- hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
- if ((time.end - time.start) == 0) {
- hsa_amd_profiling_async_copy_time_t time_sdma = {};
- hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
- time.start = time_sdma.start;
- time.end = time_sdma.end;
- }
- start = std::min(time.start, start);
- end = std::max(time.end, end);
- it->ts_ = nullptr;
- it->done_ = true;
- }
- signals_.clear();
- start_ = start * ticksToTime_;
- end_ = end * ticksToTime_;
- }
- }
+ void checkGpuTime();
// Start a timestamp (get timestamp from OS)
void start() { start_ = amd::Os::timeNanos(); }
@@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice {
class HwQueueTracker : public amd::EmbeddedObject {
public:
- HwQueueTracker() {}
+ HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
- ~HwQueueTracker() {
- for (auto& signal: signal_list_) {
- if (signal->signal_.handle != 0) {
- hsa_signal_destroy(signal->signal_);
- }
- delete signal;
- }
- }
+ ~HwQueueTracker();
//! Creates a pool of signals for tracking of HW operations on the queue
- bool Create(hsa_agent_t agent) {
- constexpr size_t kSignalListSize = 16;
- signal_list_.resize(kSignalListSize);
- for (uint i = 0; i < kSignalListSize; ++i) {
- ProfilingSignal* signal = new ProfilingSignal();
- if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
- 0, 1, &agent, &signal->signal_))) {
- return false;
- }
- signal_list_[i] = signal;
- }
- agent_ = agent;
- return true;
- }
+ bool Create();
//! Finds a free signal for the upcomming operation
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
- Timestamp* ts = nullptr, uint32_t queue_size = 0) {
- // If queue size grows, then add more signals to avoid more frequent stalls
- if (queue_size > signal_list_.size()) {
- ProfilingSignal* signal = new ProfilingSignal();
- if (signal != nullptr) {
- if (HSA_STATUS_SUCCESS == hsa_signal_create(
- 0, 1, &agent_, &signal->signal_)) {
- signal_list_.push_back(signal);
- }
- }
- }
- // Find valid index
- ++current_id_ %= signal_list_.size();
-
- // Make sure the previous operation on the current signal is done
- WaitCurrent();
-
- // Have to wait the next signal in the queue to avoid a race condition between
- // a GPU waiter(which may be not triggered yet) and CPU signal reset below
- WaitNext();
-
- // Reset the signal and return
- hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
- signal_list_[current_id_]->done_ = false;
- if (ts != 0) {
- if (!sdma_profiling_) {
- hsa_amd_profiling_async_copy_enable(true);
- sdma_profiling_ = true;
- }
- signal_list_[current_id_]->ts_ = ts;
- ts->AddProfilingSignal(signal_list_[current_id_]);
- ts->setAgent(agent_);
- }
- return signal_list_[current_id_]->signal_;
- }
+ Timestamp* ts = nullptr, uint32_t queue_size = 0);
//! Wait for the curent active signal. Can idle the queue
- bool WaitCurrent() { return WaitIndex(current_id_); }
+ bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); }
+
+ //! Update current active engine
+ void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
//! Returns the last submitted signal for a wait
- hsa_signal_t WaitSignal() {
- //! @note Currently wait on CPU unconditionally to avoid a negative performance impact
- WaitCurrent();
- return hsa_signal_t{};
- }
+ hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
void ResetCurrentSignal();
- private:
+ //! Inserts an external signal(submission in another queue) for dependency tracking
+ void SetExternalSignal(ProfilingSignal* signal) {
+ external_signal_ = signal;
+ engine_ = HwQueueEngine::External;
+ }
+
+ //! Returns the last submitted signal, so that another queue can use it for dependency tracking
+ ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
+
+ private:
//! Wait for the next active signal
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
- WaitIndex(next);
+ CpuWaitForSignal(signal_list_[next]);
}
//! Wait for the provided signal
- bool WaitIndex(size_t index) {
- // Wait for the current signal
- if (!signal_list_[index]->done_) {
- // Update timestamp values if requested
- if (signal_list_[index]->ts_ != nullptr) {
- signal_list_[index]->ts_->checkGpuTime();
- } else {
- if (!WaitForSignal(signal_list_[index]->signal_)) {
- LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
- return false;
- }
- signal_list_[index]->done_ = true;
- }
- }
- return true;
- }
+ bool CpuWaitForSignal(ProfilingSignal* signal);
- std::vector signal_list_; //!< The pool of all signals for processing
- size_t current_id_ = 0; //!< Last submitted signal
- hsa_agent_t agent_; //!< HSA device agent
- bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default
+ HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
+ std::vector signal_list_; //!< The pool of all signals for processing
+ ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal
+ size_t current_id_ = 0; //!< Last submitted signal
+ bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
+ const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -358,7 +270,7 @@ class VirtualGPU : public device::VirtualDevice {
*/
bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
- hsa_agent_t gpu_device() { return gpu_device_; }
+ hsa_agent_t gpu_device() const { return gpu_device_; }
hsa_queue_t* gpu_queue() { return gpu_queue_; }
// Return pointer to PrintfDbg
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index ed9828778d..fe1b139f13 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false, \
"Forces unconditional active wait for GPU") \
release(bool, ROC_ENABLE_LARGE_BAR, true, \
"Enable Large Bar if supported by the device") \
+release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
+ "Enable CPU wait for dependent HSA signals.") \
+release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
+ "Enable system scope for signals (uses interrupts).") \
+release(bool, ROC_SKIP_COPY_SYNC, false, \
+ "Skips copy syncs if runtime can predict the same engine.") \
release(bool, HIP_FORCE_QUEUE_PROFILING, false, \
"Force command queue profiling by default") \
release(uint, PAL_FORCE_ASIC_REVISION, 0, \