diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 320c8885fc..e983af887a 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- gpu().releaseGpuMemoryFence();
+ // HSA copy functionality with a possible async operation
+ gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+ // Stall GPU before CPU access
+ gpu().Barriers().WaitCurrent();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
} else {
size_t srcSize = size[0];
@@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
const amd::Coord3D& size, bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+ // HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+ // Stall GPU before CPU access
+ gpu().Barriers().WaitCurrent();
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
} else {
Memory& xferBuf = dev().xferRead().acquire();
@@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+ // HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
if (setup_.disableReadImage_) {
@@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- gpu().releaseGpuMemoryFence();
-
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
} else {
+ // HSA copy functionality with a possible async operation
+ gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
size_t dstSize = size[0];
size_t tmpSize = 0;
size_t offset = 0;
@@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
const amd::BufferRect& hostRect,
const amd::BufferRect& bufRect, const amd::Coord3D& size,
bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+ // HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
@@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
size_t rowPitch, size_t slicePitch, bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+ // HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
if (setup_.disableWriteImage_) {
@@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- gpu().releaseGpuMemoryFence();
-
if (setup_.disableCopyBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
(dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
} else {
return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size);
@@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
const amd::Coord3D& size, bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- gpu().releaseGpuMemoryFence();
-
if (setup_.disableCopyBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
dstMemory.isHostMemDirectAccess())) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
} else {
+ gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
void* src = gpuMem(srcMemory).getDeviceMemory();
void* dst = gpuMem(dstMemory).getDeviceMemory();
@@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
if (isSubwindowRectCopy ) {
- hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
+ hsa_signal_t wait = gpu().Barriers().WaitSignal();
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy memory line by line
- hsa_status_t status =
- hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent,
- direction, 0, nullptr, completion_signal_);
+ hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
+ &srcMem, &offset, &dim, agent, direction, 1, &wait, active);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("DMA buffer failed with code %d", status);
return false;
}
-
- if (!WaitForSignal(completion_signal_)) {
- LogError("Async copy failed");
- return false;
- }
} else {
// Fall to line by line copies
const hsa_signal_value_t kInitVal = size[2] * size[1];
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
+ hsa_signal_t wait = gpu().Barriers().WaitSignal();
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
@@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
size_t dstOffset = dstRect.offset(0, y, z);
// Copy memory line by line
- hsa_status_t status =
- hsa_amd_memory_async_copy((reinterpret_cast
(dst) + dstOffset), dstAgent,
- (reinterpret_cast(src) + srcOffset), srcAgent,
- size[0], 0, nullptr, completion_signal_);
+ hsa_status_t status = hsa_amd_memory_async_copy(
+ (reinterpret_cast(dst) + dstOffset), dstAgent,
+ (reinterpret_cast(src) + srcOffset), srcAgent,
+ size[0], 1, &wait, active);
gpu().setLastCommandSDMA(true) ;
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("DMA buffer failed with code %d", status);
@@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
}
}
-
- if (!WaitForSignal(completion_signal_)) {
- LogError("Async copy failed");
- return false;
- }
}
-
}
+ // Explicit wait for now, until the runtime can distinguish compute and SDMA operations
+ gpu().Barriers().WaitCurrent();
return true;
}
@@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
- gpu().releaseGpuMemoryFence(true);
- } else {
- gpu().releaseGpuMemoryFence();
- }
+ // HSA copy functionality with a possible async operation, hence make sure GPU is done
+ bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+ gpu().releaseGpuMemoryFence(force_barrier);
bool result = false;
@@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
} else {
Image& srcImage = static_cast(srcMemory);
Buffer& dstBuffer = static_cast(dstMemory);
-
- // Use ROC path for a transfer
- // Note: it doesn't support SDMA
address dstHost = reinterpret_cast(dstBuffer.getDeviceMemory()) + dstOrigin[0];
// Use ROCm path for a transfer.
@@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
- if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
- gpu().releaseGpuMemoryFence(true);
- } else {
- gpu().releaseGpuMemoryFence();
- }
+ // HSA copy functionality with a possible async operation, hence make sure GPU is done
+ bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+ gpu().releaseGpuMemoryFence(force_barrier);
bool result = false;
@@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
- // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+ // HSA copy functionality with a possible async operation, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
bool result = false;
@@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
address src = reinterpret_cast(srcMemory.getDeviceMemory());
address dst = reinterpret_cast(dstMemory.getDeviceMemory());
- if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
- gpu().releaseGpuMemoryFence(true);
- }
+ bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+ gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait);
src += srcOrigin[0];
dst += dstOrigin[0];
@@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
// Just call copy function for full profile
hsa_status_t status;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
+ // Stall GPU, since CPU copy is possible
+ gpu().Barriers().WaitCurrent();
status = hsa_memory_copy(dst, src, size[0]);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy of data failed with code %d", status);
@@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
srcAgent = dstAgent = dev().getBackendDevice();
}
- hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
-
+ hsa_signal_t wait = gpu().Barriers().WaitSignal();
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Use SDMA to transfer the data
- status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
- completion_signal_);
+ status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active);
gpu().setLastCommandSDMA(true);
+ // Explicit wait for now, until the runtime can distinguish compute and SDMA operations
+ gpu().Barriers().WaitCurrent();
if (status == HSA_STATUS_SUCCESS) {
- hsa_signal_value_t val;
-
- if (!WaitForSignal(completion_signal_)) {
- LogError("Async copy failed");
- status = HSA_STATUS_ERROR;
- } else {
- gpu().addSystemScope();
- }
+ gpu().addSystemScope();
} else {
LogPrintfError("Hsa copy from host to device failed with code %d", status);
}
@@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
// ================================================================================================
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
address staging, bool hostToDev) const {
+ // Stall GPU, since CPU copy is possible
+ bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+ gpu().releaseGpuMemoryFence(force_barrier);
+
// No allocation is necessary for Full Profile
hsa_status_t status;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
@@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
size_t offset = 0;
address hsaBuffer = staging;
- if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
- gpu().releaseGpuMemoryFence(true);
- }
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, dev().settings().stagedXferSize_);
- hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
+ hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Host to Device
if (hostToDev) {
@@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
memcpy(hsaBuffer, hostSrc + offset, size);
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
- srcAgent, size, 0, nullptr, completion_signal_);
+ srcAgent, size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
- if (status == HSA_STATUS_SUCCESS) {
- if (!WaitForSignal(completion_signal_)) {
- LogError("Async copy failed");
- return false;
- }
- } else {
+ if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
}
+ gpu().Barriers().WaitCurrent();
totalSize -= size;
offset += size;
continue;
@@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
// Copy data from Device to Host
- status =
- hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
- dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
+ status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
+ dev().getBackendDevice(), size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status == HSA_STATUS_SUCCESS) {
- if (!WaitForSignal(completion_signal_)) {
- LogError("Async copy failed");
- return false;
- }
+ gpu().Barriers().WaitCurrent();
memcpy(hostDst + offset, hsaBuffer, size);
} else {
LogPrintfError("Hsa copy from device to host failed with code %d", status);
@@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
releaseArguments(parameters);
if (releaseView) {
// todo SRD programming could be changed to avoid a stall
- if(!dev().settings().barrier_sync_) {
- gpu().releaseGpuMemoryFence(true);
- } else {
- gpu().releaseGpuMemoryFence();
- }
+ gpu().releaseGpuMemoryFence();
dstView->owner()->release();
}
@@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
releaseArguments(parameters);
if (releaseView) {
// todo SRD programming could be changed to avoid a stall
- if(!dev().settings().barrier_sync_) {
- gpu().releaseGpuMemoryFence(true);
- } else {
- gpu().releaseGpuMemoryFence();
- }
+ gpu().releaseGpuMemoryFence();
srcView->owner()->release();
}
@@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
// Use host copy if memory has direct access
if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
synchronize();
return result;
@@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
synchronize();
return result;
@@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
synchronize();
return result;
@@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
// Use host copy if memory has direct access
if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
synchronize();
return result;
@@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
// Use host fill if memory has direct access
if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
synchronize();
return result;
@@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
// Use host fill if memory has direct access
if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) {
+ // Stall GPU before CPU access
+ gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
synchronize();
return result;
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index c4f4b89ba9..5f344ff1af 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -40,6 +40,9 @@ class Kernel;
class Memory;
class VirtualGPU;
+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
//! DMA Blit Manager
class DmaBlitManager : public device::HostBlitManager {
public:
@@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager {
);
//! Destructor
- virtual ~DmaBlitManager() {
- if (completion_signal_.handle != 0) {
- hsa_signal_destroy(completion_signal_);
- }
- }
+ virtual ~DmaBlitManager() {}
//! Creates DmaBlitManager object
- virtual bool create(amd::Device& device) {
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) {
- return false;
- }
- return true;
- }
+ virtual bool create(amd::Device& device) { return true; }
//! Copies a buffer object to system memory
virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object
@@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager {
size_t xferSize //!< Transfer size
) const;
- //! Handle of ROC Device object
- hsa_signal_t completion_signal_;
-
//! Assits in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 7ed5ffc5e6..a19987e3c2 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -260,7 +260,12 @@ Device::~Device() {
context().svmFree(coopHostcallBuffer_);
coopHostcallBuffer_ = nullptr;
}
+
+ if (0 != prefetch_signal_.handle) {
+ hsa_signal_destroy(prefetch_signal_);
+ }
}
+
bool NullDevice::initCompiler(bool isOffline) {
#if defined(WITH_COMPILER_LIB)
// Initialize the compiler handle if has already not been initialized
diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp
index 2f20890cee..9c1055d140 100644
--- a/projects/clr/rocclr/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp
@@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
amd::Coord3D(size()), true)) {
LogError("[OCL] Fail sync the device memory on cpuUnmap");
}
+ // Wait on CPU for the transfer
+ static_cast(vDev).releaseGpuMemoryFence();
}
-
decIndMapCount();
}
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index bb9cfdfacf..bc65024fcf 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Check for queue full and wait if needed.
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
- hsa_signal_t signal;
- // TODO: placeholder to setup the kernel to populate start and end timestamp.
if (timestamp_ != nullptr) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
- if (pool_size >= signal_pool_.size()) {
- ProfilingSignal profiling_signal = {};
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
- LogPrintfError("Failed signal allocation id = %d", pool_size);
- return false;
- }
- signal_pool_.push_back(profiling_signal);
- assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
- }
- // Move index inside the valid pool
- ++current_signal_ %= signal_pool_.size();
- // Find signal slot
- ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
- // Make sure we save the old results in the TS structure
- if (profilingSignal->ts_ != nullptr) {
- profilingSignal->ts_->checkGpuTime();
- }
- // Update the new TS with the signal info
- timestamp_->setProfilingSignal(profilingSignal);
- packet->completion_signal = profilingSignal->signal_;
- profilingSignal->ts_ = timestamp_;
- timestamp_->setAgent(gpu_device_);
+ // Get active signal for current dispatch if profiling is necessary
+ packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
}
// Make sure the slot is free for usage
@@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Add blocking command if the original value of read index was behind of the queue size
if (blocking || (index - read) >= queueMask) {
if (packet->completion_signal.handle == 0) {
- packet->completion_signal = barrier_signal_;
+ packet->completion_signal = Barriers().ActiveSignal();
}
- signal = packet->completion_signal;
- // Initialize signal for a wait
- hsa_signal_store_relaxed(signal, kInitSignalValueOne);
blocking = true;
}
- // If runtime doesn't use the barrier, then make sure it tracks the last submitted command
- if (!dev().settings().barrier_sync_) {
- // Initialize signal for a wait
- assert(packet->completion_signal.handle != 0 &&
- "There is no HSA signal associated with the last command!");
- hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne);
- last_signal_ = packet->completion_signal;
- }
-
// Insert packet(s)
// NOTE: need multiple packets to dispatch the performance counter
// packet blob of the legacy devices (gfx8)
@@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Wait on signal ?
if (blocking) {
- if (!WaitForSignal(signal)) {
- LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
+ if (!Barriers().WaitCurrent()) {
+ LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle);
return false;
}
- // Reset the pool of signals
- current_signal_ = 0;
}
return true;
@@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
return false;
}
+
+// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
assert(packet->completion_signal.handle != 0);
const uint32_t queueSize = gpu_queue_->size;
@@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal);
}
+// ================================================================================================
void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
uint16_t packetHeader, hsa_signal_t signal) {
const uint32_t queueSize = gpu_queue_->size;
@@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
if (signal.handle == 0) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
- if (pool_size >= signal_pool_.size()) {
- ProfilingSignal profiling_signal = {};
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
- LogPrintfError("Failed signal allocation id = %d", pool_size);
- }
- signal_pool_.push_back(profiling_signal);
- assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
- }
- // Move index inside the valid pool
- ++current_signal_ %= signal_pool_.size();
- // Find signal slot
- ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
- // Make sure we save the old results in the TS structure
- if (profilingSignal->ts_ != nullptr) {
- profilingSignal->ts_->checkGpuTime();
- }
- if (timestamp_ != nullptr) {
- // Update the new TS with the signal info
- timestamp_->setProfilingSignal(profilingSignal);
- profilingSignal->ts_ = timestamp_;
- timestamp_->setAgent(gpu_device_);
- }
- packet->completion_signal = profilingSignal->signal_;
- hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne);
+ // Get active signal for current dispatch if profiling is necessary
+ packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
} else {
assert(signal.handle != 0);
packet->completion_signal = signal;
@@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() {
// Release the pool, since runtime just completed a barrier
// @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
resetKernArgPool();
- } else {
- // Reset the pool of signals
- current_signal_ = 0;
}
}
// ================================================================================================
-bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) {
- // Return if there is no pending dispatch
- if (!hasPendingDispatch_) {
- if (dev().settings().barrier_sync_ || !force_barrier) {
- return false;
- }
- }
- hsa_signal_t wait_signal = barrier_signal_;
+bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier, bool skip_cpu_wait) {
+ if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) {
+ barrier_packet_.completion_signal = Barriers().ActiveSignal();
- // If barrier sync was requested or runtime didn't provide the last signal
- if (dev().settings().barrier_sync_ || force_barrier) {
- // Initialize signal for the barrier packet.
- hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
-
- // Dispatch barrier packet into the queue and wait till it finishes.
+ // Dispatch barrier packet into the queue
dispatchBarrierPacket(&barrier_packet_);
- }
- else {
- // Take the signal of the last submitted dispatch
- wait_signal = last_signal_;
+ hasPendingDispatch_ = false;
}
- // Wait for compute work previously submitted
- if (!WaitForSignal(wait_signal)) {
- LogError("Waiting for compute work failed!");
- return false;
+ // Check if runtime could skip CPU wait
+ if (!skip_cpu_wait) {
+ Barriers().WaitCurrent();
+
+ ResetQueueStates();
}
-
- hasPendingDispatch_ = false;
-
- ResetQueueStates();
return true;
}
@@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
}
aqlHeader_ = dispatchPacketHeader_;
- barrier_signal_.handle = 0;
// Note: Virtual GPU device creation must be a thread safe operation
roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
@@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
}
+// ================================================================================================
VirtualGPU::~VirtualGPU() {
delete blitMgr_;
// Release the resources of signal
releaseGpuMemoryFence();
- if (barrier_signal_.handle != 0) {
- hsa_signal_destroy(barrier_signal_);
- }
-
destroyPool();
releasePinnedMem();
@@ -868,8 +790,7 @@ bool VirtualGPU::create() {
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
if (!gpu_queue_) return false;
- if (!initPool(dev().settings().kernargPoolSize_,
- (profiling_ || (amd::IS_HIP)) ? queue_size : 0)) {
+ if (!initPool(dev().settings().kernargPoolSize_)) {
LogError("Couldn't allocate arguments/signals for the queue");
return false;
}
@@ -881,17 +802,9 @@ bool VirtualGPU::create() {
return false;
}
- // Create signal for the barrier packet.
- hsa_signal_t signal = {0};
- if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) {
- return false;
- }
- barrier_signal_ = signal;
-
// Initialize barrier packet.
memset(&barrier_packet_, 0, sizeof(barrier_packet_));
barrier_packet_.header = kInvalidAql;
- barrier_packet_.completion_signal = barrier_signal_;
// Create a object of PrintfDbg
printfdbg_ = new PrintfDbg(roc_device_);
@@ -912,62 +825,32 @@ bool VirtualGPU::create() {
return false;
}
+ // Allocate signal tracker for ROCr copy queue
+ if (!Barriers().Create(gpu_device())) {
+ LogError("Could not create signal for copy queue!");
+ return false;
+ }
return true;
}
-bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
+// ================================================================================================
+bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
if (kernarg_pool_base_ == nullptr) {
return false;
}
-
- // Optimization :
- // For better resource utilization runtime should create them only when required
- // In case of HIP, Apps create short live streams which do not need more signals
- // hence starting with smaller number 32. There is code inplace to grow the pool
- // later when it is needed.
- bool forced_default_pool_sz = false;
- if (!profiling_ && (amd::IS_HIP)) {
- forced_default_pool_sz = true;
- }
-
- if (signal_pool_count != 0) {
- // Reserve signal pool for all entries in the queue, since profiling logic will save the
- // pointer in timestamp info for the future references
- signal_pool_.reserve(signal_pool_count);
- // If barrier is disable, then allocate a small portion of all signals and grow the array later.
- // @note: the optimization requires a wait for signal on reuse, which is only available when
- // the barrier is disabled
- constexpr uint32_t kDefaultSignalPoolSize = 32;
- const uint32_t default_signal_pool_size =
- (dev().settings().barrier_sync_ && !forced_default_pool_sz) ?
- signal_pool_count : kDefaultSignalPoolSize;
- signal_pool_.resize(default_signal_pool_size);
- for (uint i = 0; i < default_signal_pool_size; ++i) {
- ProfilingSignal profilingSignal;
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
- return false;
- }
- signal_pool_[i] = profilingSignal;
- }
- }
-
return true;
}
+// ================================================================================================
void VirtualGPU::destroyPool() {
if (kernarg_pool_base_ != nullptr) {
roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
}
-
- if (signal_pool_.size() > 0) {
- for (uint i = 0; i < signal_pool_.size(); ++i) {
- hsa_signal_destroy(signal_pool_[i].signal_);
- }
- }
}
+// ================================================================================================
void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
char* result = nullptr;
do {
@@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
//! We can issue a barrier to avoid expensive extra memory allocations.
// Initialize signal for the barrier packet.
- hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+ barrier_packet_.completion_signal = Barriers().ActiveSignal();
// Dispatch barrier packet into the queue and wait till it finishes.
dispatchBarrierPacket(&barrier_packet_);
- if (!WaitForSignal(barrier_signal_)) {
+ if (!Barriers().WaitCurrent()) {
LogError("Kernel arguments reset failed");
}
-
resetKernArgPool();
-
- // Reset the pool of signals
- current_signal_ = 0;
}
} while (true);
return result;
}
+// ================================================================================================
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
* virtualgpu's timestamp_, and calls start() to get the current host
* timestamp.
@@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
if (command.profilingInfo().enabled_) {
if (timestamp_ != nullptr) {
- LogWarning(
- "Trying to create a second timestamp in VirtualGPU. \
- This could have unintended consequences.");
+ LogWarning("Trying to create a second timestamp in VirtualGPU. \
+ This could have unintended consequences.");
return;
}
// Without barrier profiling will wait for each individual signal
@@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
}
}
+// ================================================================================================
/* profilingEnd, when profiling is enabled, checks to see if a signal was
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available. It then saves the pointer
@@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
}
}
+// ================================================================================================
void VirtualGPU::updateCommandsState(amd::Command* list) {
Timestamp* ts = nullptr;
@@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
// ================================================================================================
void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
#if AMD_HMM_SUPPORT
+ profilingBegin(cmd);
// Initialize signal for the barrier
- hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+ hsa_signal_t wait = Barriers().WaitSignal();
+ hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
// Find the requested agent for the transfer
hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
// Initiate a prefetch command
hsa_status_t status = hsa_amd_svm_prefetch_async(
- const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_);
+ const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active);
- // Wait for the prefetch
- if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) {
+ // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+ if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
LogError("hsa_amd_svm_prefetch_async failed");
cmd.setStatus(CL_INVALID_OPERATION);
}
// Add system scope, since the prefetch scope is unclear
addSystemScope();
+
+ profilingEnd(cmd);
#endif // AMD_HMM_SUPPORT
}
@@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
if (gpuKernel.dynamicParallelism()) {
+ barrier_packet_.completion_signal.handle = 0;
dispatchBarrierPacket(&barrier_packet_);
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
- getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
+ getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
}
// Check if image buffer write back is required
@@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
uint16_t header = kNopPacketHeader;
hsa_signal_t sig { 0 };
dispatchGenericBarrierPacket(&barrier_packet_, header, sig);
- last_signal_ = barrier_packet_.completion_signal;
- // Restore barrier signal
- barrier_packet_.completion_signal = barrier_signal_;
}
profilingEnd(vcmd);
}
@@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
profilingBegin(vcmd);
if (!dev().settings().barrier_sync_) {
// Force barrier to make sure L2 flush, since interop can be in sysmem
- constexpr bool ForceBarrier = true;
- releaseGpuMemoryFence(ForceBarrier);
+ constexpr bool kForceBarrier = true;
+ releaseGpuMemoryFence(kForceBarrier);
}
profilingEnd(vcmd);
}
@@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
// ================================================================================================
void VirtualGPU::addXferWrite(Memory& memory) {
+ //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+ //! unconditionally, before it can release pinned memory
+ releaseGpuMemoryFence();
if (xferWriteBuffers_.size() > 7) {
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
@@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) {
xferWriteBuffers_.push_back(&memory);
}
+// ================================================================================================
void VirtualGPU::releaseXferWrite() {
for (auto& memory : xferWriteBuffers_) {
dev().xferWrite().release(*this, *memory);
@@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() {
xferWriteBuffers_.resize(0);
}
+// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
+ //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+ //! unconditionally, before it can release pinned memory
+ releaseGpuMemoryFence();
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
@@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
}
}
+// ================================================================================================
void VirtualGPU::releasePinnedMem() {
for (auto& amdMemory : pinnedMems_) {
amdMemory->release();
@@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() {
pinnedMems_.resize(0);
}
+// ================================================================================================
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
for (auto& amdMemory : pinnedMems_) {
if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 7f8b93f49d..413159cf3f 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -36,10 +36,11 @@ class Memory;
class Timestamp;
struct ProfilingSignal : public amd::HeapObject {
- hsa_signal_t signal_; //!< HSA signal to track profiling information
- Timestamp* ts_; //!< Timestamp object associated with the signal
+ hsa_signal_t signal_; //!< HSA signal to track profiling information
+ Timestamp* ts_; //!< Timestamp object associated with the signal
+ bool done_; //!< True if signal is done
- ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; }
+ ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; }
};
// Initial HSA signal value
@@ -111,13 +112,19 @@ class Timestamp {
hsa_amd_profiling_dispatch_time_t time;
if (splittedDispatch_) {
- uint64_t start = UINT64_MAX;
+ uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) {
WaitForSignal(*it);
}
hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
+ if ((time.end - time.start) == 0) {
+ hsa_amd_profiling_async_copy_time_t time_sdma = {};
+ hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+ time.start = time_sdma.start;
+ time.end = time_sdma.end;
+ }
if (time.start < start) {
start = time.start;
}
@@ -133,10 +140,18 @@ class Timestamp {
WaitForSignal(profilingSignal_->signal_);
}
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
- start_ = time.start * ticksToTime_;
- end_ = time.end * ticksToTime_;
+ if ((time.end - time.start) == 0) {
+ hsa_amd_profiling_async_copy_time_t time_sdma = {};
+ hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+ start_ = time_sdma.start * ticksToTime_;
+ end_ = time_sdma.end * ticksToTime_;
+ } else {
+ start_ = time.start * ticksToTime_;
+ end_ = time.end * ticksToTime_;
+ }
}
profilingSignal_->ts_ = nullptr;
+ profilingSignal_->done_ = true;
profilingSignal_ = nullptr;
}
}
@@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice {
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
+  //! Tracks HW operations submitted to the queue with a ring of reusable HSA signals
+  class HwQueueTracker : public amd::EmbeddedObject {
+   public:
+    HwQueueTracker() {}
+
+    //! Destroys the HSA signals and frees the pool. Slots are null after a failed Create()
+    ~HwQueueTracker() {
+      for (auto& signal : signal_list_) {
+        if ((signal != nullptr) && (signal->signal_.handle != 0)) {
+          hsa_signal_destroy(signal->signal_);
+        }
+        delete signal;
+      }
+    }
+
+    //! Creates a pool of signals for tracking of HW operations on the queue
+    bool Create(hsa_agent_t agent) {
+      constexpr size_t kSignalListSize = 16;
+      signal_list_.resize(kSignalListSize);
+      for (uint i = 0; i < kSignalListSize; ++i) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if ((signal == nullptr) ||
+            (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &signal->signal_))) {
+          delete signal;  // Not in the pool yet, the destructor can't release it
+          return false;
+        }
+        signal_list_[i] = signal;
+      }
+      agent_ = agent;
+      return true;
+    }
+
+    //! Finds a free signal for the upcoming operation
+    hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
+                              Timestamp* ts = nullptr, uint32_t queue_size = 0) {
+      // If queue size grows, then add more signals to avoid more frequent stalls
+      if (queue_size > signal_list_.size()) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if ((signal != nullptr) &&
+            (HSA_STATUS_SUCCESS == hsa_signal_create(0, 1, &agent_, &signal->signal_))) {
+          signal_list_.push_back(signal);
+        } else {
+          delete signal;  // Growth is best effort, but must not leak the object
+        }
+      }
+      // Advance to the next slot in the ring
+      current_id_ = (current_id_ + 1) % signal_list_.size();
+      // Make sure the previous operation on the current signal is done
+      WaitCurrent();
+      // Have to wait for the next signal in the queue to avoid a race condition between
+      // a GPU waiter (which may be not triggered yet) and CPU signal reset below
+      WaitNext();
+
+      // Reset the signal and return
+      hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+      signal_list_[current_id_]->done_ = false;
+      if (ts != nullptr) {
+        if (!sdma_profiling_) {
+          hsa_amd_profiling_async_copy_enable(true);
+          sdma_profiling_ = true;
+        }
+        signal_list_[current_id_]->ts_ = ts;
+        ts->setProfilingSignal(signal_list_[current_id_]);
+        ts->setAgent(agent_);
+      }
+      return signal_list_[current_id_]->signal_;
+    }
+
+    //! Wait for the current active signal. Can idle the queue
+    bool WaitCurrent() { return WaitIndex(current_id_); }
+
+    //! Returns the last submitted signal for a wait
+    hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; }
+
+   private:
+    //! Wait for the next active signal
+    void WaitNext() {
+      size_t next = (current_id_ + 1) % signal_list_.size();
+      WaitIndex(next);
+    }
+
+    //! Wait for the provided signal
+    bool WaitIndex(size_t index) {
+      if (!signal_list_[index]->done_) {
+        // Update timestamp values if requested
+        if (signal_list_[index]->ts_ != nullptr) {
+          signal_list_[index]->ts_->checkGpuTime();
+        } else {
+          if (!WaitForSignal(signal_list_[index]->signal_)) {
+            LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_.handle);
+            return false;
+          }
+          signal_list_[index]->done_ = true;
+        }
+      }
+      return true;
+    }
+    std::vector<ProfilingSignal*> signal_list_;  //!< The pool of all signals for processing
+    size_t current_id_ = 0;        //!< Last submitted signal
+    hsa_agent_t agent_ = {};       //!< HSA device agent
+    bool sdma_profiling_ = false;  //!< Don't enable SDMA profiling by default
+  };
+
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
@@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice {
*
* @return bool true if Wait returned successfully, false otherwise
*/
- bool releaseGpuMemoryFence(bool force_barrier = false);
+  bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_cpu_wait = false);
hsa_agent_t gpu_device() { return gpu_device_; }
hsa_queue_t* gpu_queue() { return gpu_queue_; }
@@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice {
void addSystemScope() { addSystemScope_ = true; }
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
+ HwQueueTracker& Barriers() { return barriers_; }
+
+ Timestamp* timestamp() const { return timestamp_; }
+
// } roc OpenCL integration
private:
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
@@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice {
void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
amd::NDRangeContainer& sizes);
- bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
+ bool initPool(size_t kernarg_pool_size);
void destroyPool();
void* allocKernArg(size_t size, size_t alignment);
@@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice {
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql
uint32_t isLastCommandSDMA_ : 1; //!< Keep track if the last command was SDMA and
- //!< not send Barrier packets if barrier_sync is 0
+ //!< not send Barrier packets if barrier_sync is 0
};
uint32_t state_;
};
@@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice {
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
- hsa_signal_t barrier_signal_;
- hsa_signal_t last_signal_ = {}; //!< Last submitted signal
+
uint32_t dispatch_id_; //!< This variable must be updated atomically.
Device& roc_device_; //!< roc device object
PrintfDbg* printfdbg_;
@@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice {
hsa_queue_t* schedulerQueue_;
hsa_signal_t schedulerSignal_;
+ HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
+
char* kernarg_pool_base_;
size_t kernarg_pool_size_;
uint kernarg_pool_cur_offset_;
- std::vector signal_pool_; //!< Pool of signals for profiling
- uint32_t current_signal_ = 0; //!< Current avaialble signal in the pool
friend class Timestamp;
// PM4 packet for gfx8 performance counter