From 30cf81fc939ff775e5b2ed69d2829e7c0116045e Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Thu, 7 Jan 2021 16:41:30 -0500 Subject: [PATCH] Add HSA signal global tracking logic. Implement the global class for signals tracking per device queue. Switch to the new tracking mechanism. Change-Id: I3c4dda04b34e6d18d6a95510d84102909633b415 [ROCm/clr commit: 8698aeef0d35bb363d9340613e04b5af3fd47ffa] --- projects/clr/rocclr/device/rocm/rocblit.cpp | 172 ++++++------- projects/clr/rocclr/device/rocm/rocblit.hpp | 19 +- projects/clr/rocclr/device/rocm/rocdevice.cpp | 5 + projects/clr/rocclr/device/rocm/rocmemory.cpp | 3 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 227 +++++------------- .../clr/rocclr/device/rocm/rocvirtual.hpp | 147 +++++++++++- 6 files changed, 284 insertions(+), 289 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 320c8885fc..e983af887a 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - gpu().releaseGpuMemoryFence(); + // HSA copy functionality with a possible async operation + gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait); // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + // Stall GPU before CPU access + gpu().Barriers().WaitCurrent(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); } else { size_t srcSize = size[0]; @@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool 
DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done + // HSA copy functionality with a possible async operation gpu().releaseGpuMemoryFence(); // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + // Stall GPU before CPU access + gpu().Barriers().WaitCurrent(); return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); } else { Memory& xferBuf = dev().xferRead().acquire(); @@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done + // HSA copy functionality with a possible async operation gpu().releaseGpuMemoryFence(); if (setup_.disableReadImage_) { @@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } else { + // HSA copy functionality with a possible async operation + 
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait); + size_t dstSize = size[0]; size_t tmpSize = 0; size_t offset = 0; @@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem const amd::BufferRect& hostRect, const amd::BufferRect& bufRect, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done + // HSA copy functionality with a possible async operation gpu().releaseGpuMemoryFence(); // Use host copy if memory has direct access @@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done + // HSA copy functionality with a possible async operation gpu().releaseGpuMemoryFence(); if (setup_.disableWriteImage_) { @@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() && (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); } else { return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size); @@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, 
device::Memory& dstMe bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBufferRect_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() && dstMemory.isHostMemDirectAccess())) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); } else { + gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait); void* src = gpuMem(srcMemory).getDeviceMemory(); void* dst = gpuMem(dstMemory).getDeviceMemory(); @@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d } if (isSubwindowRectCopy ) { - hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne); + hsa_signal_t wait = gpu().Barriers().WaitSignal(); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Copy memory line by line - hsa_status_t status = - hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, - direction, 0, nullptr, completion_signal_); + hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, + &srcMem, &offset, &dim, agent, direction, 1, &wait, active); if (status != HSA_STATUS_SUCCESS) { LogPrintfError("DMA buffer failed with code %d", status); return false; } - - if (!WaitForSignal(completion_signal_)) { - LogError("Async copy failed"); - return false; - } } else { // Fall to line by line copies const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_store_relaxed(completion_signal_, kInitVal); + hsa_signal_t wait = gpu().Barriers().WaitSignal(); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp()); for (size_t z = 
0; z < size[2]; ++z) { for (size_t y = 0; y < size[1]; ++y) { @@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d size_t dstOffset = dstRect.offset(0, y, z); // Copy memory line by line - hsa_status_t status = - hsa_amd_memory_async_copy((reinterpret_cast
(dst) + dstOffset), dstAgent, - (reinterpret_cast(src) + srcOffset), srcAgent, - size[0], 0, nullptr, completion_signal_); + hsa_status_t status = hsa_amd_memory_async_copy( + (reinterpret_cast
(dst) + dstOffset), dstAgent, + (reinterpret_cast(src) + srcOffset), srcAgent, + size[0], 1, &wait, active); gpu().setLastCommandSDMA(true) ; if (status != HSA_STATUS_SUCCESS) { LogPrintfError("DMA buffer failed with code %d", status); @@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d } } } - - if (!WaitForSignal(completion_signal_)) { - LogError("Async copy failed"); - return false; - } } - } + // Explicit wait for now, until runtime could distinguish compute and sdma operations + gpu().Barriers().WaitCurrent(); return true; } @@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) { - gpu().releaseGpuMemoryFence(true); - } else { - gpu().releaseGpuMemoryFence(); - } + // HSA copy functionality with a possible async operation, hence make sure GPU is done + bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA(); + gpu().releaseGpuMemoryFence(force_barrier); bool result = false; @@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory } else { Image& srcImage = static_cast(srcMemory); Buffer& dstBuffer = static_cast(dstMemory); - - // Use ROC path for a transfer - // Note: it doesn't support SDMA address dstHost = reinterpret_cast
(dstBuffer.getDeviceMemory()) + dstOrigin[0]; // Use ROCm path for a transfer. @@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done - if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) { - gpu().releaseGpuMemoryFence(true); - } else { - gpu().releaseGpuMemoryFence(); - } + // HSA copy functionality with a possible async operation, hence make sure GPU is done + bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA(); + gpu().releaseGpuMemoryFence(force_barrier); bool result = false; @@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { - // HSA copy functionality with a possible async operaiton, hence make sure GPU is done + // HSA copy functionality with a possible async operation, hence make sure GPU is done gpu().releaseGpuMemoryFence(); bool result = false; @@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, address src = reinterpret_cast
(srcMemory.getDeviceMemory()); address dst = reinterpret_cast
(dstMemory.getDeviceMemory()); - if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) { - gpu().releaseGpuMemoryFence(true); - } + bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA(); + gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait); src += srcOrigin[0]; dst += dstOrigin[0]; @@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Just call copy function for full profile hsa_status_t status; if (dev().agent_profile() == HSA_PROFILE_FULL) { + // Stall GPU, since CPU copy is possible + gpu().Barriers().WaitCurrent(); status = hsa_memory_copy(dst, src, size[0]); if (status != HSA_STATUS_SUCCESS) { LogPrintfError("Hsa copy of data failed with code %d", status); @@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, srcAgent = dstAgent = dev().getBackendDevice(); } - hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne); - + hsa_signal_t wait = gpu().Barriers().WaitSignal(); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Use SDMA to transfer the data - status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr, - completion_signal_); + status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active); gpu().setLastCommandSDMA(true); + // Explicit wait for now, until runtime could distinguish compute and sdma operations + gpu().Barriers().WaitCurrent(); if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val; - - if (!WaitForSignal(completion_signal_)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; - } else { - gpu().addSystemScope(); - } + gpu().addSystemScope(); } else { LogPrintfError("Hsa copy from host to device failed with code %d", status); } @@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // 
================================================================================================ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const { + // Stall GPU, since CPU copy is possible + bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA(); + gpu().releaseGpuMemoryFence(force_barrier); + // No allocation is necessary for Full Profile hsa_status_t status; if (dev().agent_profile() == HSA_PROFILE_FULL) { @@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ size_t offset = 0; address hsaBuffer = staging; - if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) { - gpu().releaseGpuMemoryFence(true); - } // Allocate requested size of memory while (totalSize > 0) { size = std::min(totalSize, dev().settings().stagedXferSize_); - hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Copy data from Host to Device if (hostToDev) { @@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ memcpy(hsaBuffer, hostSrc + offset, size); status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer, - srcAgent, size, 0, nullptr, completion_signal_); + srcAgent, size, 0, nullptr, active); gpu().setLastCommandSDMA(true); - if (status == HSA_STATUS_SUCCESS) { - if (!WaitForSignal(completion_signal_)) { - LogError("Async copy failed"); - return false; - } - } else { + if (status != HSA_STATUS_SUCCESS) { LogPrintfError("Hsa copy from host to device failed with code %d", status); return false; } + gpu().Barriers().WaitCurrent(); totalSize -= size; offset += size; continue; @@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ (size <= dev().settings().sdmaCopyThreshold_) ? 
dev().getBackendDevice() : dev().getCpuAgent(); // Copy data from Device to Host - status = - hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset, - dev().getBackendDevice(), size, 0, nullptr, completion_signal_); + status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset, + dev().getBackendDevice(), size, 0, nullptr, active); gpu().setLastCommandSDMA(true); if (status == HSA_STATUS_SUCCESS) { - if (!WaitForSignal(completion_signal_)) { - LogError("Async copy failed"); - return false; - } + gpu().Barriers().WaitCurrent(); memcpy(hostDst + offset, hsaBuffer, size); } else { LogPrintfError("Hsa copy from device to host failed with code %d", status); @@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, releaseArguments(parameters); if (releaseView) { // todo SRD programming could be changed to avoid a stall - if(!dev().settings().barrier_sync_) { - gpu().releaseGpuMemoryFence(true); - } else { - gpu().releaseGpuMemoryFence(); - } + gpu().releaseGpuMemoryFence(); dstView->owner()->release(); } @@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, releaseArguments(parameters); if (releaseView) { // todo SRD programming could be changed to avoid a stall - if(!dev().settings().barrier_sync_) { - gpu().releaseGpuMemoryFence(true); - } else { - gpu().releaseGpuMemoryFence(); - } + gpu().releaseGpuMemoryFence(); srcView->owner()->release(); } @@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost, // Use host copy if memory has direct access if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); synchronize(); return result; @@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, 
device::Memory& dstMemor // Use host copy if memory has direct access if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); synchronize(); return result; @@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); synchronize(); return result; @@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); synchronize(); return result; @@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo // Use host copy if memory has direct access if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); synchronize(); return result; @@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst // Use host copy if memory has direct access if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = 
HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); synchronize(); return result; @@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, // Use host fill if memory has direct access if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire); synchronize(); return result; @@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, // Use host fill if memory has direct access if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) { + // Stall GPU before CPU access + gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); synchronize(); return result; diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index c4f4b89ba9..5f344ff1af 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -40,6 +40,9 @@ class Kernel; class Memory; class VirtualGPU; +constexpr bool kSkipCpuWait = true; +constexpr bool kIgnoreBarrier = false; + //! DMA Blit Manager class DmaBlitManager : public device::HostBlitManager { public: @@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager { ); //! Destructor - virtual ~DmaBlitManager() { - if (completion_signal_.handle != 0) { - hsa_signal_destroy(completion_signal_); - } - } + virtual ~DmaBlitManager() {} //! Creates DmaBlitManager object - virtual bool create(amd::Device& device) { - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) { - return false; - } - return true; - } + virtual bool create(amd::Device& device) { return true; } //! 
Copies a buffer object to system memory virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object @@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager { size_t xferSize //!< Transfer size ) const; - //! Handle of ROC Device object - hsa_signal_t completion_signal_; - //! Assits in transferring data from Host to Local or vice versa //! taking into account the Hsail profile supported by Hsa Agent bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 7ed5ffc5e6..a19987e3c2 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -260,7 +260,12 @@ Device::~Device() { context().svmFree(coopHostcallBuffer_); coopHostcallBuffer_ = nullptr; } + + if (0 != prefetch_signal_.handle) { + hsa_signal_destroy(prefetch_signal_); + } } + bool NullDevice::initCompiler(bool isOffline) { #if defined(WITH_COMPILER_LIB) // Initialize the compiler handle if has already not been initialized diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index 2f20890cee..9c1055d140 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) { amd::Coord3D(size()), true)) { LogError("[OCL] Fail sync the device memory on cpuUnmap"); } + // Wait on CPU for the transfer + static_cast(vDev).releaseGpuMemoryFence(); } - decIndMapCount(); } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index bb9cfdfacf..bc65024fcf 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket( // Check for queue full and wait if needed. 
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); - hsa_signal_t signal; - // TODO: placeholder to setup the kernel to populate start and end timestamp. if (timestamp_ != nullptr) { // Pool size must grow to the size of pending AQL packets const uint32_t pool_size = index - read; - if (pool_size >= signal_pool_.size()) { - ProfilingSignal profiling_signal = {}; - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) { - LogPrintfError("Failed signal allocation id = %d", pool_size); - return false; - } - signal_pool_.push_back(profiling_signal); - assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!"); - } - // Move index inside the valid pool - ++current_signal_ %= signal_pool_.size(); - // Find signal slot - ProfilingSignal* profilingSignal = &signal_pool_[current_signal_]; - // Make sure we save the old results in the TS structure - if (profilingSignal->ts_ != nullptr) { - profilingSignal->ts_->checkGpuTime(); - } - // Update the new TS with the signal info - timestamp_->setProfilingSignal(profilingSignal); - packet->completion_signal = profilingSignal->signal_; - profilingSignal->ts_ = timestamp_; - timestamp_->setAgent(gpu_device_); + // Get active signal for current dispatch if profiling is necessary + packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); } // Make sure the slot is free for usage @@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket( // Add blocking command if the original value of read index was behind of the queue size if (blocking || (index - read) >= queueMask) { if (packet->completion_signal.handle == 0) { - packet->completion_signal = barrier_signal_; + packet->completion_signal = Barriers().ActiveSignal(); } - signal = packet->completion_signal; - // Initialize signal for a wait - hsa_signal_store_relaxed(signal, kInitSignalValueOne); blocking = true; 
} - // If runtime doesn't use the barrier, then make sure it tracks the last submitted command - if (!dev().settings().barrier_sync_) { - // Initialize signal for a wait - assert(packet->completion_signal.handle != 0 && - "There is no HSA signal associated with the last command!"); - hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne); - last_signal_ = packet->completion_signal; - } - // Insert packet(s) // NOTE: need multiple packets to dispatch the performance counter // packet blob of the legacy devices (gfx8) @@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket( // Wait on signal ? if (blocking) { - if (!WaitForSignal(signal)) { - LogPrintfError("Failed signal [0x%lx] wait", signal.handle); + if (!Barriers().WaitCurrent()) { + LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle); return false; } - // Reset the pool of signals - current_signal_ = 0; } return true; @@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } + +// ================================================================================================ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { assert(packet->completion_signal.handle != 0); const uint32_t queueSize = gpu_queue_->size; @@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); } +// ================================================================================================ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, uint16_t packetHeader, hsa_signal_t signal) { const uint32_t queueSize = gpu_queue_->size; @@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, if (signal.handle == 0) { // Pool size must grow to the size of pending AQL packets const uint32_t 
pool_size = index - read; - if (pool_size >= signal_pool_.size()) { - ProfilingSignal profiling_signal = {}; - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) { - LogPrintfError("Failed signal allocation id = %d", pool_size); - } - signal_pool_.push_back(profiling_signal); - assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!"); - } - // Move index inside the valid pool - ++current_signal_ %= signal_pool_.size(); - // Find signal slot - ProfilingSignal* profilingSignal = &signal_pool_[current_signal_]; - // Make sure we save the old results in the TS structure - if (profilingSignal->ts_ != nullptr) { - profilingSignal->ts_->checkGpuTime(); - } - if (timestamp_ != nullptr) { - // Update the new TS with the signal info - timestamp_->setProfilingSignal(profilingSignal); - profilingSignal->ts_ = timestamp_; - timestamp_->setAgent(gpu_device_); - } - packet->completion_signal = profilingSignal->signal_; - hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne); + // Get active signal for current dispatch if profiling is necessary + packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); } else { assert(signal.handle != 0); packet->completion_signal = signal; @@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() { // Release the pool, since runtime just completed a barrier // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued resetKernArgPool(); - } else { - // Reset the pool of signals - current_signal_ = 0; } } // ================================================================================================ -bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) { - // Return if there is no pending dispatch - if (!hasPendingDispatch_) { - if (dev().settings().barrier_sync_ || !force_barrier) { - return false; - } - } - hsa_signal_t wait_signal = barrier_signal_; +bool VirtualGPU::releaseGpuMemoryFence(bool 
force_barrier, bool skip_cpu_wait) { + if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) { + barrier_packet_.completion_signal = Barriers().ActiveSignal(); - // If barrier sync was requested or runtime didn't provide the last signal - if (dev().settings().barrier_sync_ || force_barrier) { - // Initialize signal for the barrier packet. - hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne); - - // Dispatch barrier packet into the queue and wait till it finishes. + // Dispatch barrier packet into the queue dispatchBarrierPacket(&barrier_packet_); - } - else { - // Take the signal of the last submitted dispatch - wait_signal = last_signal_; + hasPendingDispatch_ = false; } - // Wait for compute work previously submitted - if (!WaitForSignal(wait_signal)) { - LogError("Waiting for compute work failed!"); - return false; + // Check if runtime could skip CPU wait + if (!skip_cpu_wait) { + Barriers().WaitCurrent(); + + ResetQueueStates(); } - - hasPendingDispatch_ = false; - - ResetQueueStates(); return true; } @@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, } aqlHeader_ = dispatchPacketHeader_; - barrier_signal_.handle = 0; // Note: Virtual GPU device creation must be a thread safe operation roc_device_.vgpus_.resize(roc_device_.numOfVgpus_); @@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, } +// ================================================================================================ VirtualGPU::~VirtualGPU() { delete blitMgr_; // Release the resources of signal releaseGpuMemoryFence(); - if (barrier_signal_.handle != 0) { - hsa_signal_destroy(barrier_signal_); - } - destroyPool(); releasePinnedMem(); @@ -868,8 +790,7 @@ bool VirtualGPU::create() { gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_); if (!gpu_queue_) return false; - if (!initPool(dev().settings().kernargPoolSize_, - (profiling_ || 
(amd::IS_HIP)) ? queue_size : 0)) { + if (!initPool(dev().settings().kernargPoolSize_)) { LogError("Couldn't allocate arguments/signals for the queue"); return false; } @@ -881,17 +802,9 @@ bool VirtualGPU::create() { return false; } - // Create signal for the barrier packet. - hsa_signal_t signal = {0}; - if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) { - return false; - } - barrier_signal_ = signal; - // Initialize barrier packet. memset(&barrier_packet_, 0, sizeof(barrier_packet_)); barrier_packet_.header = kInvalidAql; - barrier_packet_.completion_signal = barrier_signal_; // Create a object of PrintfDbg printfdbg_ = new PrintfDbg(roc_device_); @@ -912,62 +825,32 @@ bool VirtualGPU::create() { return false; } + // Allocate signal tracker for ROCr copy queue + if (!Barriers().Create(gpu_device())) { + LogError("Could not create signal for copy queue!"); + return false; + } return true; } -bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) { +// ================================================================================================ +bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, false)); if (kernarg_pool_base_ == nullptr) { return false; } - - // Optimization : - // For better resource utilization runtime should create them only when required - // In case of HIP, Apps create short live streams which do not need more signals - // hence starting with smaller number 32. There is code inplace to grow the pool - // later when it is needed. 
- bool forced_default_pool_sz = false; - if (!profiling_ && (amd::IS_HIP)) { - forced_default_pool_sz = true; - } - - if (signal_pool_count != 0) { - // Reserve signal pool for all entries in the queue, since profiling logic will save the - // pointer in timestamp info for the future references - signal_pool_.reserve(signal_pool_count); - // If barrier is disable, then allocate a small portion of all signals and grow the array later. - // @note: the optimization requires a wait for signal on reuse, which is only available when - // the barrier is disabled - constexpr uint32_t kDefaultSignalPoolSize = 32; - const uint32_t default_signal_pool_size = - (dev().settings().barrier_sync_ && !forced_default_pool_sz) ? - signal_pool_count : kDefaultSignalPoolSize; - signal_pool_.resize(default_signal_pool_size); - for (uint i = 0; i < default_signal_pool_size; ++i) { - ProfilingSignal profilingSignal; - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) { - return false; - } - signal_pool_[i] = profilingSignal; - } - } - return true; } +// ================================================================================================ void VirtualGPU::destroyPool() { if (kernarg_pool_base_ != nullptr) { roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_); } - - if (signal_pool_.size() > 0) { - for (uint i = 0; i < signal_pool_.size(); ++i) { - hsa_signal_destroy(signal_pool_[i].signal_); - } - } } +// ================================================================================================ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { char* result = nullptr; do { @@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { //! We can issue a barrier to avoid expensive extra memory allocations. // Initialize signal for the barrier packet. 
- hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne); + barrier_packet_.completion_signal = Barriers().ActiveSignal(); // Dispatch barrier packet into the queue and wait till it finishes. dispatchBarrierPacket(&barrier_packet_); - if (!WaitForSignal(barrier_signal_)) { + if (!Barriers().WaitCurrent()) { LogError("Kernel arguments reset failed"); } - resetKernArgPool(); - - // Reset the pool of signals - current_signal_ = 0; } } while (true); return result; } +// ================================================================================================ /* profilingBegin, when profiling is enabled, creates a timestamp to save in * virtualgpu's timestamp_, and calls start() to get the current host * timestamp. @@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { if (command.profilingInfo().enabled_) { if (timestamp_ != nullptr) { - LogWarning( - "Trying to create a second timestamp in VirtualGPU. \ - This could have unintended consequences."); + LogWarning("Trying to create a second timestamp in VirtualGPU. \ + This could have unintended consequences."); return; } // Without barrier profiling will wait for each individual signal @@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { } } +// ================================================================================================ /* profilingEnd, when profiling is enabled, checks to see if a signal was * created for whatever command we are running and calls end() to get the * current host timestamp if no signal is available. 
It then saves the pointer @@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { } } +// ================================================================================================ void VirtualGPU::updateCommandsState(amd::Command* list) { Timestamp* ts = nullptr; @@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { // ================================================================================================ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { #if AMD_HMM_SUPPORT + profilingBegin(cmd); // Initialize signal for the barrier - hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne); + hsa_signal_t wait = Barriers().WaitSignal(); + hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); // Find the requested agent for the transfer hsa_agent_t agent = (cmd.cpu_access() || @@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { // Initiate a prefetch command hsa_status_t status = hsa_amd_svm_prefetch_async( - const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_); + const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active); - // Wait for the prefetch - if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) { + // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution. 
+ if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { LogError("hsa_amd_svm_prefetch_async failed"); cmd.setStatus(CL_INVALID_OPERATION); } // Add system scope, since the prefetch scope is unclear addSystemScope(); + + profilingEnd(cmd); #endif // AMD_HMM_SUPPORT } @@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } if (gpuKernel.dynamicParallelism()) { + barrier_packet_.completion_signal.handle = 0; dispatchBarrierPacket(&barrier_packet_); static_cast<KernelBlitManager&>(blitMgr()).runScheduler( - getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_); + getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_); } // Check if image buffer write back is required @@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { uint16_t header = kNopPacketHeader; hsa_signal_t sig { 0 }; dispatchGenericBarrierPacket(&barrier_packet_, header, sig); - last_signal_ = barrier_packet_.completion_signal; - // Restore barrier signal - barrier_packet_.completion_signal = barrier_signal_; } profilingEnd(vcmd); } @@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { profilingBegin(vcmd); if (!dev().settings().barrier_sync_) { // Force barrier to make sure L2 flush, since interop can be in sysmem - constexpr bool ForceBarrier = true; - releaseGpuMemoryFence(ForceBarrier); + constexpr bool kForceBarrier = true; + releaseGpuMemoryFence(kForceBarrier); } profilingEnd(vcmd); } @@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { // ================================================================================================ void VirtualGPU::addXferWrite(Memory& memory) { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! 
unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); if (xferWriteBuffers_.size() > 7) { dev().xferWrite().release(*this, *xferWriteBuffers_.front()); xferWriteBuffers_.erase(xferWriteBuffers_.begin()); @@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) { xferWriteBuffers_.push_back(&memory); } +// ================================================================================================ void VirtualGPU::releaseXferWrite() { for (auto& memory : xferWriteBuffers_) { dev().xferWrite().release(*this, *memory); @@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() { xferWriteBuffers_.resize(0); } +// ================================================================================================ void VirtualGPU::addPinnedMem(amd::Memory* mem) { + //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait + //! unconditionally, before it can release pinned memory + releaseGpuMemoryFence(); if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { pinnedMems_.front()->release(); @@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) { } } +// ================================================================================================ void VirtualGPU::releasePinnedMem() { for (auto& amdMemory : pinnedMems_) { amdMemory->release(); @@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() { pinnedMems_.resize(0); } +// ================================================================================================ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { for (auto& amdMemory : pinnedMems_) { if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 7f8b93f49d..413159cf3f 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ 
b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -36,10 +36,11 @@ class Memory; class Timestamp; struct ProfilingSignal : public amd::HeapObject { - hsa_signal_t signal_; //!< HSA signal to track profiling information - Timestamp* ts_; //!< Timestamp object associated with the signal + hsa_signal_t signal_; //!< HSA signal to track profiling information + Timestamp* ts_; //!< Timestamp object associated with the signal + bool done_; //!< True if signal is done - ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; } + ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; } }; // Initial HSA signal value @@ -111,13 +112,19 @@ class Timestamp { hsa_amd_profiling_dispatch_time_t time; if (splittedDispatch_) { - uint64_t start = UINT64_MAX; + uint64_t start = std::numeric_limits<uint64_t>::max(); uint64_t end = 0; for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) { WaitForSignal(*it); } hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); + if ((time.end - time.start) == 0) { + hsa_amd_profiling_async_copy_time_t time_sdma = {}; + hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma); + time.start = time_sdma.start; + time.end = time_sdma.end; + } if (time.start < start) { start = time.start; } @@ -133,10 +140,18 @@ class Timestamp { WaitForSignal(profilingSignal_->signal_); } hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); - start_ = time.start * ticksToTime_; - end_ = time.end * ticksToTime_; + if ((time.end - time.start) == 0) { + hsa_amd_profiling_async_copy_time_t time_sdma = {}; + hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma); + start_ = time_sdma.start * ticksToTime_; + end_ = time_sdma.end * ticksToTime_; + } else { + start_ = time.start * ticksToTime_; + end_ = time.end * ticksToTime_; + } } profilingSignal_->ts_ = nullptr; + profilingSignal_->done_ = true; profilingSignal_ = 
nullptr; } } @@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice { size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue }; + class HwQueueTracker : public amd::EmbeddedObject { + public: + HwQueueTracker() {} + + ~HwQueueTracker() { + for (auto& signal: signal_list_) { + if (signal->signal_.handle != 0) { + hsa_signal_destroy(signal->signal_); + } + delete signal; + } + } + + //! Creates a pool of signals for tracking of HW operations on the queue + bool Create(hsa_agent_t agent) { + constexpr size_t kSignalListSize = 16; + signal_list_.resize(kSignalListSize); + for (uint i = 0; i < kSignalListSize; ++i) { + ProfilingSignal* signal = new ProfilingSignal(); + if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create( + 0, 1, &agent, &signal->signal_))) { + return false; + } + signal_list_[i] = signal; + } + agent_ = agent; + return true; + } + + //! Finds a free signal for the upcoming operation + hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne, + Timestamp* ts = nullptr, uint32_t queue_size = 0) { + // If queue size grows, then add more signals to avoid more frequent stalls + if (queue_size > signal_list_.size()) { + ProfilingSignal* signal = new ProfilingSignal(); + if (signal != nullptr) { + if (HSA_STATUS_SUCCESS == hsa_signal_create( + 0, 1, &agent_, &signal->signal_)) { + signal_list_.push_back(signal); + } + } + } + // Find valid index + ++current_id_ %= signal_list_.size(); + + // Make sure the previous operation on the current signal is done + WaitCurrent(); + + // Have to wait the next signal in the queue to avoid a race condition between + // a GPU waiter(which may be not triggered yet) and CPU signal reset below + WaitNext(); + + // Reset the signal and return + hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val); + signal_list_[current_id_]->done_ = false; + if (ts != 0) { + if (!sdma_profiling_) { + hsa_amd_profiling_async_copy_enable(true); + 
sdma_profiling_ = true; + } + signal_list_[current_id_]->ts_ = ts; + ts->setProfilingSignal(signal_list_[current_id_]); + ts->setAgent(agent_); + } + return signal_list_[current_id_]->signal_; + } + + //! Wait for the current active signal. Can idle the queue + bool WaitCurrent() { return WaitIndex(current_id_); } + + //! Returns the last submitted signal for a wait + hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; } + + private: + //! Wait for the next active signal + void WaitNext() { + size_t next = (current_id_ + 1) % signal_list_.size(); + WaitIndex(next); + } + + //! Wait for the provided signal + bool WaitIndex(size_t index) { + // Wait for the current signal + if (!signal_list_[index]->done_) { + // Update timestamp values if requested + if (signal_list_[index]->ts_ != nullptr) { + signal_list_[index]->ts_->checkGpuTime(); + } else { + if (!WaitForSignal(signal_list_[index]->signal_)) { + LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_); + return false; + } + signal_list_[index]->done_ = true; + } + } + return true; + } + std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing + size_t current_id_ = 0; //!< Last submitted signal + hsa_agent_t agent_; //!< HSA device agent + bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default + }; + VirtualGPU(Device& device, bool profiling = false, bool cooperative = false, const std::vector<uint32_t>& cuMask = {}, amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal); @@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice { * * @return bool true if Wait returned successfully, false otherwise */ - bool releaseGpuMemoryFence(bool force_barrier = false); + bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false); hsa_agent_t gpu_device() { return gpu_device_; } hsa_queue_t* gpu_queue() { return gpu_queue_; } @@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice { void 
addSystemScope() { addSystemScope_ = true; } void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; } + HwQueueTracker& Barriers() { return barriers_; } + + Timestamp* timestamp() const { return timestamp_; } + // } roc OpenCL integration private: bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, @@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice { void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes); - bool initPool(size_t kernarg_pool_size, uint signal_pool_count); + bool initPool(size_t kernarg_pool_size); void destroyPool(); void* allocKernArg(size_t size, size_t alignment); @@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice { uint32_t cooperative_ : 1; //!< Cooperative launch is enabled uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql uint32_t isLastCommandSDMA_ : 1; //!< Keep track if the last command was SDMA and - //!< not send Barrier packets if barrier_sync is 0 + //!< not send Barrier packets if barrier_sync is 0 }; uint32_t state_; }; @@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice { hsa_agent_t gpu_device_; //!< Physical device hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu hsa_barrier_and_packet_t barrier_packet_; - hsa_signal_t barrier_signal_; - hsa_signal_t last_signal_ = {}; //!< Last submitted signal + uint32_t dispatch_id_; //!< This variable must be updated atomically. 
Device& roc_device_; //!< roc device object PrintfDbg* printfdbg_; @@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice { hsa_queue_t* schedulerQueue_; hsa_signal_t schedulerSignal_; + HwQueueTracker barriers_; //!< Tracks active barriers in ROCr + char* kernarg_pool_base_; size_t kernarg_pool_size_; uint kernarg_pool_cur_offset_; - std::vector<ProfilingSignal> signal_pool_; //!< Pool of signals for profiling - uint32_t current_signal_ = 0; //!< Current avaialble signal in the pool friend class Timestamp; // PM4 packet for gfx8 performance counter