diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 3e9f96b2ef..fc30c0f0db 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -599,7 +599,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin return hipSuccess; } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || ((srcMemory != nullptr) && (dstMemory == nullptr))) { - isHostAsync = false; + // Don't wait for unpinned H2D copy if staging is used for copy + isHostAsync &= ((srcMemory == nullptr) && (dstMemory != nullptr) && AMD_DIRECT_DISPATCH && + (sizeBytes <= stream.device().settings().stagedXferSize_)) ? true : false; } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) { hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & srcMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice; diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index c995e5e85e..ef82630325 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -707,6 +707,8 @@ class Settings : public amd::HeapObject { //! Enable the specified extension void enableExtension(uint name) { extensions_ |= static_cast(1) << name; } + size_t stagedXferSize_ = 0; //!< Staged buffer size + private: //! Disable copy constructor Settings(const Settings&); diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index bd8aac9a4f..cb270ad5c6 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -98,7 +98,6 @@ class Settings : public device::Settings { uint hostMemDirectAccess_; //!< Enables direct access to the host memory uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - size_t stagedXferSize_; //!< Staged buffer size size_t pinnedXferSize_; //!< Pinned buffer size for transfer size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer size_t cpDmaCopySizeMax_; //!< Threshold for CP DMA path in copy diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 132fe7d2fb..84f659d44e 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -217,11 +217,10 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, } // ================================================================================================ -bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf, +bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging, size_t origin, size_t& offset, size_t& totalSize, size_t xferSize) const { address dst = dstMemory.getDeviceMemory(); - address staging = xferBuf.getDeviceMemory(); // Copy data from host to device dst += origin + offset; @@ -308,16 +307,15 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, } if (dstSize != 0) { - Memory& xferBuf = dev().xferWrite().acquire(); + address staging = gpu().Staging().Acquire( + std::min(dstSize, dev().settings().stagedXferSize_)); // Write memory using a staging resource - if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize, dstSize)) { LogError("DmaBlitManager::writeBuffer failed!"); return false; } - - gpu().addXferWrite(xferBuf); } } @@ -338,8 +336,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire, copyMetadata); } else { - Memory& xferBuf = dev().xferWrite().acquire(); - address staging = xferBuf.getDeviceMemory(); + address staging = gpu().Staging().Acquire( + std::min(size[0], dev().settings().stagedXferSize_)); address dst = static_cast(dstMemory).getDeviceMemory(); size_t srcOffset; @@ -358,7 +356,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem } } } - gpu().addXferWrite(xferBuf); } return true; @@ -780,7 +777,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const { // Stall GPU, sicne CPU copy is possible - gpu().releaseGpuMemoryFence(); + gpu().releaseGpuMemoryFence(hostToDev); // No allocation is necessary for Full Profile hsa_status_t status; @@ -826,8 +823,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ LogPrintfError("Hsa copy from host to device failed with code %d", status); return false; } - gpu().Barriers().WaitCurrent(); totalSize -= size; + if (totalSize > 0) { + // Wait if there are extra copies, which don't fit in a single staging buffer + gpu().Barriers().WaitCurrent(); + } offset += size; continue; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index f01ce40bdf..537fb4f628 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -261,7 +261,7 @@ class DmaBlitManager : public device::HostBlitManager { //! Write into video memory, using a staged buffer bool writeMemoryStaged(const void* srcHost, //!< Source host memory Memory& dstMemory, //!< Destination memory object - Memory& xferBuf, //!< Staged buffer for write + address staging, //!< Staged buffer for write size_t origin, //!< Original offset in the destination memory size_t& offset, //!< Offset for the current copy pointer size_t& totalSize, //!< Total size for the copy region diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index dd4f37b091..e2a82d8c71 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -179,7 +179,6 @@ Device::Device(hsa_agent_t bkendDevice) , alloc_granularity_(0) , xferQueue_(nullptr) , xferRead_(nullptr) - , xferWrite_(nullptr) , freeMem_(0) , vgpusAccess_(true) /* Virtual GPU List Ops Lock */ , hsa_exclusive_gpu_access_(false) @@ -290,7 +289,6 @@ Device::~Device() { // Destroy temporary buffers for read/write delete xferRead_; - delete xferWrite_; // Destroy transfer queue delete xferQueue_; @@ -823,15 +821,6 @@ bool Device::create() { mapCache_->push_back(nullptr); if (settings().stagedXferSize_ != 0) { - // Initialize staged write buffers - if (settings().stagedXferWrite_) { - xferWrite_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferWrite_ == nullptr) || !xferWrite_->create()) { - LogError("Couldn't allocate transfer buffer objects for read"); - return false; - } - } - // Initialize staged read buffers if (settings().stagedXferRead_) { xferRead_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 7111fcec14..6417c9d881 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -520,9 +520,6 @@ class Device : public NullDevice { //! Adds a map target to the cache bool addMapTarget(amd::Memory* memory) const; - //! Returns transfer buffer object - XferBuffers& xferWrite() const { return *xferWrite_; } - //! Returns transfer buffer object XferBuffers& xferRead() const { return *xferRead_; } @@ -653,7 +650,6 @@ class Device : public NullDevice { VirtualGPU* xferQueue_; //!< Transfer queue, created on demand XferBuffers* xferRead_; //!< Transfer buffers read - XferBuffers* xferWrite_; //!< Transfer buffers write std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index 525cfefc38..b15b8cbe64 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -68,7 +68,6 @@ class Settings : public device::Settings { uint numWaitEvents_; //!< The number of wait events for device enqueue size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - size_t stagedXferSize_; //!< Staged buffer size size_t pinnedXferSize_; //!< Pinned buffer size for transfer size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 4f0a93348b..ea3716b4fb 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1065,7 +1065,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - if (!skipSignal) { + if (!skipSignal && (signal.handle == 0)) { // Get active signal for current dispatch if profiling is necessary barrier_packet_.completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); @@ -1188,9 +1188,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD // ================================================================================================ void VirtualGPU::ResetQueueStates() { - // Release all transfer buffers on this command queue - releaseXferWrite(); - // Release all memory dependencies memoryDependency().clear(); @@ -1234,6 +1231,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, schedulerSignal_({0}), barriers_(*this), kernarg_pool_signal_(KernelArgPoolNumSignal), + managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), cuMask_(cuMask), priority_(priority), copy_command_type_(0), @@ -1385,9 +1383,77 @@ bool VirtualGPU::create() { LogError("Could not create signal for copy queue!"); return false; } + // Create managed buffer for staging copies + if (!managed_buffer_.Create()) { + LogError("Could not create managed buffer for this queue!"); + return false; + } return true; } +// ================================================================================================ +VirtualGPU::ManagedBuffer::~ManagedBuffer() { + for (auto& it : pool_signal_) { + if (it.handle != 0) { + hsa_signal_destroy(it); + } + } + if (pool_base_ != nullptr) { + gpu_.dev().hostFree(pool_base_, pool_size_); + } +} + +// ================================================================================================ +bool VirtualGPU::ManagedBuffer::Create() { + pool_chunk_end_ = pool_size_ / kPoolNumSignals; + active_chunk_ = 0; + // Allocate memory for managed buffer + pool_base_ = reinterpret_cast
( + gpu_.dev().hostAlloc(pool_size_, 0, Device::MemorySegment::kNoAtomics)); + if (pool_base_ == nullptr) { + return false; + } + hsa_agent_t agent = gpu_.dev().getBackendDevice(); + for (auto& it : pool_signal_) { + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &it)) { + return false; + } + } + return true; +} + +// ================================================================================================ +address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) { + auto alignment = gpu_.dev().info().globalMemCacheLineSize_; + address result = nullptr; + result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment); + const size_t pool_new_usage = (result + size) - pool_base_; + if (pool_new_usage <= pool_chunk_end_) { + pool_cur_offset_ = pool_new_usage; + return result; + } else { + // Reset the signal for the barrier packet + hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne); + // Currently don't skip wait signal check, because SDMA engine cna be used in staging copy + constexpr bool kSkipSignal = false; + // Dispatch a barrier packet into the queue + gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]); + // Get the next chunk + active_chunk_ = ++active_chunk_ % kPoolNumSignals; + // Make sure the new active chunk is free + bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait()); + assert(test && "Runtime can't fail a wait for chunk!"); + // Make sure the current offset matches the new chunk to avoid possible overlaps + // between chunks and issues during recycle + pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_; + pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals; + result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment); + pool_cur_offset_ = (result + size) - pool_base_; + } + + return result; +} + // ================================================================================================ bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; @@ -3562,28 +3628,6 @@ void VirtualGPU::flush(amd::Command* list, bool wait) { releasePinnedMem(); } -// ================================================================================================ -void VirtualGPU::addXferWrite(Memory& memory) { - //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait - //! unconditionally, before it can release pinned memory - releaseGpuMemoryFence(); - if (xferWriteBuffers_.size() > 7) { - dev().xferWrite().release(*this, *xferWriteBuffers_.front()); - xferWriteBuffers_.erase(xferWriteBuffers_.begin()); - } - - // Delay destruction - xferWriteBuffers_.push_back(&memory); -} - -// ================================================================================================ -void VirtualGPU::releaseXferWrite() { - for (auto& memory : xferWriteBuffers_) { - dev().xferWrite().release(*this, *memory); - } - xferWriteBuffers_.resize(0); -} - // ================================================================================================ void VirtualGPU::addPinnedMem(amd::Memory* mem) { //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 35b0fe7b05..3aeebe02b4 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -185,6 +185,31 @@ class Timestamp : public amd::ReferenceCountedObject { class VirtualGPU : public device::VirtualDevice { public: + class ManagedBuffer : public amd::EmbeddedObject { + public: + //! The number of chunks the arg pool will be divided + static constexpr uint32_t kPoolNumSignals = 4; + ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size) + : gpu_(gpu) + , pool_size_(pool_size) + , pool_signal_(kPoolNumSignals) {} + ~ManagedBuffer(); + + //! Allocates all necessary resources to manage memory + bool Create(); + + //! Acquires memory for use on the gpu + address Acquire(uint32_t size); + + private: + VirtualGPU& gpu_; //!< Queue object for ROCm device + address pool_base_ = nullptr; //!< Memory pool base address + uint32_t pool_size_; //!< Memory pool base size + uint32_t pool_chunk_end_ = 0; //!< The end offset of the current chunk + uint32_t active_chunk_ = 0; //!< The index of the current active chunk + uint32_t pool_cur_offset_ = 0; //!< Current active offset for update + std::vector pool_signal_; //!< Pool of HSA signals to manage multiple chunks + }; class MemoryDependency : public amd::EmbeddedObject { public: //! Default constructor @@ -386,11 +411,8 @@ class VirtualGPU : public device::VirtualDevice { std::vector& wrtBackImageBuffer //!< Images for writeback ); - //! Adds a stage write buffer into a list - void addXferWrite(Memory& memory); - - //! Releases stage write buffers - void releaseXferWrite(); + //! Returns a managed buffer for staging copies + ManagedBuffer& Staging() { return managed_buffer_; } //! Adds a pinned memory object into a map void addPinnedMem(amd::Memory* mem); @@ -422,6 +444,7 @@ class VirtualGPU : public device::VirtualDevice { void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } + // } roc OpenCL integration private: //! Dispatches a barrier with blocking HSA signals @@ -437,10 +460,10 @@ class VirtualGPU : public device::VirtualDevice { template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking); - void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, - hsa_signal_t signal = hsa_signal_t{0}); bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); + void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, + hsa_signal_t signal = hsa_signal_t{0}); void dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal = false, hsa_signal_t signal = hsa_signal_t{0}, @@ -499,7 +522,6 @@ class VirtualGPU : public device::VirtualDevice { //! Resets the current queue state. Note: should be called after AQL queue becomes idle void ResetQueueStates(); - std::vector xferWriteBuffers_; //!< Stage write buffers std::vector pinnedMems_; //!< Pinned memory list //! Queue state flags @@ -549,6 +571,8 @@ class VirtualGPU : public device::VirtualDevice { std::vector kernarg_pool_signal_; //!< Pool of HSA signals to manage //!< multiple chunks + ManagedBuffer managed_buffer_; //!< Memory manager for staging copies + friend class Timestamp; // PM4 packet for gfx8 performance counter