diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 1471cc4187..2ccd922244 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -30,7 +30,9 @@ namespace amd::roc { DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) : HostBlitManager(gpu, setup), - MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), + MinSizeForPinnedXfer(dev().settings().pinnedMinXferSize_), + PinXferSize(dev().settings().pinnedXferSize_), + StagingXferSize(dev().settings().stagedXferSize_), completeOperation_(false), context_(nullptr) { dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_); @@ -57,20 +59,19 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata); - } else { - size_t copySize = size[0]; - - if (0 != copySize) { - const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0]; - address addrDst = reinterpret_cast
(dstHost); - constexpr bool kHostToDev = false; - if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) { - LogError("DmaBlitManager::readBuffer staged copy failed!"); - return false; - } - } } + size_t copySize = size[0]; + if (copySize > 0) { + const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0]; + address addrDst = reinterpret_cast
(dstHost); + constexpr bool kHostToDev = false; + constexpr bool kEnablePin = true; + if (!hsaCopyStagedOrPinned(addrSrc, addrDst, copySize, kHostToDev, copyMetadata, kEnablePin)) { + LogError("DmaBlitManager:: readBuffer copy failure!"); + return false; + } + } return true; } @@ -100,7 +101,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, // Copy data from device to host - line by line address dst = reinterpret_cast
(dstHost) + dstOffset; - bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], + bool retval = hsaCopyStagedOrPinned(src + srcOffset, dst, size[0], false, copyMetadata); if (!retval) { return retval; @@ -142,23 +143,18 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); - } else { - size_t copySize = size[0]; - - // For small copies use managed staging buffers which can be non blocking - if (copySize != 0) { - address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0]; - const_address srcAddr = reinterpret_cast(srcHost); - // Write memory using a staging resource - constexpr bool kHostToDev = true; - bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata); - if (!result) { - LogError("DmaBlitManager::writeBuffer staging copy failed!"); - return false; - } + } + size_t copySize = size[0]; + if (copySize > 0) { + address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0]; + const_address srcAddr = reinterpret_cast(srcHost); + constexpr bool kHostToDev = true; + constexpr bool enablePin = true; + if (!hsaCopyStagedOrPinned(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata, enablePin)) { + LogError("DmaBlitManager:: writeBuffer copy failure!"); + return false; } } - return true; } @@ -188,7 +184,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem // Copy data from host to device - line by line const_address src = reinterpret_cast(srcHost) + srcOffset; constexpr bool kHostToDev = true; - bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata); + bool retval = hsaCopyStagedOrPinned(src, dst + dstOffset, size[0], kHostToDev, + copyMetadata); if (!retval) { return retval; } @@ -503,6 +500,9 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) { copyMask = gpu().getLastUsedSdmaEngine(); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask); + copyMask &= (engine == HwQueueEngine::SdmaRead ? + sdmaEngineReadMask_ : sdmaEngineWriteMask_); if (copyMask == 0) { // Check SDMA engine status status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask); @@ -584,71 +584,117 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata); } + // ================================================================================================ -bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size, - bool hostToDev, amd::CopyMetadata& copyMetadata) const { - gpu().releaseGpuMemoryFence(kSkipCpuWait); +// Get Staging or Pinned memory buffer +void DmaBlitManager::getBuffer(const_address hostMem, size_t size, + bool enablePin, bool first_tx, + DmaBlitManager::BufferState &buffState) const { + bool doHostPinning = enablePin && ( size > MinSizeForPinnedXfer); + size_t copyChunkSize = doHostPinning ? PinXferSize : StagingXferSize; + size_t xferSize = std::min(size, copyChunkSize); - size_t totalSize = size; - size_t stagedCopyOffset = 0; - bool status = true; - Memory* xferBuf = nullptr; - address stagingBuffer = 0; - size_t maxStagedXferSize = dev().settings().stagedXferSize_; - - if (!hostToDev) { - // Get static staging buffer as we need to wait until copy on GPU completes to copy - // it back to the unpinned buffer - xferBuf = &dev().xferRead().acquire(); - stagingBuffer = xferBuf->getDeviceMemory(); + if (doHostPinning) { // Pin host Memory + char* alignedHost = reinterpret_cast(const_cast(hostMem)); + size_t partial1 = 0; + size_t partial2 = 0; + if (xferSize > PinXferSize && first_tx) { + //Align to 4K boundary + alignedHost = const_cast(amd::alignDown(reinterpret_cast(hostMem), + PinnedMemoryAlignment)); + // Find partial size of unaligned copy + partial2 = reinterpret_cast(hostMem) - alignedHost; + size_t tmpSize = amd::alignUp(PinXferSize + partial2, PinnedMemoryAlignment); + xferSize = std::min(tmpSize - partial2, size); + } + amd::Memory* pinnedMem = pinHostMemory(alignedHost, xferSize, partial1); + if (pinnedMem != nullptr) { + Memory* pinnedMemory = dev().getRocMemory(pinnedMem); + address pinBuffer = pinnedMemory->getDeviceMemory(); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", xferSize); + buffState.copySize_ = xferSize; + buffState.buffer_ = pinBuffer + partial1 + partial2; + buffState.pinnedMem_ = pinnedMem; + return; + } + LogWarning("DmaBlitManager::getBuffer failed to pin a resource!"); } + // If Memory Pinning fails, failback to staging buffer + xferSize = std::min(xferSize, StagingXferSize); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", xferSize); + buffState.copySize_ = xferSize; + buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize)); +} - // Allocate requested size of memory - while (totalSize > 0) { - size = std::min(totalSize, maxStagedXferSize); +void DmaBlitManager::releaseBuffer(BufferState &buffer) const { + if (buffer.pinnedMem_) { + gpu().addPinnedMem(buffer.pinnedMem_); + } +} - hsa_agent_t srcAgent; - hsa_agent_t dstAgent; +// ================================================================================================ +bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDst, + size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata, + bool enablePin) const { + gpu().releaseGpuMemoryFence(kSkipCpuWait); + // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer + // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning + bool status = true; + size_t copyOffset = 0; + size_t totalSize = size; - // Copy data from Host to Device - if (hostToDev) { - hsa_agent_t srcAgent = dev().getCpuAgent(); - hsa_agent_t dstAgent = dev().getBackendDevice(); - - // Get an address from managed staging buffer - stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize)); - - address dst = hostDst + stagedCopyOffset; - memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size); + // Staging Buffer or Pinned Host Memory + address stagingBuffer = 0; + // src and dst agent for rocr + hsa_agent_t srcAgent = hostToDev ? dev().getCpuAgent() : dev().getBackendDevice(); + hsa_agent_t dstAgent = hostToDev ? dev().getBackendDevice() : dev().getCpuAgent(); + bool firstTx = true; + while(totalSize > 0) { + size_t outsize = totalSize; + const_address hostmem = hostToDev ? hostSrc : hostDst; + // Get Pinned Host Memory or Staging buffer based on copy size + BufferState buffer{0}; + getBuffer(static_cast(hostmem + copyOffset), outsize, + enablePin, firstTx, buffer); + size_t copysize = buffer.copySize_; + address stagingBuffer = buffer.buffer_; + if (stagingBuffer == 0) { + LogWarning("DmaBlitManager::hsaCopyStagedOrPinned Buffer creation failed!"); + status = false; + break; + } + if (hostToDev) { // H2D Path + if (buffer.pinnedMem_ == nullptr) { // Copy to Staging Buffer + memcpy(stagingBuffer, hostSrc + copyOffset, copysize); + } ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D"); - status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata); + address dst = hostDst + copyOffset; + status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, copysize, copyMetadata); if (!status) { break; } - } else { - dstAgent = dev().getCpuAgent(); - srcAgent = dev().getBackendDevice(); - - const_address src = static_cast(hostSrc) + stagedCopyOffset; + } else { // D2H Path ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H"); - status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata); - if (status) { - gpu().Barriers().WaitCurrent(); - memcpy(hostDst + stagedCopyOffset, stagingBuffer, size); + const_address src = static_cast(hostSrc) + copyOffset; + status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata); + if (status ) { + if (buffer.pinnedMem_ == nullptr) { // Blocking copy from Staging Buffer + gpu().Barriers().WaitCurrent(); + memcpy(hostDst + copyOffset, stagingBuffer, copysize); + } } else { break; } } - - totalSize -= size; - stagedCopyOffset += size; + // Release Pinned Memory back to pool + releaseBuffer(buffer); + // Update Offset and Transfer Size + copyOffset += copysize; + totalSize -= copysize; + firstTx = false; } - if (!hostToDev) { - dev().xferRead().release(gpu(), *xferBuf); - } - - if (!status) { + if(!status) { return false; } @@ -656,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ return true; } - // ================================================================================================ KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), @@ -1617,87 +1662,61 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, synchronize(); return result; } else { - size_t totalSize = size[0]; ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned read path, Async = %d", copyMetadata.isAsync_); - // Check if a pinned transfer can be executed with a single pin - if (((totalSize <= dev().settings().pinnedXferSize_) && - (totalSize > MinSizeForPinnedTransfer))) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial); + size_t totalSize = size[0]; + // Do a staging copy + bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || + (totalSize <= dev().settings().sdmaCopyThreshold_) || + (copyMetadata.copyEnginePreference_ == + amd::CopyMetadata::CopyEnginePreference::BLIT); - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, - copyMetadata); - synchronize(); - return result; - } + if (!useShaderCopyPath) { + // HSA copy using a staging resource + result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, + entire, copyMetadata); + } + if (!result) { + // Blit copy using a staging resource + address srcAddr = gpuMem(srcMemory).getDeviceMemory(); + address dstAddr = reinterpret_cast
(dstHost); + amd::Coord3D dstOrigin(0, 0, 0); + size_t copySize = 0; + size_t stagedCopyOffset = 0; + size_t maxStagedXferSize = dev().settings().stagedXferSize_; + Memory& xferBuf = dev().xferRead().acquire(); + address xferBufAddr = xferBuf.getDeviceMemory(); - // Readjust host mem offset - amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getRocMemory(amdMemory); - - // Copy image to buffer - result = copyBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire, copyMetadata); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } else { - // Do a staging copy - bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || - (totalSize <= dev().settings().sdmaCopyThreshold_) || - (copyMetadata.copyEnginePreference_ == - amd::CopyMetadata::CopyEnginePreference::BLIT); - - if (!useShaderCopyPath) { - // HSA copy using a staging resource - result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, - entire, copyMetadata); - } - if (!result) { - // Blit copy using a staging resource - address srcAddr = gpuMem(srcMemory).getDeviceMemory(); - address dstAddr = reinterpret_cast
(dstHost); - amd::Coord3D dstOrigin(0, 0, 0); - size_t copySize = 0; - size_t stagedCopyOffset = 0; - size_t maxStagedXferSize = dev().settings().stagedXferSize_; - Memory& xferBuf = dev().xferRead().acquire(); - address xferBufAddr = xferBuf.getDeviceMemory(); - - constexpr bool kAttachSignal = true; - while (totalSize > 0) { - copySize = std::min(totalSize, maxStagedXferSize); - srcAddr += stagedCopyOffset; - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, " - "dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); - // Flush caches for coherency after the copy as we need to std::memcpy - // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet - // itself that we can wait on without extra barrier packet. - gpu().addSystemScope(); - result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize, - entire, dev().settings().limit_blit_wg_, copyMetadata, - kAttachSignal); - if (!result) { - break; - } - // Wait on current signal of previous blit copy - gpu().Barriers().WaitCurrent(); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", - (void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize); - memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize); - totalSize -= copySize; - stagedCopyOffset += copySize; + constexpr bool kAttachSignal = true; + while (totalSize > 0) { + copySize = std::min(totalSize, maxStagedXferSize); + srcAddr += stagedCopyOffset; + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, " + "dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); + // Flush caches for coherency after the copy as we need to std::memcpy + // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet + // itself that we can wait on without extra barrier packet. + gpu().addSystemScope(); + result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize, + entire, dev().settings().limit_blit_wg_, copyMetadata, + kAttachSignal); + if (!result) { + break; } - - dev().xferRead().release(gpu(), xferBuf); + // Wait on current signal of previous blit copy + gpu().Barriers().WaitCurrent(); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", + (void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize); + memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize); + totalSize -= copySize; + stagedCopyOffset += copySize; } + + dev().xferRead().release(gpu(), xferBuf); } } + synchronize(); return result; @@ -1773,79 +1792,50 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo synchronize(); return result; } else { - size_t totalSize = size[0]; ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path, Async = %d", copyMetadata.isAsync_); - // If size > min pinned size, do a pinning copy, since we are limited by staging buffer size - // Check if a pinned transfer can be executed with a single pin - if ((totalSize <= dev().settings().pinnedXferSize_) && - (totalSize > MinSizeForPinnedTransfer)) { - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%ld", totalSize); - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial); + size_t totalSize = size[0]; + // Do a staging copy + bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || + (totalSize <= dev().settings().sdmaCopyThreshold_) || + (copyMetadata.copyEnginePreference_ == + amd::CopyMetadata::CopyEnginePreference::BLIT); - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, - size, entire, copyMetadata); - synchronize(); - return result; - } + if (!useShaderCopyPath) { + // HSA copy using a staging resource + result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, + size, entire, copyMetadata); + } - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); + if (!result) { + // Blit copy using a staging resource + address dstAddr = gpuMem(dstMemory).getDeviceMemory(); + const_address srcAddr = reinterpret_cast(srcHost); + amd::Coord3D srcOrigin(0, 0, 0); + size_t copySize = 0; + size_t stagedCopyOffset = 0; + size_t maxStagedXferSize = dev().settings().stagedXferSize_; - // Get device memory for this virtual device - Memory* srcMemory = dev().getRocMemory(amdMemory); - - // Copy buffer - result = copyBuffer(*srcMemory, dstMemory, srcOrigin, origin, size, entire, copyMetadata); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } else { - // Do a staging copy - bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || - (totalSize <= dev().settings().sdmaCopyThreshold_) || - (copyMetadata.copyEnginePreference_ == - amd::CopyMetadata::CopyEnginePreference::BLIT); - - if (!useShaderCopyPath) { - // HSA copy using a staging resource - result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, - size, entire, copyMetadata); - } - - if (!result) { - // Blit copy using a staging resource - address dstAddr = gpuMem(dstMemory).getDeviceMemory(); - const_address srcAddr = reinterpret_cast(srcHost); - amd::Coord3D srcOrigin(0, 0, 0); - size_t copySize = 0; - size_t stagedCopyOffset = 0; - size_t maxStagedXferSize = dev().settings().stagedXferSize_; - - while (totalSize > 0) { - copySize = std::min(totalSize, maxStagedXferSize); - // Get an address from managed staging buffer - address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize)); - dstAddr += stagedCopyOffset; - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", - stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize); - memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, " - "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); - // No cache flush is needed here as we use a staging buffer, and the acquire logic - // ensures that the cacheline is different and re-used only when L2 is flushed - result = shaderCopyBuffer(dstAddr, stagingBuffer, - origin, srcOrigin, copySize, - entire, dev().settings().limit_blit_wg_, copyMetadata); - if (!result) { - break; - } - totalSize -= copySize; - stagedCopyOffset += copySize; + while (totalSize > 0) { + copySize = std::min(totalSize, maxStagedXferSize); + // Get an address from managed staging buffer + address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize)); + dstAddr += stagedCopyOffset; + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", + stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize); + memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, " + "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); + // No cache flush is needed here as we use a staging buffer, and the acquire logic + // ensures that the cacheline is different and re-used only when L2 is flushed + result = shaderCopyBuffer(dstAddr, stagingBuffer, + origin, srcOrigin, copySize, + entire, dev().settings().limit_blit_wg_, copyMetadata); + if (!result) { + break; } + totalSize -= copySize; + stagedCopyOffset += copySize; } } } diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index 4d9e295f38..d2bad22d20 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -208,7 +208,6 @@ class DmaBlitManager : public device::HostBlitManager { } protected: - static constexpr uint MaxPinnedBuffers = 4; //! Synchronizes the blit operations if necessary inline void synchronize() const; @@ -237,7 +236,10 @@ class DmaBlitManager : public device::HostBlitManager { const_address src, hsa_agent_t& srcAgent, size_t size, amd::CopyMetadata& copyMetadata) const; - const size_t MinSizeForPinnedTransfer; + const size_t PinXferSize; //!< Copy size for Pinned Copy + const size_t MinSizeForPinnedXfer; //!< Mininum copy size for Pinned Copy + const size_t StagingXferSize; //!< Copy size for Staging Buffer Copy + bool completeOperation_; //!< DMA blit manager must complete operation amd::Context* context_; //!< A dummy context uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask @@ -250,14 +252,30 @@ class DmaBlitManager : public device::HostBlitManager { //! Disable operator= DmaBlitManager& operator=(const DmaBlitManager&); - //! Assits in transferring data from Host to Local or vice versa - //! taking into account the Hsail profile supported by Hsa Agent - bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied - address hostDst, //!< Destination buffer address for copying - size_t size, //!< Size of data to copy in bytes - bool hostToDev, //!< True if data is copied from H2D - amd::CopyMetadata& copyMetadata //!< Memory copy MetaData - ) const; + bool hsaCopyStagedOrPinned(const_address hostSrc, //!< Src buffer address + address hostDst, //!< Dst Buffer address + size_t size, //!< Size of copy data in bytes + bool hostToDev, //!< True for H2D copy + amd::CopyMetadata& copyMetadata, //!< copy MetaData + bool enPinning = false //!< True if pinning required + ) const; + struct BufferState{ + address buffer_; //!< Staging Buffer or Pinned Host Mem Address + amd::Memory* pinnedMem_; //!< Pinned Memory + size_t copySize_; //!< last copy size + }; + + // Get Pinned Host Memory or Staging Buffer + void getBuffer(const_address hostMem, //!< Host Mem Address + size_t size, //!< Transfer Size + bool enablePin, //!< True when Pinning is enabled + bool first_tx, //!< True for first copy + BufferState &buffer //!< State of Buffer + ) const; + + // Release Pinned host memory + void releaseBuffer(BufferState &buff //!< True if last copy used Pinned resource + ) const; }; //! Kernel Blit Manager diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp index cd2006fedb..fe3fab3ed2 100644 --- a/rocclr/device/rocm/rocsettings.cpp +++ b/rocclr/device/rocm/rocsettings.cpp @@ -117,7 +117,6 @@ bool Settings::create(bool fullProfile, const amd::Isa& isa, apuSystem_ = true; } else { pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_); - stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki); } enableXNACK_ = enableXNACK; hsailExplicitXnack_ = enableXNACK; @@ -209,10 +208,6 @@ void Settings::override() { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } - if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) { - pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); - } - if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { case 0: