SWDEV-510849 - Restore pinned memory copy path

1) Add a getBuffer method that returns either pinned host memory or a staging buffer
2) For the D2H path, use the managed staging buffer instead of the static buffer
3) Use staging buffer copies for 16KB < size <= 1MB
4) Use pinned memory copies for size > 1MB

Change-Id: I13d4d6ab60691bc6c7724239db1e11e23f0f3dc2
This commit is contained in:
Rahul Manocha
2025-01-21 16:31:24 -08:00
committed by Rahul Manocha
parent 187648429b
commit 4bf634dfca
3 changed files with 236 additions and 233 deletions
+208 -218
View File
@@ -30,7 +30,9 @@
namespace amd::roc {
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
: HostBlitManager(gpu, setup),
MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_),
MinSizeForPinnedXfer(dev().settings().pinnedMinXferSize_),
PinXferSize(dev().settings().pinnedXferSize_),
StagingXferSize(dev().settings().stagedXferSize_),
completeOperation_(false),
context_(nullptr) {
dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
@@ -57,20 +59,19 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
} else {
size_t copySize = size[0];
if (0 != copySize) {
const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
address addrDst = reinterpret_cast<address>(dstHost);
constexpr bool kHostToDev = false;
if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) {
LogError("DmaBlitManager::readBuffer staged copy failed!");
return false;
}
}
}
size_t copySize = size[0];
if (copySize > 0) {
const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
address addrDst = reinterpret_cast<address>(dstHost);
constexpr bool kHostToDev = false;
constexpr bool kEnablePin = true;
if (!hsaCopyStagedOrPinned(addrSrc, addrDst, copySize, kHostToDev, copyMetadata, kEnablePin)) {
LogError("DmaBlitManager:: readBuffer copy failure!");
return false;
}
}
return true;
}
@@ -100,7 +101,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
// Copy data from device to host - line by line
address dst = reinterpret_cast<address>(dstHost) + dstOffset;
bool retval = hsaCopyStaged(src + srcOffset, dst, size[0],
bool retval = hsaCopyStagedOrPinned(src + srcOffset, dst, size[0],
false, copyMetadata);
if (!retval) {
return retval;
@@ -142,23 +143,18 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
} else {
size_t copySize = size[0];
// For small copies use managed staging buffers which can be non blocking
if (copySize != 0) {
address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
// Write memory using a staging resource
constexpr bool kHostToDev = true;
bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata);
if (!result) {
LogError("DmaBlitManager::writeBuffer staging copy failed!");
return false;
}
}
size_t copySize = size[0];
if (copySize > 0) {
address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
constexpr bool kHostToDev = true;
constexpr bool enablePin = true;
if (!hsaCopyStagedOrPinned(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata, enablePin)) {
LogError("DmaBlitManager:: writeBuffer copy failure!");
return false;
}
}
return true;
}
@@ -188,7 +184,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
// Copy data from host to device - line by line
const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
constexpr bool kHostToDev = true;
bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata);
bool retval = hsaCopyStagedOrPinned(src, dst + dstOffset, size[0], kHostToDev,
copyMetadata);
if (!retval) {
return retval;
}
@@ -503,6 +500,9 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
copyMask = gpu().getLastUsedSdmaEngine();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
copyMask &= (engine == HwQueueEngine::SdmaRead ?
sdmaEngineReadMask_ : sdmaEngineWriteMask_);
if (copyMask == 0) {
// Check SDMA engine status
status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
@@ -584,71 +584,117 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata);
}
// ================================================================================================
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
bool hostToDev, amd::CopyMetadata& copyMetadata) const {
gpu().releaseGpuMemoryFence(kSkipCpuWait);
// Selects the host-side buffer for one chunk of a host<->device copy: either
// the caller's host memory pinned in place, or a managed staging buffer.
//
// hostMem   - host address of the chunk to transfer
// size      - number of bytes the caller still has to transfer; the size of the
//             chunk actually covered is reported through buffState.copySize_
// enablePin - allows pinning; pinning is attempted only when size also exceeds
//             MinSizeForPinnedXfer
// first_tx  - true for the first chunk of a transfer
// buffState - output: copySize_ and buffer_ are always written; pinnedMem_ is
//             written only when pinning succeeds and is left untouched on the
//             staging path
void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
bool enablePin, bool first_tx,
DmaBlitManager::BufferState &buffState) const {
// Pin only when the caller allows it and the size is strictly above the threshold
bool doHostPinning = enablePin && ( size > MinSizeForPinnedXfer);
// Per-chunk limit depends on the chosen path
size_t copyChunkSize = doHostPinning ? PinXferSize : StagingXferSize;
size_t xferSize = std::min(size, copyChunkSize);
size_t totalSize = size;
size_t stagedCopyOffset = 0;
bool status = true;
Memory* xferBuf = nullptr;
address stagingBuffer = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
if (!hostToDev) {
// Get static staging buffer as we need to wait until copy on GPU completes to copy
// it back to the unpinned buffer
xferBuf = &dev().xferRead().acquire();
stagingBuffer = xferBuf->getDeviceMemory();
if (doHostPinning) { // Pin host Memory
char* alignedHost = reinterpret_cast<char *>(const_cast<unsigned char *>(hostMem));
size_t partial1 = 0;
size_t partial2 = 0;
// NOTE(review): xferSize was clamped to copyChunkSize above, and copyChunkSize
// equals PinXferSize whenever doHostPinning is true, so this condition looks
// like it can never hold and the alignment adjustment below appears
// unreachable - confirm whether `size > PinXferSize` was intended.
if (xferSize > PinXferSize && first_tx) {
//Align to 4K boundary
alignedHost = const_cast<char *>(amd::alignDown(reinterpret_cast<const char*>(hostMem),
PinnedMemoryAlignment));
// Find partial size of unaligned copy
partial2 = reinterpret_cast<const char*>(hostMem) - alignedHost;
size_t tmpSize = amd::alignUp(PinXferSize + partial2, PinnedMemoryAlignment);
xferSize = std::min(tmpSize - partial2, size);
}
// partial1 is filled in by pinHostMemory; presumably it is the offset of
// alignedHost inside the pinned allocation - TODO confirm
amd::Memory* pinnedMem = pinHostMemory(alignedHost, xferSize, partial1);
if (pinnedMem != nullptr) {
Memory* pinnedMemory = dev().getRocMemory(pinnedMem);
address pinBuffer = pinnedMemory->getDeviceMemory();
// NOTE(review): xferSize is a size_t but is printed with %d; other size_t
// log arguments in this file use %zu - confirm the specifier.
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", xferSize);
buffState.copySize_ = xferSize;
// Skip both offsets so buffer_ refers to the byte that corresponds to hostMem
buffState.buffer_ = pinBuffer + partial1 + partial2;
buffState.pinnedMem_ = pinnedMem;
return;
}
LogWarning("DmaBlitManager::getBuffer failed to pin a resource!");
}
// Staging path: taken when pinning was not attempted, and also as the
// fallback when pinning failed. The chunk is re-clamped because a failed pin
// may have left xferSize larger than StagingXferSize.
xferSize = std::min(xferSize, StagingXferSize);
// NOTE(review): same %d vs size_t mismatch as in the pinned log message above.
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", xferSize);
buffState.copySize_ = xferSize;
// NOTE(review): xferSize is already clamped to StagingXferSize two statements
// earlier, so the std::min here looks redundant.
buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
}
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, maxStagedXferSize);
// Passes the pinned host memory recorded in the buffer state, if any, to
// gpu().addPinnedMem(). A state without pinned memory (staging path) is a no-op.
void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
  amd::Memory* const pinned = buffer.pinnedMem_;
  if (pinned == nullptr) {
    return;
  }
  gpu().addPinnedMem(pinned);
}
hsa_agent_t srcAgent;
hsa_agent_t dstAgent;
// ================================================================================================
// Copies `size` bytes between host and device memory in chunks. Every chunk
// goes through the buffer chosen by getBuffer(): pinned host memory, which
// rocrCopyBuffer reads or writes directly, or a staging buffer, which needs an
// extra host-side memcpy. When hostToDev is true the host side is hostSrc and
// the device side is hostDst; otherwise the host side is hostDst.
// Returns false when no buffer could be obtained or a chunk copy failed.
bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDst,
size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
bool enablePin) const {
gpu().releaseGpuMemoryFence(kSkipCpuWait);
// If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedXfer
// For 16KB < size <= MinSizeForPinnedXfer Use staging buffer without pinning
bool status = true;
size_t copyOffset = 0;
size_t totalSize = size;
// Copy data from Host to Device
if (hostToDev) {
hsa_agent_t srcAgent = dev().getCpuAgent();
hsa_agent_t dstAgent = dev().getBackendDevice();
// Get an address from managed staging buffer
stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize));
address dst = hostDst + stagedCopyOffset;
memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size);
// Staging Buffer or Pinned Host Memory
// NOTE(review): this outer variable is shadowed by the per-chunk
// `stagingBuffer` declared inside the loop - confirm it can be dropped.
address stagingBuffer = 0;
// src and dst agent for rocr
hsa_agent_t srcAgent = hostToDev ? dev().getCpuAgent() : dev().getBackendDevice();
hsa_agent_t dstAgent = hostToDev ? dev().getBackendDevice() : dev().getCpuAgent();
bool firstTx = true;
while(totalSize > 0) {
// getBuffer receives the remaining byte count and reports the chunk size
// it settled on through buffer.copySize_
size_t outsize = totalSize;
const_address hostmem = hostToDev ? hostSrc : hostDst;
// Get Pinned Host Memory or Staging buffer based on copy size
BufferState buffer{0};
getBuffer(static_cast<const_address>(hostmem + copyOffset), outsize,
enablePin, firstTx, buffer);
size_t copysize = buffer.copySize_;
address stagingBuffer = buffer.buffer_;
if (stagingBuffer == 0) {
LogWarning("DmaBlitManager::hsaCopyStagedOrPinned Buffer creation failed!");
status = false;
break;
}
if (hostToDev) { // H2D Path
if (buffer.pinnedMem_ == nullptr) { // Copy to Staging Buffer
memcpy(stagingBuffer, hostSrc + copyOffset, copysize);
}
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D");
status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata);
address dst = hostDst + copyOffset;
status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, copysize, copyMetadata);
// NOTE(review): breaking out here skips releaseBuffer(buffer) further down,
// so memory pinned by getBuffer() for this chunk is not passed to
// gpu().addPinnedMem() when the copy fails - confirm this is not a leak.
if (!status) {
break;
}
} else {
dstAgent = dev().getCpuAgent();
srcAgent = dev().getBackendDevice();
const_address src = static_cast<const_address>(hostSrc) + stagedCopyOffset;
} else { // D2H Path
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H");
status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata);
if (status) {
gpu().Barriers().WaitCurrent();
memcpy(hostDst + stagedCopyOffset, stagingBuffer, size);
const_address src = static_cast<const_address>(hostSrc) + copyOffset;
status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
if (status ) {
// Only the staging path waits and copies on the host; in the pinned case
// the buffer was derived from hostDst itself, so no memcpy is issued here
if (buffer.pinnedMem_ == nullptr) { // Blocking copy from Staging Buffer
gpu().Barriers().WaitCurrent();
memcpy(hostDst + copyOffset, stagingBuffer, copysize);
}
} else {
// NOTE(review): as on the H2D path, this break bypasses releaseBuffer(buffer)
// for a pinned chunk - confirm.
break;
}
}
totalSize -= size;
stagedCopyOffset += size;
// Release Pinned Memory back to pool
releaseBuffer(buffer);
// Update Offset and Transfer Size
copyOffset += copysize;
totalSize -= copysize;
firstTx = false;
}
if (!hostToDev) {
dev().xferRead().release(gpu(), *xferBuf);
}
if (!status) {
if(!status) {
return false;
}
@@ -656,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
return true;
}
// ================================================================================================
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
@@ -1617,87 +1662,61 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
synchronize();
return result;
} else {
size_t totalSize = size[0];
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned read path, Async = %d",
copyMetadata.isAsync_);
// Check if a pinned transfer can be executed with a single pin
if (((totalSize <= dev().settings().pinnedXferSize_) &&
(totalSize > MinSizeForPinnedTransfer))) {
size_t partial;
amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial);
size_t totalSize = size[0];
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (amdMemory == nullptr) {
// Force SW copy
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire,
copyMetadata);
synchronize();
return result;
}
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
entire, copyMetadata);
}
if (!result) {
// Blit copy using a staging resource
address srcAddr = gpuMem(srcMemory).getDeviceMemory();
address dstAddr = reinterpret_cast<address>(dstHost);
amd::Coord3D dstOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
Memory& xferBuf = dev().xferRead().acquire();
address xferBufAddr = xferBuf.getDeviceMemory();
// Readjust host mem offset
amd::Coord3D dstOrigin(partial);
// Get device memory for this virtual device
Memory* dstMemory = dev().getRocMemory(amdMemory);
// Copy image to buffer
result = copyBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire, copyMetadata);
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
} else {
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
entire, copyMetadata);
}
if (!result) {
// Blit copy using a staging resource
address srcAddr = gpuMem(srcMemory).getDeviceMemory();
address dstAddr = reinterpret_cast<address>(dstHost);
amd::Coord3D dstOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
Memory& xferBuf = dev().xferRead().acquire();
address xferBufAddr = xferBuf.getDeviceMemory();
constexpr bool kAttachSignal = true;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
srcAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
"dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
// Flush caches for coherency after the copy as we need to std::memcpy
// from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
// itself that we can wait on without extra barrier packet.
gpu().addSystemScope();
result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata,
kAttachSignal);
if (!result) {
break;
}
// Wait on current signal of previous blit copy
gpu().Barriers().WaitCurrent();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
(void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
totalSize -= copySize;
stagedCopyOffset += copySize;
constexpr bool kAttachSignal = true;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
srcAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
"dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
// Flush caches for coherency after the copy as we need to std::memcpy
// from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
// itself that we can wait on without extra barrier packet.
gpu().addSystemScope();
result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata,
kAttachSignal);
if (!result) {
break;
}
dev().xferRead().release(gpu(), xferBuf);
// Wait on current signal of previous blit copy
gpu().Barriers().WaitCurrent();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
(void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
totalSize -= copySize;
stagedCopyOffset += copySize;
}
dev().xferRead().release(gpu(), xferBuf);
}
}
synchronize();
return result;
@@ -1773,79 +1792,50 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
synchronize();
return result;
} else {
size_t totalSize = size[0];
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path, Async = %d",
copyMetadata.isAsync_);
// If size > min pinned size, do a pinning copy, since we are limited by staging buffer size
// Check if a pinned transfer can be executed with a single pin
if ((totalSize <= dev().settings().pinnedXferSize_) &&
(totalSize > MinSizeForPinnedTransfer)) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%ld", totalSize);
size_t partial;
amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial);
size_t totalSize = size[0];
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (amdMemory == nullptr) {
// Force SW copy
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
size, entire, copyMetadata);
synchronize();
return result;
}
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
size, entire, copyMetadata);
}
// Readjust destination offset
const amd::Coord3D srcOrigin(partial);
if (!result) {
// Blit copy using a staging resource
address dstAddr = gpuMem(dstMemory).getDeviceMemory();
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
amd::Coord3D srcOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
// Get device memory for this virtual device
Memory* srcMemory = dev().getRocMemory(amdMemory);
// Copy buffer
result = copyBuffer(*srcMemory, dstMemory, srcOrigin, origin, size, entire, copyMetadata);
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
} else {
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
size, entire, copyMetadata);
}
if (!result) {
// Blit copy using a staging resource
address dstAddr = gpuMem(dstMemory).getDeviceMemory();
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
amd::Coord3D srcOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
// Get an address from managed staging buffer
address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize));
dstAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
"dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
// No cache flush is needed here as we use a staging buffer, and the acquire logic
// ensures that the cacheline is different and re-used only when L2 is flushed
result = shaderCopyBuffer(dstAddr, stagingBuffer,
origin, srcOrigin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata);
if (!result) {
break;
}
totalSize -= copySize;
stagedCopyOffset += copySize;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
// Get an address from managed staging buffer
address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize));
dstAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
"dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
// No cache flush is needed here as we use a staging buffer, and the acquire logic
// ensures that the cacheline is different and re-used only when L2 is flushed
result = shaderCopyBuffer(dstAddr, stagingBuffer,
origin, srcOrigin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata);
if (!result) {
break;
}
totalSize -= copySize;
stagedCopyOffset += copySize;
}
}
}
+28 -10
View File
@@ -208,7 +208,6 @@ class DmaBlitManager : public device::HostBlitManager {
}
protected:
static constexpr uint MaxPinnedBuffers = 4;
//! Synchronizes the blit operations if necessary
inline void synchronize() const;
@@ -237,7 +236,10 @@ class DmaBlitManager : public device::HostBlitManager {
const_address src, hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const;
const size_t MinSizeForPinnedTransfer;
const size_t PinXferSize; //!< Chunk size limit for a pinned copy
const size_t MinSizeForPinnedXfer; //!< Minimum copy size for Pinned Copy; only larger copies are pinned
const size_t StagingXferSize; //!< Chunk size limit for a staging buffer copy
bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask
@@ -250,14 +252,30 @@ class DmaBlitManager : public device::HostBlitManager {
//! Disable operator=
DmaBlitManager& operator=(const DmaBlitManager&);
//! Assits in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
address hostDst, //!< Destination buffer address for copying
size_t size, //!< Size of data to copy in bytes
bool hostToDev, //!< True if data is copied from H2D
amd::CopyMetadata& copyMetadata //!< Memory copy MetaData
) const;
//! Copies data between host and device memory in chunks, using either
//! pinned host memory or a staging buffer for each chunk
bool hsaCopyStagedOrPinned(const_address hostSrc, //!< Source buffer address
address hostDst, //!< Destination buffer address
size_t size, //!< Size of copy data in bytes
bool hostToDev, //!< True for H2D copy, false for D2H copy
amd::CopyMetadata& copyMetadata, //!< Copy metadata
bool enPinning = false //!< True to allow pinning of the host memory
) const;
//! Describes the buffer chosen for one chunk of a staged or pinned copy
struct BufferState{
address buffer_; //!< Staging Buffer or Pinned Host Mem Address
amd::Memory* pinnedMem_; //!< Pinned memory object; nullptr when a staging buffer is used
size_t copySize_; //!< Size in bytes of the chunk covered by buffer_
};
//! Get Pinned Host Memory or Staging Buffer for one chunk of a copy
void getBuffer(const_address hostMem, //!< Host Mem Address
size_t size, //!< Transfer Size
bool enablePin, //!< True when Pinning is enabled
bool first_tx, //!< True for first copy
BufferState &buffer //!< Output: buffer address, chunk size and pinned memory (if any)
) const;
//! Release Pinned host memory held by the buffer state, if any
void releaseBuffer(BufferState &buff //!< Buffer state whose pinned memory, if set, is released
) const;
};
//! Kernel Blit Manager
-5
View File
@@ -117,7 +117,6 @@ bool Settings::create(bool fullProfile, const amd::Isa& isa,
apuSystem_ = true;
} else {
pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_);
stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki);
}
enableXNACK_ = enableXNACK;
hsailExplicitXnack_ = enableXNACK;
@@ -209,10 +208,6 @@ void Settings::override() {
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
}
if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) {
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
}
if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
case 0: