SWDEV-483586 - Unblock staging H2D transfers

Although unpinned copies require synchronizations
in HIP, runtime can avoid syncs for H2D copies with
a staging buffer

Change-Id: If2203c6bc0cbd89742823688dc8e89e9acd873b2
Этот коммит содержится в:
German Andryeyev
2024-09-06 18:16:04 -04:00
родитель 2d1c6ee23e
Коммит 29cc678d8d
10 изменённых файлов: 119 добавлений и 64 удалений
+3 -1
Просмотреть файл
@@ -599,7 +599,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
return hipSuccess;
} else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
((srcMemory != nullptr) && (dstMemory == nullptr))) {
isHostAsync = false;
// Don't wait for unpinned H2D copy if staging is used for copy
isHostAsync &= ((srcMemory == nullptr) && (dstMemory != nullptr) && AMD_DIRECT_DISPATCH &&
(sizeBytes <= stream.device().settings().stagedXferSize_)) ? true : false;
} else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) &
srcMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
+2
Просмотреть файл
@@ -707,6 +707,8 @@ class Settings : public amd::HeapObject {
//! Enable the specified extension
void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
size_t stagedXferSize_ = 0; //!< Staged buffer size
private:
//! Disable copy constructor
Settings(const Settings&);
-1
Просмотреть файл
@@ -98,7 +98,6 @@ class Settings : public device::Settings {
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t cpDmaCopySizeMax_; //!< Threshold for CP DMA path in copy
+11 -11
Просмотреть файл
@@ -217,11 +217,10 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
}
// ================================================================================================
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf,
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
size_t origin, size_t& offset, size_t& totalSize,
size_t xferSize) const {
address dst = dstMemory.getDeviceMemory();
address staging = xferBuf.getDeviceMemory();
// Copy data from host to device
dst += origin + offset;
@@ -308,16 +307,15 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
}
if (dstSize != 0) {
Memory& xferBuf = dev().xferWrite().acquire();
address staging = gpu().Staging().Acquire(
std::min(dstSize, dev().settings().stagedXferSize_));
// Write memory using a staging resource
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
dstSize)) {
LogError("DmaBlitManager::writeBuffer failed!");
return false;
}
gpu().addXferWrite(xferBuf);
}
}
@@ -338,8 +336,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
copyMetadata);
} else {
Memory& xferBuf = dev().xferWrite().acquire();
address staging = xferBuf.getDeviceMemory();
address staging = gpu().Staging().Acquire(
std::min(size[0], dev().settings().stagedXferSize_));
address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
size_t srcOffset;
@@ -358,7 +356,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
}
}
}
gpu().addXferWrite(xferBuf);
}
return true;
@@ -780,7 +777,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
address staging, bool hostToDev) const {
// Stall GPU, sicne CPU copy is possible
gpu().releaseGpuMemoryFence();
gpu().releaseGpuMemoryFence(hostToDev);
// No allocation is necessary for Full Profile
hsa_status_t status;
@@ -826,8 +823,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
}
gpu().Barriers().WaitCurrent();
totalSize -= size;
if (totalSize > 0) {
// Wait if there are extra copies, which don't fit in a single staging buffer
gpu().Barriers().WaitCurrent();
}
offset += size;
continue;
}
+1 -1
Просмотреть файл
@@ -261,7 +261,7 @@ class DmaBlitManager : public device::HostBlitManager {
//! Write into video memory, using a staged buffer
bool writeMemoryStaged(const void* srcHost, //!< Source host memory
Memory& dstMemory, //!< Destination memory object
Memory& xferBuf, //!< Staged buffer for write
address staging, //!< Staged buffer for write
size_t origin, //!< Original offset in the destination memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for the copy region
-11
Просмотреть файл
@@ -179,7 +179,6 @@ Device::Device(hsa_agent_t bkendDevice)
, alloc_granularity_(0)
, xferQueue_(nullptr)
, xferRead_(nullptr)
, xferWrite_(nullptr)
, freeMem_(0)
, vgpusAccess_(true) /* Virtual GPU List Ops Lock */
, hsa_exclusive_gpu_access_(false)
@@ -290,7 +289,6 @@ Device::~Device() {
// Destroy temporary buffers for read/write
delete xferRead_;
delete xferWrite_;
// Destroy transfer queue
delete xferQueue_;
@@ -823,15 +821,6 @@ bool Device::create() {
mapCache_->push_back(nullptr);
if (settings().stagedXferSize_ != 0) {
// Initialize staged write buffers
if (settings().stagedXferWrite_) {
xferWrite_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
LogError("Couldn't allocate transfer buffer objects for read");
return false;
}
}
// Initialize staged read buffers
if (settings().stagedXferRead_) {
xferRead_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
-4
Просмотреть файл
@@ -520,9 +520,6 @@ class Device : public NullDevice {
//! Adds a map target to the cache
bool addMapTarget(amd::Memory* memory) const;
//! Returns transfer buffer object
XferBuffers& xferWrite() const { return *xferWrite_; }
//! Returns transfer buffer object
XferBuffers& xferRead() const { return *xferRead_; }
@@ -653,7 +650,6 @@ class Device : public NullDevice {
VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
XferBuffers* xferRead_; //!< Transfer buffers read
XferBuffers* xferWrite_; //!< Transfer buffers write
std::atomic<size_t> freeMem_; //!< Total of free memory available
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode
-1
Просмотреть файл
@@ -68,7 +68,6 @@ class Settings : public device::Settings {
uint numWaitEvents_; //!< The number of wait events for device enqueue
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+70 -26
Просмотреть файл
@@ -1065,7 +1065,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
fence_dirty_ = true;
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (!skipSignal) {
if (!skipSignal && (signal.handle == 0)) {
// Get active signal for current dispatch if profiling is necessary
barrier_packet_.completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
@@ -1188,9 +1188,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
// ================================================================================================
void VirtualGPU::ResetQueueStates() {
// Release all transfer buffers on this command queue
releaseXferWrite();
// Release all memory dependencies
memoryDependency().clear();
@@ -1234,6 +1231,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerSignal_({0}),
barriers_(*this),
kernarg_pool_signal_(KernelArgPoolNumSignal),
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
cuMask_(cuMask),
priority_(priority),
copy_command_type_(0),
@@ -1385,9 +1383,77 @@ bool VirtualGPU::create() {
LogError("Could not create signal for copy queue!");
return false;
}
// Create managed buffer for staging copies
if (!managed_buffer_.Create()) {
LogError("Could not create managed buffer for this queue!");
return false;
}
return true;
}
// ================================================================================================
VirtualGPU::ManagedBuffer::~ManagedBuffer() {
for (auto& it : pool_signal_) {
if (it.handle != 0) {
hsa_signal_destroy(it);
}
}
if (pool_base_ != nullptr) {
gpu_.dev().hostFree(pool_base_, pool_size_);
}
}
// ================================================================================================
bool VirtualGPU::ManagedBuffer::Create() {
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
active_chunk_ = 0;
// Allocate memory for managed buffer
pool_base_ = reinterpret_cast<address>(
gpu_.dev().hostAlloc(pool_size_, 0, Device::MemorySegment::kNoAtomics));
if (pool_base_ == nullptr) {
return false;
}
hsa_agent_t agent = gpu_.dev().getBackendDevice();
for (auto& it : pool_signal_) {
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &it)) {
return false;
}
}
return true;
}
// ================================================================================================
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
address result = nullptr;
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
const size_t pool_new_usage = (result + size) - pool_base_;
if (pool_new_usage <= pool_chunk_end_) {
pool_cur_offset_ = pool_new_usage;
return result;
} else {
// Reset the signal for the barrier packet
hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
// Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
constexpr bool kSkipSignal = false;
// Dispatch a barrier packet into the queue
gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
// Get the next chunk
active_chunk_ = ++active_chunk_ % kPoolNumSignals;
// Make sure the new active chunk is free
bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
assert(test && "Runtime can't fail a wait for chunk!");
// Make sure the current offset matches the new chunk to avoid possible overlaps
// between chunks and issues during recycle
pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals;
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
pool_cur_offset_ = (result + size) - pool_base_;
}
return result;
}
// ================================================================================================
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
@@ -3562,28 +3628,6 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
releasePinnedMem();
}
// ================================================================================================
void VirtualGPU::addXferWrite(Memory& memory) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (xferWriteBuffers_.size() > 7) {
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
}
// Delay destruction
xferWriteBuffers_.push_back(&memory);
}
// ================================================================================================
void VirtualGPU::releaseXferWrite() {
for (auto& memory : xferWriteBuffers_) {
dev().xferWrite().release(*this, *memory);
}
xferWriteBuffers_.resize(0);
}
// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+32 -8
Просмотреть файл
@@ -185,6 +185,31 @@ class Timestamp : public amd::ReferenceCountedObject {
class VirtualGPU : public device::VirtualDevice {
public:
class ManagedBuffer : public amd::EmbeddedObject {
public:
//! The number of chunks the arg pool will be divided
static constexpr uint32_t kPoolNumSignals = 4;
ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size)
: gpu_(gpu)
, pool_size_(pool_size)
, pool_signal_(kPoolNumSignals) {}
~ManagedBuffer();
//! Allocates all necessary resources to manage memory
bool Create();
//! Acquires memory for use on the gpu
address Acquire(uint32_t size);
private:
VirtualGPU& gpu_; //!< Queue object for ROCm device
address pool_base_ = nullptr; //!< Memory pool base address
uint32_t pool_size_; //!< Memory pool base size
uint32_t pool_chunk_end_ = 0; //!< The end offset of the current chunk
uint32_t active_chunk_ = 0; //!< The index of the current active chunk
uint32_t pool_cur_offset_ = 0; //!< Current active offset for update
std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks
};
class MemoryDependency : public amd::EmbeddedObject {
public:
//! Default constructor
@@ -386,11 +411,8 @@ class VirtualGPU : public device::VirtualDevice {
std::vector<device::Memory*>& wrtBackImageBuffer //!< Images for writeback
);
//! Adds a stage write buffer into a list
void addXferWrite(Memory& memory);
//! Releases stage write buffers
void releaseXferWrite();
//! Returns a managed buffer for staging copies
ManagedBuffer& Staging() { return managed_buffer_; }
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
@@ -422,6 +444,7 @@ class VirtualGPU : public device::VirtualDevice {
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
// } roc OpenCL integration
private:
//! Dispatches a barrier with blocking HSA signals
@@ -437,10 +460,10 @@ class VirtualGPU : public device::VirtualDevice {
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
hsa_signal_t signal = hsa_signal_t{0});
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
hsa_signal_t signal = hsa_signal_t{0});
void dispatchBarrierValuePacket(uint16_t packetHeader,
bool resolveDepSignal = false,
hsa_signal_t signal = hsa_signal_t{0},
@@ -499,7 +522,6 @@ class VirtualGPU : public device::VirtualDevice {
//! Resets the current queue state. Note: should be called after AQL queue becomes idle
void ResetQueueStates();
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
//! Queue state flags
@@ -549,6 +571,8 @@ class VirtualGPU : public device::VirtualDevice {
std::vector<hsa_signal_t> kernarg_pool_signal_; //!< Pool of HSA signals to manage
//!< multiple chunks
ManagedBuffer managed_buffer_; //!< Memory manager for staging copies
friend class Timestamp;
// PM4 packet for gfx8 performance counter