SWDEV-483586 - Unblock staging H2D transfers
Although unpinned copies require synchronizations
in HIP, runtime can avoid syncs for H2D copies with
a staging buffer
Change-Id: If2203c6bc0cbd89742823688dc8e89e9acd873b2
[ROCm/clr commit: 29cc678d8d]
Этот коммит содержится в:
@@ -599,7 +599,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
|
||||
return hipSuccess;
|
||||
} else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
|
||||
((srcMemory != nullptr) && (dstMemory == nullptr))) {
|
||||
isHostAsync = false;
|
||||
// Don't wait for unpinned H2D copy if staging is used for copy
|
||||
isHostAsync &= ((srcMemory == nullptr) && (dstMemory != nullptr) && AMD_DIRECT_DISPATCH &&
|
||||
(sizeBytes <= stream.device().settings().stagedXferSize_)) ? true : false;
|
||||
} else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
|
||||
hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) &
|
||||
srcMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
|
||||
|
||||
@@ -707,6 +707,8 @@ class Settings : public amd::HeapObject {
|
||||
//! Enable the specified extension
|
||||
void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
|
||||
|
||||
size_t stagedXferSize_ = 0; //!< Staged buffer size
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Settings(const Settings&);
|
||||
|
||||
@@ -98,7 +98,6 @@ class Settings : public device::Settings {
|
||||
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
|
||||
uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
size_t cpDmaCopySizeMax_; //!< Threshold for CP DMA path in copy
|
||||
|
||||
@@ -217,11 +217,10 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf,
|
||||
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
|
||||
size_t origin, size_t& offset, size_t& totalSize,
|
||||
size_t xferSize) const {
|
||||
address dst = dstMemory.getDeviceMemory();
|
||||
address staging = xferBuf.getDeviceMemory();
|
||||
|
||||
// Copy data from host to device
|
||||
dst += origin + offset;
|
||||
@@ -308,16 +307,15 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
}
|
||||
|
||||
if (dstSize != 0) {
|
||||
Memory& xferBuf = dev().xferWrite().acquire();
|
||||
address staging = gpu().Staging().Acquire(
|
||||
std::min(dstSize, dev().settings().stagedXferSize_));
|
||||
|
||||
// Write memory using a staging resource
|
||||
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
|
||||
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
|
||||
dstSize)) {
|
||||
LogError("DmaBlitManager::writeBuffer failed!");
|
||||
return false;
|
||||
}
|
||||
|
||||
gpu().addXferWrite(xferBuf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,8 +336,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
|
||||
copyMetadata);
|
||||
} else {
|
||||
Memory& xferBuf = dev().xferWrite().acquire();
|
||||
address staging = xferBuf.getDeviceMemory();
|
||||
address staging = gpu().Staging().Acquire(
|
||||
std::min(size[0], dev().settings().stagedXferSize_));
|
||||
address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
|
||||
|
||||
size_t srcOffset;
|
||||
@@ -358,7 +356,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
}
|
||||
}
|
||||
}
|
||||
gpu().addXferWrite(xferBuf);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -780,7 +777,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
|
||||
address staging, bool hostToDev) const {
|
||||
// Stall GPU, sicne CPU copy is possible
|
||||
gpu().releaseGpuMemoryFence();
|
||||
gpu().releaseGpuMemoryFence(hostToDev);
|
||||
|
||||
// No allocation is necessary for Full Profile
|
||||
hsa_status_t status;
|
||||
@@ -826,8 +823,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
gpu().Barriers().WaitCurrent();
|
||||
totalSize -= size;
|
||||
if (totalSize > 0) {
|
||||
// Wait if there are extra copies, which don't fit in a single staging buffer
|
||||
gpu().Barriers().WaitCurrent();
|
||||
}
|
||||
offset += size;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -261,7 +261,7 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
//! Write into video memory, using a staged buffer
|
||||
bool writeMemoryStaged(const void* srcHost, //!< Source host memory
|
||||
Memory& dstMemory, //!< Destination memory object
|
||||
Memory& xferBuf, //!< Staged buffer for write
|
||||
address staging, //!< Staged buffer for write
|
||||
size_t origin, //!< Original offset in the destination memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for the copy region
|
||||
|
||||
@@ -179,7 +179,6 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
, alloc_granularity_(0)
|
||||
, xferQueue_(nullptr)
|
||||
, xferRead_(nullptr)
|
||||
, xferWrite_(nullptr)
|
||||
, freeMem_(0)
|
||||
, vgpusAccess_(true) /* Virtual GPU List Ops Lock */
|
||||
, hsa_exclusive_gpu_access_(false)
|
||||
@@ -290,7 +289,6 @@ Device::~Device() {
|
||||
|
||||
// Destroy temporary buffers for read/write
|
||||
delete xferRead_;
|
||||
delete xferWrite_;
|
||||
|
||||
// Destroy transfer queue
|
||||
delete xferQueue_;
|
||||
@@ -823,15 +821,6 @@ bool Device::create() {
|
||||
mapCache_->push_back(nullptr);
|
||||
|
||||
if (settings().stagedXferSize_ != 0) {
|
||||
// Initialize staged write buffers
|
||||
if (settings().stagedXferWrite_) {
|
||||
xferWrite_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
|
||||
if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
|
||||
LogError("Couldn't allocate transfer buffer objects for read");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize staged read buffers
|
||||
if (settings().stagedXferRead_) {
|
||||
xferRead_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
|
||||
|
||||
@@ -520,9 +520,6 @@ class Device : public NullDevice {
|
||||
//! Adds a map target to the cache
|
||||
bool addMapTarget(amd::Memory* memory) const;
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferWrite() const { return *xferWrite_; }
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferRead() const { return *xferRead_; }
|
||||
|
||||
@@ -653,7 +650,6 @@ class Device : public NullDevice {
|
||||
VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
|
||||
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
XferBuffers* xferWrite_; //!< Transfer buffers write
|
||||
std::atomic<size_t> freeMem_; //!< Total of free memory available
|
||||
mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
||||
bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode
|
||||
|
||||
@@ -68,7 +68,6 @@ class Settings : public device::Settings {
|
||||
uint numWaitEvents_; //!< The number of wait events for device enqueue
|
||||
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
|
||||
|
||||
@@ -1065,7 +1065,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
fence_dirty_ = true;
|
||||
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
if (!skipSignal) {
|
||||
if (!skipSignal && (signal.handle == 0)) {
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
barrier_packet_.completion_signal =
|
||||
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
@@ -1188,9 +1188,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::ResetQueueStates() {
|
||||
// Release all transfer buffers on this command queue
|
||||
releaseXferWrite();
|
||||
|
||||
// Release all memory dependencies
|
||||
memoryDependency().clear();
|
||||
|
||||
@@ -1234,6 +1231,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
schedulerSignal_({0}),
|
||||
barriers_(*this),
|
||||
kernarg_pool_signal_(KernelArgPoolNumSignal),
|
||||
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
|
||||
cuMask_(cuMask),
|
||||
priority_(priority),
|
||||
copy_command_type_(0),
|
||||
@@ -1385,9 +1383,77 @@ bool VirtualGPU::create() {
|
||||
LogError("Could not create signal for copy queue!");
|
||||
return false;
|
||||
}
|
||||
// Create managed buffer for staging copies
|
||||
if (!managed_buffer_.Create()) {
|
||||
LogError("Could not create managed buffer for this queue!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
VirtualGPU::ManagedBuffer::~ManagedBuffer() {
|
||||
for (auto& it : pool_signal_) {
|
||||
if (it.handle != 0) {
|
||||
hsa_signal_destroy(it);
|
||||
}
|
||||
}
|
||||
if (pool_base_ != nullptr) {
|
||||
gpu_.dev().hostFree(pool_base_, pool_size_);
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::ManagedBuffer::Create() {
|
||||
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
|
||||
active_chunk_ = 0;
|
||||
// Allocate memory for managed buffer
|
||||
pool_base_ = reinterpret_cast<address>(
|
||||
gpu_.dev().hostAlloc(pool_size_, 0, Device::MemorySegment::kNoAtomics));
|
||||
if (pool_base_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
hsa_agent_t agent = gpu_.dev().getBackendDevice();
|
||||
for (auto& it : pool_signal_) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &it)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
|
||||
auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
|
||||
address result = nullptr;
|
||||
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
|
||||
const size_t pool_new_usage = (result + size) - pool_base_;
|
||||
if (pool_new_usage <= pool_chunk_end_) {
|
||||
pool_cur_offset_ = pool_new_usage;
|
||||
return result;
|
||||
} else {
|
||||
// Reset the signal for the barrier packet
|
||||
hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
|
||||
// Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
|
||||
constexpr bool kSkipSignal = false;
|
||||
// Dispatch a barrier packet into the queue
|
||||
gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
|
||||
// Get the next chunk
|
||||
active_chunk_ = ++active_chunk_ % kPoolNumSignals;
|
||||
// Make sure the new active chunk is free
|
||||
bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
|
||||
assert(test && "Runtime can't fail a wait for chunk!");
|
||||
// Make sure the current offset matches the new chunk to avoid possible overlaps
|
||||
// between chunks and issues during recycle
|
||||
pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
|
||||
pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals;
|
||||
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
|
||||
pool_cur_offset_ = (result + size) - pool_base_;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
@@ -3562,28 +3628,6 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
|
||||
releasePinnedMem();
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::addXferWrite(Memory& memory) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
if (xferWriteBuffers_.size() > 7) {
|
||||
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
|
||||
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
|
||||
}
|
||||
|
||||
// Delay destruction
|
||||
xferWriteBuffers_.push_back(&memory);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::releaseXferWrite() {
|
||||
for (auto& memory : xferWriteBuffers_) {
|
||||
dev().xferWrite().release(*this, *memory);
|
||||
}
|
||||
xferWriteBuffers_.resize(0);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
|
||||
@@ -185,6 +185,31 @@ class Timestamp : public amd::ReferenceCountedObject {
|
||||
|
||||
class VirtualGPU : public device::VirtualDevice {
|
||||
public:
|
||||
class ManagedBuffer : public amd::EmbeddedObject {
|
||||
public:
|
||||
//! The number of chunks the arg pool will be divided
|
||||
static constexpr uint32_t kPoolNumSignals = 4;
|
||||
ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size)
|
||||
: gpu_(gpu)
|
||||
, pool_size_(pool_size)
|
||||
, pool_signal_(kPoolNumSignals) {}
|
||||
~ManagedBuffer();
|
||||
|
||||
//! Allocates all necessary resources to manage memory
|
||||
bool Create();
|
||||
|
||||
//! Acquires memory for use on the gpu
|
||||
address Acquire(uint32_t size);
|
||||
|
||||
private:
|
||||
VirtualGPU& gpu_; //!< Queue object for ROCm device
|
||||
address pool_base_ = nullptr; //!< Memory pool base address
|
||||
uint32_t pool_size_; //!< Memory pool base size
|
||||
uint32_t pool_chunk_end_ = 0; //!< The end offset of the current chunk
|
||||
uint32_t active_chunk_ = 0; //!< The index of the current active chunk
|
||||
uint32_t pool_cur_offset_ = 0; //!< Current active offset for update
|
||||
std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks
|
||||
};
|
||||
class MemoryDependency : public amd::EmbeddedObject {
|
||||
public:
|
||||
//! Default constructor
|
||||
@@ -386,11 +411,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
std::vector<device::Memory*>& wrtBackImageBuffer //!< Images for writeback
|
||||
);
|
||||
|
||||
//! Adds a stage write buffer into a list
|
||||
void addXferWrite(Memory& memory);
|
||||
|
||||
//! Releases stage write buffers
|
||||
void releaseXferWrite();
|
||||
//! Returns a managed buffer for staging copies
|
||||
ManagedBuffer& Staging() { return managed_buffer_; }
|
||||
|
||||
//! Adds a pinned memory object into a map
|
||||
void addPinnedMem(amd::Memory* mem);
|
||||
@@ -422,6 +444,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
|
||||
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
|
||||
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
@@ -437,10 +460,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking);
|
||||
|
||||
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
|
||||
hsa_signal_t signal = hsa_signal_t{0});
|
||||
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
|
||||
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
|
||||
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
|
||||
hsa_signal_t signal = hsa_signal_t{0});
|
||||
void dispatchBarrierValuePacket(uint16_t packetHeader,
|
||||
bool resolveDepSignal = false,
|
||||
hsa_signal_t signal = hsa_signal_t{0},
|
||||
@@ -499,7 +522,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Resets the current queue state. Note: should be called after AQL queue becomes idle
|
||||
void ResetQueueStates();
|
||||
|
||||
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
//! Queue state flags
|
||||
@@ -549,6 +571,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
std::vector<hsa_signal_t> kernarg_pool_signal_; //!< Pool of HSA signals to manage
|
||||
//!< multiple chunks
|
||||
|
||||
ManagedBuffer managed_buffer_; //!< Memory manager for staging copies
|
||||
|
||||
friend class Timestamp;
|
||||
|
||||
// PM4 packet for gfx8 performance counter
|
||||
|
||||
Ссылка в новой задаче
Block a user