From 141d36d8491dd9cdecb0c2f548b8dd51ff13f898 Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 20 Apr 2018 17:08:29 -0400
Subject: [PATCH] P4 to Git Change 1544622 by gandryey@gera-w8 on 2018/04/20
17:02:52
SWDEV-79445 - OCL generic changes and code clean-up
- Add managed buffer support and replace all uploads with the managed buffer allocations
- Add staging copy for small image writes
- Replace constant buffer in FillBuffer with a managed buffer also
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#84 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#26 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#92 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#48 edit
[ROCm/clr commit: 392724cc3f85cee05249e622e34564e51706e340]
---
.../clr/rocclr/runtime/device/pal/palblit.cpp | 113 ++++++++----------
.../clr/rocclr/runtime/device/pal/palblit.hpp | 5 +-
.../rocclr/runtime/device/pal/palconstbuf.cpp | 59 ++++++++-
.../rocclr/runtime/device/pal/palconstbuf.hpp | 57 +++++++--
.../rocclr/runtime/device/pal/paldevice.cpp | 17 ---
.../rocclr/runtime/device/pal/paldevice.hpp | 4 -
.../rocclr/runtime/device/pal/palprogram.cpp | 6 +-
.../rocclr/runtime/device/pal/palresource.cpp | 15 +--
.../rocclr/runtime/device/pal/palvirtual.cpp | 34 +-----
.../rocclr/runtime/device/pal/palvirtual.hpp | 13 +-
10 files changed, 175 insertions(+), 148 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index c32a6aa687..e392c018f2 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -256,34 +256,36 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const a
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf,
size_t origin, size_t& offset, size_t& totalSize,
size_t xferSize) const {
- amd::Coord3D src(0, 0, 0);
size_t chunkSize;
static const bool CopyRect = false;
// Flush DMA for ASYNC copy
// @todo Blocking write requires a flush to start earlier,
// but currently VDI doesn't provide that info
- static const bool FlushDMA = false;
+ bool flushDMA = false;
- if (dev().xferRead().bufSize() < 128 * Ki) {
- chunkSize = dev().xferWrite().bufSize();
+ if (gpu().xferWrite().MaxSize() < 128 * Ki) {
+ chunkSize = gpu().xferWrite().MaxSize();
} else {
- chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferWrite().bufSize());
+ chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
chunkSize = std::max(chunkSize, 128 * Ki);
+ flushDMA = true;  // assign the flag declared above; redeclaring it here would shadow it
}
while (xferSize != 0) {
// Find the partial transfer size
size_t tmpSize = std::min(chunkSize, xferSize);
+ amd::Coord3D src(offset, 0, 0);
amd::Coord3D dst(origin + offset, 0, 0);
amd::Coord3D copySize(tmpSize, 0, 0);
// Copy data into the temporary buffer, using CPU
- if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize)) {
+ if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset,
+ src, copySize, Resource::NoWait)) {
return false;
}
// Copy data into the original destination memory
- if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) {
+ if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, flushDMA)) {
return false;
}
@@ -365,7 +367,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
}
if (dstSize != 0) {
- Memory& xferBuf = dev().xferWrite().acquire();
+ Memory& xferBuf = gpu().xferWrite().Acquire(dstSize);
// Write memory using a staged resource
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
@@ -374,7 +376,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
return false;
}
- gpu().addXferWrite(xferBuf);
+ gpu().xferWrite().Release(xferBuf);
}
}
@@ -392,7 +394,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
} else {
- Memory& xferBuf = dev().xferWrite().acquire();
+ Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0]));
amd::Coord3D src(0, 0, 0);
size_t tmpSize = 0;
@@ -408,7 +410,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
while (dstSize != 0) {
// Find the partial transfer size
- tmpSize = std::min(dev().xferWrite().bufSize(), dstSize);
+ tmpSize = std::min(gpu().xferWrite().MaxSize(), dstSize);
amd::Coord3D dst(bufOffset, 0, 0);
amd::Coord3D copySize(tmpSize, 0, 0);
@@ -432,7 +434,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
}
}
}
- gpu().addXferWrite(xferBuf);
+ gpu().xferWrite().Release(xferBuf);
}
return true;
@@ -576,8 +578,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
entire, rowPitch, slicePitch);
} else {
// Use PAL path for a transfer
- result =
- gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
+ result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin,
+ size, gpuMem(dstMemory));
// Check if a HostBlit transfer is required
if (completeOperation_ && !result) {
@@ -607,9 +609,8 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
program_(NULL),
- constantBuffer_(NULL),
xferBufferSize_(0),
- lockXferOps_(NULL) {
+ lockXferOps_("Transfer Ops Lock", true) {
for (uint i = 0; i < BlitTotal; ++i) {
kernels_[i] = NULL;
}
@@ -636,17 +637,11 @@ KernelBlitManager::~KernelBlitManager() {
context_->release();
}
- if (NULL != constantBuffer_) {
- constantBuffer_->release();
- }
-
for (uint i = 0; i < MaxXferBuffers; ++i) {
if (NULL != xferBuffers_[i]) {
xferBuffers_[i]->release();
}
}
-
- delete lockXferOps_;
}
bool KernelBlitManager::create(amd::Device& device) {
@@ -693,19 +688,6 @@ bool KernelBlitManager::createProgram(Device& device) {
result = true;
} while (!result);
- // Create an internal constant buffer
- constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
-
- if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) {
- constantBuffer_->release();
- constantBuffer_ = NULL;
- return false;
- } else if (constantBuffer_ == NULL) {
- return false;
- }
-
- // Assign the constant buffer to the current virtual GPU
- constantBuffer_->setVirtualDevice(&gpu());
if (dev().settings().xferBufSize_ > 0) {
xferBufferSize_ = dev().settings().xferBufSize_;
@@ -734,11 +716,6 @@ bool KernelBlitManager::createProgram(Device& device) {
}
}
- lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
- if (NULL == lockXferOps_) {
- return false;
- }
-
return result;
}
@@ -1685,30 +1662,43 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
} else {
size_t pinSize;
FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
+ size_t partial = 0;
+ bool pinned;
- size_t partial;
- amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
-
- if (amdMemory == NULL) {
- // Force SW copy
- result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
- entire);
- synchronize();
- return result;
+ amd::Memory* amdMemory = nullptr;
+ Memory* srcMemory;
+ if (pinSize > gpu().xferWrite().MaxSize()) {
+ amdMemory = pinHostMemory(srcHost, pinSize, partial);
+ if (amdMemory == nullptr) {
+ // Force SW copy
+ result = HostBlitManager::writeImage(srcHost, dstMemory,
+ origin, size, rowPitch, slicePitch, entire);
+ synchronize();
+ return result;
+ }
+ // Get device memory for this virtual device
+ srcMemory = dev().getGpuMemory(amdMemory);
+ pinned = true;
+ }
+ else {
+ srcMemory = &gpu().xferWrite().Acquire(pinSize);
+ srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait);
+ pinned = false;
}
// Readjust destination offset
const amd::Coord3D srcOrigin(partial);
- // Get device memory for this virtual device
- Memory* srcMemory = dev().getGpuMemory(amdMemory);
-
// Copy image to buffer
result = copyBufferToImage(*srcMemory, dstMemory, srcOrigin, origin, size, entire, rowPitch,
slicePitch);
- // Add pinned memory for a later release
- gpu().addPinnedMem(amdMemory);
+ if (pinned) {
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
+ } else {
+ gpu().xferWrite().Release(*srcMemory);
+ }
}
synchronize();
@@ -2054,14 +2044,12 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
}
- Memory* gpuCB = dev().getGpuMemory(constantBuffer_);
- if (gpuCB == NULL) {
- return false;
- }
- void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly);
+ Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
+ void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
memcpy(constBuf, pattern, patternSize);
- gpuCB->unmap(&gpu());
- setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);
+ gpuCB.unmap(&gpu());
+ Memory* pGpuCB = &gpuCB;
+ setArgument(kernels_[fillType], 2, sizeof(cl_mem), &pGpuCB);
cl_ulong offset = origin[0];
if (dwordAligned) {
patternSize /= sizeof(uint32_t);
@@ -2077,6 +2065,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
// Execute the blit
address parameters = kernels_[fillType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
+ gpu().xferWrite().Release(gpuCB);
}
synchronize();
@@ -2137,12 +2126,10 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
// Program source origin
cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
- ;
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
// Program destinaiton origin
cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
- ;
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
cl_ulong copySize = size[0];
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
index ece29da86d..fe52ac2a59 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
@@ -352,7 +352,7 @@ class KernelBlitManager : public DmaBlitManager {
const void* data //!< Raw data pointer
) const;
- virtual amd::Monitor* lockXfer() const { return lockXferOps_; }
+ virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
private:
static const size_t MaxXferBuffers = 2;
@@ -397,10 +397,9 @@ class KernelBlitManager : public DmaBlitManager {
amd::Program* program_; //!< GPU program obejct
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
- amd::Memory* constantBuffer_; //!< An internal CB for blits
amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
size_t xferBufferSize_; //!< Transfer buffer size
- amd::Monitor* lockXferOps_; //!< Lock transfer operation
+ mutable amd::Monitor lockXferOps_; //!< Lock transfer operation
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
index b5a266fe43..bffa902e0a 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -19,7 +19,7 @@ ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
, wrtAddress_(nullptr) {}
// ================================================================================================
-ManagedBuffer::~ManagedBuffer() {
+void ManagedBuffer::release() {
for (auto it : buffers_) {
if (it->data() != nullptr) {
it->unmap(&gpu_);
@@ -72,13 +72,26 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
*gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_;
address cpu_address = wrtAddress_ + wrtOffset_;
-
+
// Adjust the offset by the reserved size
wrtOffset_ += count;
return cpu_address;
}
+// ================================================================================================
+Memory& ManagedBuffer::reserveAtTheTop(uint32_t size)  // NOTE: size is not read by the body
+{
+ // Advance to the next buffer in the list, wrapping around at MaxNumberOfBuffers
+ ++activeBuffer_;
+ activeBuffer_ %= MaxNumberOfBuffers;
+ // Make sure the buffer isn't busy
+ buffers_[activeBuffer_]->wait(gpu_);
+ wrtAddress_ = buffers_[activeBuffer_]->data();
+ wrtOffset_ = 0;  // Writes restart at offset 0 of the newly selected buffer
+ return *buffers_[activeBuffer_];  // Hands back the whole buffer object, not a view
+}
+
// ================================================================================================
ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
: mbuf_(mbuf)
@@ -114,11 +127,47 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
+ uint64_t vm_address;
+ address cpu_address = mbuf_.reserve(size, &vm_address);
+ // Update memory with new CB data
+ memcpy(cpu_address, sysmem, size);
+ return vm_address;
+}
+
+// ================================================================================================
+XferBuffer::XferBuffer(ManagedBuffer& mbuf, uint32_t size)
+ : mbuf_(mbuf)
+ , size_(size)
+{}
+
+// ================================================================================================
+Memory& XferBuffer::Acquire(uint32_t size) const
+{
uint64_t vm_address;
+ // Reserve space in the managed buffer
address cpu_address = mbuf_.reserve(size, &vm_address);
- // Update memory with new CB data
- memcpy(cpu_address, sysmem, size);
- return vm_address;
+ // Create a view over the reserved range; the caller returns it through Release()
+ Memory* mem = new Memory(mbuf_.gpu().dev(), static_cast(size));
+ Resource::ViewParams params = {};
+ params.gpu_ = &mbuf_.gpu();
+ params.offset_ = vm_address - mbuf_.vmAddress();
+ params.size_ = size;
+ params.resource_ = mbuf_.activeMemory();
+ if (nullptr == mem || !mem->create(Resource::View, &params)) {
+ delete mem;
+ // If the suballocation failed for some reason, then return the top of the active buffer
+ return mbuf_.reserveAtTheTop(size);
+ }
+ return *mem;
+}
+
+// ================================================================================================
+void XferBuffer::Release(Memory& mem) const
+{
+ // Delete view
+ if (mem.desc().type_ == Resource::View) {
+ delete &mem;
+ }
}
} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
index 5b6cb2af1b..5ab7d5d67f 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -9,19 +9,20 @@
namespace pal {
//! Managed buffer (staging or constant)
-class ManagedBuffer : public amd::HeapObject {
+class ManagedBuffer : public amd::EmbeddedObject {
public:
//! Constructor for the ConstBuffer class
ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object
uint32_t size //!< size of the managed buffers in bytes
);
+ ~ManagedBuffer() {}
- //! Destructor for the ConstBuffer class
- ~ManagedBuffer();
-
- //! Creates the real HW constant buffer
+ //! Creates the managed buffers
bool create(Resource::MemoryType type);
+ //! Release the managed buffers
+ void release();
+
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
*
* \return True if the data upload was succesful
@@ -29,6 +30,9 @@ class ManagedBuffer : public amd::HeapObject {
address reserve(uint32_t size, //!< real data size for upload
uint64_t* gpu_address);
+ //! Reserves memory at the top of the active buffer
+ Memory& reserveAtTheTop(uint32_t size);
+
//! Returns CB size
uint32_t size() const { return size_; }
@@ -40,6 +44,9 @@ class ManagedBuffer : public amd::HeapObject {
uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); }
+ //! Returns VirtualGPU object this managed resource associated
+ VirtualGPU& gpu() const { return gpu_; }
+
private:
//! The maximum number of the managed buffers
static constexpr uint32_t MaxNumberOfBuffers = 3;
@@ -63,13 +70,13 @@ class ConstantBuffer : public amd::HeapObject {
public:
//! Constructor for the ConstBuffer class
ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer
- uint32_t size
+ uint32_t size //!< Max size of the constant buffer
);
//! Destructor for the ConstBuffer class
~ConstantBuffer();
- //! Creates the real HW constant buffer
+ //! Creates the HW constant buffer
bool Create();
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
@@ -88,7 +95,7 @@ public:
) const;
//! Returns a pointer to the system memory copy for CB
- address SysMemCopy(uint32_t size = 0) const { return sys_mem_copy_; }
+ address SysMemCopy() const { return sys_mem_copy_; }
//! Returns active GPU buffer
Memory* ActiveMemory() const { return mbuf_.activeMemory(); }
@@ -105,4 +112,38 @@ private:
uint32_t size_; //!< Constant buffer size
};
+//! Staging buffer
+class XferBuffer : public amd::EmbeddedObject {
+public:
+ //! Constructor for the XferBuffer class
+ XferBuffer(ManagedBuffer& mbuf, //!< Managed buffer
+ uint32_t size //!< Maximum size of the transfer buffer
+ );
+
+ //! Destructor for the XferBuffer class
+ ~XferBuffer() {}
+
+ /*! \brief Acquires free memory from the managed buffer
+ * \note The returned object should be handed back through Release()
+ * \return GPU memory object associated with free memory
+ */
+ Memory& Acquire(uint32_t size //!< data size for transfers
+ ) const;
+
+ //! Releases memory object used in the staging transfer
+ void Release(Memory& mem //!< Memory object for release
+ ) const;
+
+ size_t MaxSize() const { return static_cast(size_); } //!< Returns the max staging buffer size
+
+private:
+ //! Disable copy constructor
+ XferBuffer(const XferBuffer&) = delete;
+
+ //! Disable operator=
+ XferBuffer& operator=(const XferBuffer&) = delete;
+
+ ManagedBuffer& mbuf_; //!< Managed buffer on GPU
+ uint32_t size_; //!< Max staging buffer size
+};
/*@}*/} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index 7d25d22ae6..61bdd6e91c 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -681,7 +681,6 @@ Device::Device()
scratchAlloc_(nullptr),
mapCacheOps_(nullptr),
xferRead_(nullptr),
- xferWrite_(nullptr),
mapCache_(nullptr),
resourceCache_(nullptr),
numComputeEngines_(0),
@@ -732,7 +731,6 @@ Device::~Device() {
// Destroy temporary buffers for read/write
delete xferRead_;
- delete xferWrite_;
// Destroy resource cache
delete resourceCache_;
@@ -986,21 +984,6 @@ bool Device::initializeHeapResources() {
}
if (settings().stagedXferSize_ != 0) {
- // Initialize staged write buffers
- if (settings().stagedXferWrite_) {
- Resource::MemoryType type;
- if (settings().stagingWritePersistent_ && !settings().disablePersistent_) {
- type = Resource::Persistent;
- } else {
- type = Resource::RemoteUSWC;
- }
- xferWrite_ = new XferBuffers(*this, type, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
- if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
- LogError("Couldn't allocate transfer buffer objects for read");
- return false;
- }
- }
-
// Initialize staged read buffers
if (settings().stagedXferRead_) {
xferRead_ = new XferBuffers(*this, Resource::Remote,
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index da4dbdcae9..0d622dae45 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -396,9 +396,6 @@ class Device : public NullDevice {
pal::Memory* createScratchBuffer(size_t size //!< Size of buffer
) const;
- //! Returns transfer buffer object
- XferBuffers& xferWrite() const { return *xferWrite_; }
-
//! Returns transfer buffer object
XferBuffers& xferRead() const { return *xferRead_; }
@@ -588,7 +585,6 @@ class Device : public NullDevice {
amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation
amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
XferBuffers* xferRead_; //!< Transfer buffers read
- XferBuffers* xferWrite_; //!< Transfer buffers write
std::vector* mapCache_; //!< Map cache info structure
ResourceCache* resourceCache_; //!< Resource cache
uint numComputeEngines_; //!< The number of available compute engines
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index ec219fb011..fd0008264b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -86,19 +86,19 @@ void Segment::copy(size_t offset, const void* src, size_t size) {
if (cpuAccess_ != nullptr) {
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
} else {
+ amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
- Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire();
+ Memory& xferBuf = gpu.xferWrite().Acquire(size);
size_t tmpSize = std::min(static_cast(xferBuf.size()), size);
size_t srcOffs = 0;
while (size != 0) {
- amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
xferBuf.hostWrite(&gpu, reinterpret_cast(src) + srcOffs, 0, tmpSize);
xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true);
size -= tmpSize;
srcOffs += tmpSize;
tmpSize = std::min(static_cast(xferBuf.size()), size);
}
- gpuAccess_->dev().xferWrite().release(gpu, xferBuf);
+ gpu.xferWrite().Release(xferBuf);
gpu.waitAllEngines();
}
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index 7c41dabc0a..7b69803c4e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -1046,8 +1046,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
uint viewFlags = 0;
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
- // Set the initial offset value for any resource to 0.
- // Note: Runtime can call create() more than once, if the initial memory type failed
+ // Set the initial offset value for any resource to 0.
+ // Note: Runtime can call create() more than once, if the initial memory type failed
offset_ = 0;
// This is a thread safe operation
@@ -1096,7 +1096,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
if (!desc_.buffer_) {
return CreateImage(params);
}
-
+
if (memoryType() == Pinned) {
return CreatePinned(params);
}
@@ -1112,6 +1112,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
offset_ += viewOwner_->offset();
if (viewOwner_->data() != nullptr) {
address_ = viewOwner_->data() + view->offset_;
+ mapCount_++;
}
memRef_ = viewOwner_->memRef_;
memRef_->retain();
@@ -1177,11 +1178,6 @@ void Resource::free()
return;
}
- // Sanity check for the map calls
- if ((mapCount_ != 0) && (memoryType() != Remote) &&
- (memoryType() != RemoteUSWC) && (memoryType() != Persistent)) {
- LogWarning("Resource wasn't unlocked, but destroyed!");
- }
const bool wait =
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
@@ -1206,7 +1202,7 @@ void Resource::free()
// Destroy PAL resource
if (iMem() != 0) {
- if (mapCount_ != 0) {
+ if (mapCount_ != 0 && wait) {
if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
//! @note: This is a workaround for bad applications that don't unmap memory
unmap(nullptr);
@@ -1738,6 +1734,7 @@ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers
address_ = reinterpret_cast(memRef_->cpuAddress_) + subOffset_;
} else {
address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+ address_ = (address_ == nullptr) ? nullptr : reinterpret_cast(address_) + offset_;  // keep a failed map null for the check below
}
if (address_ == nullptr) {
LogError("cal::ResMap failed!");
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 61a40def05..01c81917fa 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -542,23 +542,6 @@ bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint
return cbReady;
}
-void VirtualGPU::addXferWrite(Memory& memory) {
- if (xferWriteBuffers_.size() > 7) {
- dev().xferWrite().release(*this, *xferWriteBuffers_.front());
- xferWriteBuffers_.erase(xferWriteBuffers_.begin());
- }
-
- // Delay destruction
- xferWriteBuffers_.push_back(&memory);
-}
-
-void VirtualGPU::releaseXferWrite() {
- for (auto& memory : xferWriteBuffers_) {
- dev().xferWrite().release(*this, *memory);
- }
- xferWriteBuffers_.resize(0);
-}
-
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
@@ -718,7 +701,8 @@ VirtualGPU::VirtualGPU(Device& device)
printfDbgHSA_(nullptr),
tsCache_(nullptr),
dmaFlushMgmt_(device),
- writeBuffer_(nullptr),
+ managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
+ writeBuffer_(managedBuffer_, device.settings().stagedXferSize_),
hwRing_(0),
readjustTimeGPU_(0),
lastTS_(nullptr),
@@ -834,10 +818,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
Unimplemented();
}
- writeBuffer_ = new ManagedBuffer(*this, dev().settings().stagedXferSize_);
- if ((writeBuffer_ == nullptr) || !writeBuffer_->create(Resource::RemoteUSWC)) {
- // We failed to create a constant buffer
- delete writeBuffer_;
+ if (!managedBuffer_.create(Resource::RemoteUSWC)) {
return false;
}
@@ -963,7 +944,7 @@ VirtualGPU::~VirtualGPU() {
delete constBufs_[i];
}
- delete writeBuffer_;
+ managedBuffer_.release();
//! @todo Temporarily keep the buffer mapped for debug purpose
if (nullptr != schedParams_) {
@@ -2758,9 +2739,6 @@ bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
earlyDone &= isDone(&events[i]);
}
- // Release all transfer buffers on this command queue
- releaseXferWrite();
-
// Rlease all pinned memory
releasePinnedMem();
@@ -2813,14 +2791,14 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
}
bool VirtualGPU::allocConstantBuffers() {
- // Allocate constant buffers.
+ // Allocate constant buffers.
// Use double size, reported to the app to account for internal arguments
const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
uint i;
// Create/reallocate constant buffer resources
for (i = 0; i < MaxConstBuffersArguments; ++i) {
- ConstantBuffer* constBuf = new ConstantBuffer(*writeBuffer_, MinCbSize);
+ ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);
if ((constBuf != nullptr) && constBuf->Create()) {
addConstBuffer(constBuf);
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 04b4facc1f..71ca26746c 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -375,8 +375,8 @@ class VirtualGPU : public device::VirtualDevice {
bool pfpaDoppCmd //!< is a submission for the pre-present primary
);
- //! Adds a stage write buffer into a list
- void addXferWrite(Memory& memory);
+ //! Return xfer buffer for staging operations
+ const XferBuffer& xferWrite() const { return writeBuffer_; }
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
@@ -518,9 +518,6 @@ class VirtualGPU : public device::VirtualDevice {
//! Allocates constant buffers
bool allocConstantBuffers();
- //! Releases stage write buffers
- void releaseXferWrite();
-
//! Allocate hsaQueueMem_
bool allocHsaQueueMem();
@@ -594,11 +591,11 @@ class VirtualGPU : public device::VirtualDevice {
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
- std::vector xferWriteBuffers_; //!< Stage write buffers
std::vector pinnedMems_; //!< Pinned memory list
- ManagedBuffer* writeBuffer_; //!< Managed write buffer
- constbufs_t constBufs_; //!< constant buffers
+ ManagedBuffer managedBuffer_; //!< Managed write buffer
+ constbufs_t constBufs_; //!< constant buffers
+ XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
typedef std::queue CommandBatchQueue;
CommandBatchQueue cbQueue_; //!< Queue of command batches