From 141d36d8491dd9cdecb0c2f548b8dd51ff13f898 Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 20 Apr 2018 17:08:29 -0400 Subject: [PATCH] P4 to Git Change 1544622 by gandryey@gera-w8 on 2018/04/20 17:02:52 SWDEV-79445 - OCL generic changes and code clean-up - Add managed buffer support and replace all uploads with the managed buffer allocations - Add staging copy for small image writes - Replace constant buffer in FillBuffer with a managed buffer also Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#84 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#26 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#62 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#92 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#48 edit [ROCm/clr commit: 392724cc3f85cee05249e622e34564e51706e340] --- .../clr/rocclr/runtime/device/pal/palblit.cpp | 113 ++++++++---------- .../clr/rocclr/runtime/device/pal/palblit.hpp | 5 +- .../rocclr/runtime/device/pal/palconstbuf.cpp | 59 ++++++++- .../rocclr/runtime/device/pal/palconstbuf.hpp | 57 +++++++-- .../rocclr/runtime/device/pal/paldevice.cpp | 17 --- .../rocclr/runtime/device/pal/paldevice.hpp | 4 - .../rocclr/runtime/device/pal/palprogram.cpp | 6 +- .../rocclr/runtime/device/pal/palresource.cpp | 15 +-- .../rocclr/runtime/device/pal/palvirtual.cpp | 34 +----- .../rocclr/runtime/device/pal/palvirtual.hpp | 13 +- 10 files changed, 175 insertions(+), 148 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index c32a6aa687..e392c018f2 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -256,34 +256,36 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const a bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf, size_t origin, size_t& offset, size_t& totalSize, size_t xferSize) const { - amd::Coord3D src(0, 0, 0); size_t chunkSize; static const bool CopyRect = false; // Flush DMA for ASYNC copy // @todo Blocking write requires a flush to start earlier, // but currently VDI doesn't provide that info - static const bool FlushDMA = false; + bool flushDMA = false; - if (dev().xferRead().bufSize() < 128 * Ki) { - chunkSize = dev().xferWrite().bufSize(); + if (gpu().xferWrite().MaxSize() < 128 * Ki) { + chunkSize = gpu().xferWrite().MaxSize(); } else { - chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferWrite().bufSize()); + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize()); chunkSize = std::max(chunkSize, 128 * Ki); + bool 
flushDMA = true; } while (xferSize != 0) { // Find the partial transfer size size_t tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D src(offset, 0, 0); amd::Coord3D dst(origin + offset, 0, 0); amd::Coord3D copySize(tmpSize, 0, 0); // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize)) { + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, + src, copySize, Resource::NoWait)) { return false; } // Copy data into the original destination memory - if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) { + if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, flushDMA)) { return false; } @@ -365,7 +367,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, } if (dstSize != 0) { - Memory& xferBuf = dev().xferWrite().acquire(); + Memory& xferBuf = gpu().xferWrite().Acquire(dstSize); // Write memory using a staged resource if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, @@ -374,7 +376,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, return false; } - gpu().addXferWrite(xferBuf); + gpu().xferWrite().Release(xferBuf); } } @@ -392,7 +394,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem gpuMem(dstMemory).isPersistentDirectMap()) { return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); } else { - Memory& xferBuf = dev().xferWrite().acquire(); + Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0])); amd::Coord3D src(0, 0, 0); size_t tmpSize = 0; @@ -408,7 +410,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem while (dstSize != 0) { // Find the partial transfer size - tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); + tmpSize = 
std::min(gpu().xferWrite().MaxSize(), dstSize); amd::Coord3D dst(bufOffset, 0, 0); amd::Coord3D copySize(tmpSize, 0, 0); @@ -432,7 +434,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem } } } - gpu().addXferWrite(xferBuf); + gpu().xferWrite().Release(xferBuf); } return true; @@ -576,8 +578,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory entire, rowPitch, slicePitch); } else { // Use PAL path for a transfer - result = - gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, + size, gpuMem(dstMemory)); // Check if a HostBlit transfer is required if (completeOperation_ && !result) { @@ -607,9 +609,8 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), program_(NULL), - constantBuffer_(NULL), xferBufferSize_(0), - lockXferOps_(NULL) { + lockXferOps_("Transfer Ops Lock", true) { for (uint i = 0; i < BlitTotal; ++i) { kernels_[i] = NULL; } @@ -636,17 +637,11 @@ KernelBlitManager::~KernelBlitManager() { context_->release(); } - if (NULL != constantBuffer_) { - constantBuffer_->release(); - } - for (uint i = 0; i < MaxXferBuffers; ++i) { if (NULL != xferBuffers_[i]) { xferBuffers_[i]->release(); } } - - delete lockXferOps_; } bool KernelBlitManager::create(amd::Device& device) { @@ -693,19 +688,6 @@ bool KernelBlitManager::createProgram(Device& device) { result = true; } while (!result); - // Create an internal constant buffer - constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); - - if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { - constantBuffer_->release(); - constantBuffer_ = NULL; - return false; - } else if (constantBuffer_ == NULL) { - return false; - } - - // Assign the constant buffer to the current 
virtual GPU - constantBuffer_->setVirtualDevice(&gpu()); if (dev().settings().xferBufSize_ > 0) { xferBufferSize_ = dev().settings().xferBufSize_; @@ -734,11 +716,6 @@ bool KernelBlitManager::createProgram(Device& device) { } } - lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); - if (NULL == lockXferOps_) { - return false; - } - return result; } @@ -1685,30 +1662,43 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor } else { size_t pinSize; FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); + size_t partial = 0; + bool pinned; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, - entire); - synchronize(); - return result; + amd::Memory* amdMemory = nullptr; + Memory* srcMemory; + if (pinSize > gpu().xferWrite().MaxSize()) { + amdMemory = pinHostMemory(srcHost, pinSize, partial); + if (amdMemory == nullptr) { + // Force SW copy + result = HostBlitManager::writeImage(srcHost, dstMemory, + origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } + // Get device memory for this virtual device + srcMemory = dev().getGpuMemory(amdMemory); + pinned = true; + } + else { + srcMemory = &gpu().xferWrite().Acquire(pinSize); + srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait); + pinned = false; } // Readjust destination offset const amd::Coord3D srcOrigin(partial); - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - // Copy image to buffer result = copyBufferToImage(*srcMemory, dstMemory, srcOrigin, origin, size, entire, rowPitch, slicePitch); - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); + if (pinned) { + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + 
gpu().xferWrite().Release(*srcMemory); + } } synchronize(); @@ -2054,14 +2044,12 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); } - Memory* gpuCB = dev().getGpuMemory(constantBuffer_); - if (gpuCB == NULL) { - return false; - } - void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); + Memory& gpuCB = gpu().xferWrite().Acquire(patternSize); + void* constBuf = gpuCB.map(&gpu(), Resource::NoWait); memcpy(constBuf, pattern, patternSize); - gpuCB->unmap(&gpu()); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); + gpuCB.unmap(&gpu()); + Memory* pGpuCB = &gpuCB; + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &pGpuCB); cl_ulong offset = origin[0]; if (dwordAligned) { patternSize /= sizeof(uint32_t); @@ -2077,6 +2065,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, // Execute the blit address parameters = kernels_[fillType]->parameters().values(); result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); + gpu().xferWrite().Release(gpuCB); } synchronize(); @@ -2137,12 +2126,10 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); // Program source origin cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; - ; setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); // Program destinaiton origin cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; - ; setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); cl_ulong copySize = size[0]; diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp index ece29da86d..fe52ac2a59 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp @@ -352,7 +352,7 @@ class 
KernelBlitManager : public DmaBlitManager { const void* data //!< Raw data pointer ) const; - virtual amd::Monitor* lockXfer() const { return lockXferOps_; } + virtual amd::Monitor* lockXfer() const { return &lockXferOps_; } private: static const size_t MaxXferBuffers = 2; @@ -397,10 +397,9 @@ class KernelBlitManager : public DmaBlitManager { amd::Program* program_; //!< GPU program obejct amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit - amd::Memory* constantBuffer_; //!< An internal CB for blits amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images size_t xferBufferSize_; //!< Transfer buffer size - amd::Monitor* lockXferOps_; //!< Lock transfer operation + mutable amd::Monitor lockXferOps_; //!< Lock transfer operation }; static const char* BlitName[KernelBlitManager::BlitTotal] = { diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp index b5a266fe43..bffa902e0a 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp @@ -19,7 +19,7 @@ ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size) , wrtAddress_(nullptr) {} // ================================================================================================ -ManagedBuffer::~ManagedBuffer() { +void ManagedBuffer::release() { for (auto it : buffers_) { if (it->data() != nullptr) { it->unmap(&gpu_); @@ -72,13 +72,26 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { *gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_; address cpu_address = wrtAddress_ + wrtOffset_; - + // Adjust the offset by the reserved size wrtOffset_ += count; return cpu_address; } +// ================================================================================================ +Memory& ManagedBuffer::reserveAtTheTop(uint32_t size) +{ + // Get the next buffer in the list + ++activeBuffer_; + activeBuffer_ %= 
MaxNumberOfBuffers; + // Make sure the buffer isn't busy + buffers_[activeBuffer_]->wait(gpu_); + wrtAddress_ = buffers_[activeBuffer_]->data(); + wrtOffset_ = 0; + return *buffers_[activeBuffer_]; +} + // ================================================================================================ ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size) : mbuf_(mbuf) @@ -114,11 +127,47 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const { // ================================================================================================ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const { + uint64_t vm_address; + address cpu_address = mbuf_.reserve(size, &vm_address); + // Update memory with new CB data + memcpy(cpu_address, sysmem, size); + return vm_address; +} + +// ================================================================================================ +XferBuffer::XferBuffer(ManagedBuffer& mbuf, uint32_t size) + : mbuf_(mbuf) + , size_(size) +{} + +// ================================================================================================ +Memory& XferBuffer::Acquire(uint32_t size) const +{ uint64_t vm_address; + // Reserve space in the managed buffer address cpu_address = mbuf_.reserve(size, &vm_address); - // Update memory with new CB data - memcpy(cpu_address, sysmem, size); - return vm_address; + // Create a view for access + Memory* mem = new Memory(mbuf_.gpu().dev(), static_cast<size_t>(size)); + Resource::ViewParams params = {}; + params.gpu_ = &mbuf_.gpu(); + params.offset_ = vm_address - mbuf_.vmAddress(); + params.size_ = size; + params.resource_ = mbuf_.activeMemory(); + if (nullptr == mem || !mem->create(Resource::View, &params)) { + delete mem; + // If the suballocation failed for some reason, then return the top of the active buffer + return mbuf_.reserveAtTheTop(size); + } + return *mem; +} + +//
================================================================================================ +void XferBuffer::Release(Memory& mem) const +{ + // Delete view + if (mem.desc().type_ == Resource::View) { + delete &mem; + } } } // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp index 5b6cb2af1b..5ab7d5d67f 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp @@ -9,19 +9,20 @@ namespace pal { //! Managed buffer (staging or constant) -class ManagedBuffer : public amd::HeapObject { +class ManagedBuffer : public amd::EmbeddedObject { public: //! Constructor for the ConstBuffer class ManagedBuffer(VirtualGPU& gpu, //!< Virtual GPU device object uint32_t size //!< size of the managed buffers in bytes ); + ~ManagedBuffer() {} - //! Destructor for the ConstBuffer class - ~ManagedBuffer(); - - //! Creates the real HW constant buffer + //! Creates the managed buffers bool create(Resource::MemoryType type); + //! Release the managed buffers + void release(); + /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW * * \return True if the data upload was succesful @@ -29,6 +30,9 @@ class ManagedBuffer : public amd::HeapObject { address reserve(uint32_t size, //!< real data size for upload uint64_t* gpu_address); + //! Reserves memory at the top of the active buffer + Memory& reserveAtTheTop(uint32_t size); + //! Returns CB size uint32_t size() const { return size_; } @@ -40,6 +44,9 @@ class ManagedBuffer : public amd::HeapObject { uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); } + //! Returns VirtualGPU object this managed resource associated + VirtualGPU& gpu() const { return gpu_; } + private: //! The maximum number of the managed buffers static constexpr uint32_t MaxNumberOfBuffers = 3; @@ -63,13 +70,13 @@ class ConstantBuffer : public amd::HeapObject { public: //! 
Constructor for the ConstBuffer class ConstantBuffer(ManagedBuffer& mbuf, //!< Managed buffer - uint32_t size + uint32_t size //!< Max size of the constant buffer ); //! Destructor for the ConstBuffer class ~ConstantBuffer(); - //! Creates the real HW constant buffer + //! Creates the HW constant buffer bool Create(); /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW @@ -88,7 +95,7 @@ public: ) const; //! Returns a pointer to the system memory copy for CB - address SysMemCopy(uint32_t size = 0) const { return sys_mem_copy_; } + address SysMemCopy() const { return sys_mem_copy_; } //! Returns active GPU buffer Memory* ActiveMemory() const { return mbuf_.activeMemory(); } @@ -105,4 +112,38 @@ private: uint32_t size_; //!< Constant buffer size }; +//! Staging buffer +class XferBuffer : public amd::EmbeddedObject { +public: + //! Constructor for the XferBuffer class + XferBuffer(ManagedBuffer& mbuf, //!< Managed buffer + uint32_t size //!< Maximum size of the transfer buffer + ); + + //! Destructor for the XferBuffer class + ~XferBuffer() {} + + /*! \brief Acquires free memory from the managed buffer + * + * \return GPU memory object associated with free memory + */ + Memory& Acquire(uint32_t size //!< data size for transfers + ) const; + + //! Releases memory object used in the staging transfer + void Release(Memory& mem //!< Memory object for release + ) const; + + size_t MaxSize() const { return static_cast<size_t>(size_); } + +private: + //! Disable copy constructor + XferBuffer(const XferBuffer&) = delete; + + //!
Disable operator= + XferBuffer& operator=(const XferBuffer&) = delete; + + ManagedBuffer& mbuf_; //!< Managed buffer on GPU + uint32_t size_; //!< Max staging buffer size +}; /*@}*/} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index 7d25d22ae6..61bdd6e91c 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -681,7 +681,6 @@ Device::Device() scratchAlloc_(nullptr), mapCacheOps_(nullptr), xferRead_(nullptr), - xferWrite_(nullptr), mapCache_(nullptr), resourceCache_(nullptr), numComputeEngines_(0), @@ -732,7 +731,6 @@ Device::~Device() { // Destroy temporary buffers for read/write delete xferRead_; - delete xferWrite_; // Destroy resource cache delete resourceCache_; @@ -986,21 +984,6 @@ bool Device::initializeHeapResources() { } if (settings().stagedXferSize_ != 0) { - // Initialize staged write buffers - if (settings().stagedXferWrite_) { - Resource::MemoryType type; - if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { - type = Resource::Persistent; - } else { - type = Resource::RemoteUSWC; - } - xferWrite_ = new XferBuffers(*this, type, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferWrite_ == nullptr) || !xferWrite_->create()) { - LogError("Couldn't allocate transfer buffer objects for read"); - return false; - } - } - // Initialize staged read buffers if (settings().stagedXferRead_) { xferRead_ = new XferBuffers(*this, Resource::Remote, diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp index da4dbdcae9..0d622dae45 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp @@ -396,9 +396,6 @@ class Device : public NullDevice { pal::Memory* createScratchBuffer(size_t size //!< Size of buffer ) const; - //!
Returns transfer buffer object - XferBuffers& xferWrite() const { return *xferWrite_; } - //! Returns transfer buffer object XferBuffers& xferRead() const { return *xferRead_; } @@ -588,7 +585,6 @@ class Device : public NullDevice { amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources XferBuffers* xferRead_; //!< Transfer buffers read - XferBuffers* xferWrite_; //!< Transfer buffers write std::vector* mapCache_; //!< Map cache info structure ResourceCache* resourceCache_; //!< Resource cache uint numComputeEngines_; //!< The number of available compute engines diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index ec219fb011..fd0008264b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -86,19 +86,19 @@ void Segment::copy(size_t offset, const void* src, size_t size) { if (cpuAccess_ != nullptr) { amd::Os::fastMemcpy(cpuAddress(offset), src, size); } else { + amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer()); VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); - Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire(); + Memory& xferBuf = gpu.xferWrite().Acquire(size); size_t tmpSize = std::min(static_cast(xferBuf.size()), size); size_t srcOffs = 0; while (size != 0) { - amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer()); xferBuf.hostWrite(&gpu, reinterpret_cast(src) + srcOffs, 0, tmpSize); xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true); size -= tmpSize; srcOffs += tmpSize; tmpSize = std::min(static_cast(xferBuf.size()), size); } - gpuAccess_->dev().xferWrite().release(gpu, xferBuf); + gpu.xferWrite().Release(xferBuf); gpu.waitAllEngines(); } } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp 
index 7c41dabc0a..7b69803c4e 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -1046,8 +1046,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) { uint viewFlags = 0; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); - // Set the initial offset value for any resource to 0. - // Note: Runtime can call create() more than once, if the initial memory type failed + // Set the initial offset value for any resource to 0. + // Note: Runtime can call create() more than once, if the initial memory type failed offset_ = 0; // This is a thread safe operation @@ -1096,7 +1096,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) { if (!desc_.buffer_) { return CreateImage(params); } - + if (memoryType() == Pinned) { return CreatePinned(params); } @@ -1112,6 +1112,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) { offset_ += viewOwner_->offset(); if (viewOwner_->data() != nullptr) { address_ = viewOwner_->data() + view->offset_; + mapCount_++; } memRef_ = viewOwner_->memRef_; memRef_->retain(); @@ -1177,11 +1178,6 @@ void Resource::free() return; } - // Sanity check for the map calls - if ((mapCount_ != 0) && (memoryType() != Remote) && - (memoryType() != RemoteUSWC) && (memoryType() != Persistent)) { - LogWarning("Resource wasn't unlocked, but destroyed!"); - } const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View); @@ -1206,7 +1202,7 @@ void Resource::free() // Destroy PAL resource if (iMem() != 0) { - if (mapCount_ != 0) { + if (mapCount_ != 0 && wait) { if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) { //! 
@note: This is a workaround for bad applications that don't unmap memory unmap(nullptr); @@ -1738,6 +1734,7 @@ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers address_ = reinterpret_cast(memRef_->cpuAddress_) + subOffset_; } else { address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem()); + address_ = reinterpret_cast
(address_) + offset_; } if (address_ == nullptr) { LogError("cal::ResMap failed!"); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 61a40def05..01c81917fa 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -542,23 +542,6 @@ bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint return cbReady; } -void VirtualGPU::addXferWrite(Memory& memory) { - if (xferWriteBuffers_.size() > 7) { - dev().xferWrite().release(*this, *xferWriteBuffers_.front()); - xferWriteBuffers_.erase(xferWriteBuffers_.begin()); - } - - // Delay destruction - xferWriteBuffers_.push_back(&memory); -} - -void VirtualGPU::releaseXferWrite() { - for (auto& memory : xferWriteBuffers_) { - dev().xferWrite().release(*this, *memory); - } - xferWriteBuffers_.resize(0); -} - void VirtualGPU::addPinnedMem(amd::Memory* mem) { if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { @@ -718,7 +701,8 @@ VirtualGPU::VirtualGPU(Device& device) printfDbgHSA_(nullptr), tsCache_(nullptr), dmaFlushMgmt_(device), - writeBuffer_(nullptr), + managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki), + writeBuffer_(managedBuffer_, device.settings().stagedXferSize_), hwRing_(0), readjustTimeGPU_(0), lastTS_(nullptr), @@ -834,10 +818,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, Unimplemented(); } - writeBuffer_ = new ManagedBuffer(*this, dev().settings().stagedXferSize_); - if ((writeBuffer_ == nullptr) || !writeBuffer_->create(Resource::RemoteUSWC)) { - // We failed to create a constant buffer - delete writeBuffer_; + if (!managedBuffer_.create(Resource::RemoteUSWC)) { return false; } @@ -963,7 +944,7 @@ VirtualGPU::~VirtualGPU() { delete constBufs_[i]; } - delete writeBuffer_; + managedBuffer_.release(); //! 
@todo Temporarily keep the buffer mapped for debug purpose if (nullptr != schedParams_) { @@ -2758,9 +2739,6 @@ bool VirtualGPU::waitAllEngines(CommandBatch* cb) { earlyDone &= isDone(&events[i]); } - // Release all transfer buffers on this command queue - releaseXferWrite(); - // Rlease all pinned memory releasePinnedMem(); @@ -2813,14 +2791,14 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { } bool VirtualGPU::allocConstantBuffers() { - // Allocate constant buffers. + // Allocate constant buffers. // Use double size, reported to the app to account for internal arguments const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_; uint i; // Create/reallocate constant buffer resources for (i = 0; i < MaxConstBuffersArguments; ++i) { - ConstantBuffer* constBuf = new ConstantBuffer(*writeBuffer_, MinCbSize); + ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize); if ((constBuf != nullptr) && constBuf->Create()) { addConstBuffer(constBuf); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 04b4facc1f..71ca26746c 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -375,8 +375,8 @@ class VirtualGPU : public device::VirtualDevice { bool pfpaDoppCmd //!< is a submission for the pre-present primary ); - //! Adds a stage write buffer into a list - void addXferWrite(Memory& memory); + //! Return xfer buffer for staging operations + const XferBuffer& xferWrite() const { return writeBuffer_; } //! Adds a pinned memory object into a map void addPinnedMem(amd::Memory* mem); @@ -518,9 +518,6 @@ class VirtualGPU : public device::VirtualDevice { //! Allocates constant buffers bool allocConstantBuffers(); - //! Releases stage write buffers - void releaseXferWrite(); - //! 
Allocate hsaQueueMem_ bool allocHsaQueueMem(); @@ -594,11 +591,11 @@ class VirtualGPU : public device::VirtualDevice { DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management - std::vector xferWriteBuffers_; //!< Stage write buffers std::vector pinnedMems_; //!< Pinned memory list - ManagedBuffer* writeBuffer_; //!< Managed write buffer - constbufs_t constBufs_; //!< constant buffers + ManagedBuffer managedBuffer_; //!< Managed write buffer + constbufs_t constBufs_; //!< constant buffers + XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads typedef std::queue CommandBatchQueue; CommandBatchQueue cbQueue_; //!< Queue of command batches