From dd4d5dcb9453399dddeeae42f09cd7f25fecec01 Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 26 Apr 2018 11:13:29 -0400 Subject: [PATCH] P4 to Git Change 1546657 by gandryey@gera-w8 on 2018/04/26 10:59:34 SWDEV-151739 - [CQE OCL][DTB][Perf][QR][DTB-BLOCKER][VEGA10] Upto 18% performance drop observed while running Video Composition test sub test of Compubench due to faulty CL#1544622 - Implement customized TS tracking for managed buffers. The common TS tracking mechanism saves the event of the last command, assuming SDMA and compute operations occur in order, but for managed buffers it's not the case. Also managed buffer doesn't have to validate TS for the parent resource. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#6 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#22 edit [ROCm/clr commit: 79ba5904dc09d96a186b72b001ade3de4f8f1ab8] --- .../clr/rocclr/runtime/device/pal/palblit.cpp | 2 +- .../rocclr/runtime/device/pal/palconstbuf.cpp | 74 +++++++++---------- .../rocclr/runtime/device/pal/palconstbuf.hpp | 18 +++-- .../rocclr/runtime/device/pal/palmemory.hpp | 1 + .../rocclr/runtime/device/pal/palresource.hpp | 1 + 5 files changed, 52 insertions(+), 44 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index e392c018f2..542028f300 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -267,7 +267,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M chunkSize = gpu().xferWrite().MaxSize(); } else { chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize()); - chunkSize = std::max(chunkSize, 128 * Ki); + chunkSize = std::max(chunkSize, 64 * Ki); bool flushDMA = true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp index c0dfd536e2..ccd6dfb583 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp @@ -11,32 +11,34 @@ namespace pal { // ================================================================================================ ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size) - : gpu_(gpu) - , buffers_(MaxNumberOfBuffers) - , activeBuffer_(0) - , size_(size) - , wrtOffset_(0) - , wrtAddress_(nullptr) {} + : gpu_(gpu) + , pool_(MaxNumberOfBuffers) + , activeBuffer_(0) + , size_(size) + , wrtOffset_(0) + , wrtAddress_(nullptr) {} // ================================================================================================ void ManagedBuffer::release() { - for (auto it : buffers_) { - if ((it 
!= nullptr) && (it->data() != nullptr)) { - it->unmap(&gpu_); + for (auto it : pool_) { + if ((it.buf != nullptr) && (it.buf->data() != nullptr)) { + it.buf->unmap(&gpu_); } - delete it; + delete it.buf; } } // ================================================================================================ bool ManagedBuffer::create(Resource::MemoryType type) { - for (uint i = 0; i < buffers_.size(); ++i) { - buffers_[i] = new Memory(const_cast(gpu_.dev()), size_); - if (nullptr == buffers_[i] || !buffers_[i]->create(type)) { + for (uint i = 0; i < pool_.size(); ++i) { + pool_[i].buf = new Memory(const_cast(gpu_.dev()), size_); + if (nullptr == pool_[i].buf || !pool_[i].buf->create(type)) { LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); return false; } - void* wrtAddress = buffers_[i]->map(&gpu_); + // Assign virtual gpu to the allocation. Buffer will be used only on a particular queue + pool_[i].buf->memRef()->gpu_ = &gpu_; + void* wrtAddress = pool_[i].buf->map(&gpu_); if (wrtAddress == nullptr) { LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); return false; @@ -45,9 +47,9 @@ bool ManagedBuffer::create(Resource::MemoryType type) { uint dummy = 0; static constexpr bool Wait = true; // Write 0 for the buffer paging by VidMM - buffers_[i]->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait); + pool_[i].buf->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait); } - wrtAddress_ = buffers_[activeBuffer_]->data(); + wrtAddress_ = pool_[activeBuffer_].buf->data(); return true; } @@ -59,18 +61,22 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { // Align reserve size on the vector's boundary uint32_t count = amd::alignUp(size, MemAlignment); + // Save previous event + pinGpuEvent(); + // Check if buffer has enough space for reservation if ((wrtOffset_ + count) > size_) { // Get the next buffer in the list ++activeBuffer_; activeBuffer_ %= MaxNumberOfBuffers; // Make sure the buffer isn't busy - 
buffers_[activeBuffer_]->wait(gpu_); - wrtAddress_ = buffers_[activeBuffer_]->data(); + gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]); + gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]); + wrtAddress_ = pool_[activeBuffer_].buf->data(); wrtOffset_ = 0; } - *gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_; + *gpu_address = pool_[activeBuffer_].buf->vmAddress() + wrtOffset_; address cpu_address = wrtAddress_ + wrtOffset_; // Adjust the offset by the reserved size @@ -80,23 +86,17 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { } // ================================================================================================ -Memory& ManagedBuffer::reserveAtTheTop(uint32_t size) -{ - // Get the next buffer in the list - ++activeBuffer_; - activeBuffer_ %= MaxNumberOfBuffers; - // Make sure the buffer isn't busy - buffers_[activeBuffer_]->wait(gpu_); - wrtAddress_ = buffers_[activeBuffer_]->data(); - wrtOffset_ = 0; - return *buffers_[activeBuffer_]; +void ManagedBuffer::pinGpuEvent() { + GpuEvent* event = activeMemory()->getGpuEvent(gpu()); + pool_[activeBuffer_].events[event->engineId_] = *event; + activeMemory()->setBusy(gpu(), GpuEvent::InvalidID); } // ================================================================================================ ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size) - : mbuf_(mbuf) - , sys_mem_copy_(nullptr) - , size_(size) + : mbuf_(mbuf) + , sys_mem_copy_(nullptr) + , size_(size) {} // ================================================================================================ @@ -127,11 +127,11 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const { // ================================================================================================ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const { - uint64_t vm_address; - address cpu_address = mbuf_.reserve(size, &vm_address); - // Update memory 
with new CB data - memcpy(cpu_address, sysmem, size); - return vm_address; + uint64_t vm_address; + address cpu_address = mbuf_.reserve(size, &vm_address); + // Update memory with new CB data + memcpy(cpu_address, sysmem, size); + return vm_address; } // ================================================================================================ diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp index da1984f636..c1853b0537 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp @@ -30,9 +30,6 @@ class ManagedBuffer : public amd::EmbeddedObject { address reserve(uint32_t size, //!< real data size for upload uint64_t* gpu_address); - //! Reserves memory at the top of the active buffer - Memory& reserveAtTheTop(uint32_t size); - //! Returns CB size uint32_t size() const { return size_; } @@ -40,14 +37,23 @@ class ManagedBuffer : public amd::EmbeddedObject { uint32_t wrtOffset() const { return wrtOffset_; } //! Returns active GPU buffer - Memory* activeMemory() const { return buffers_[activeBuffer_]; } + Memory* activeMemory() const { return pool_[activeBuffer_].buf; } - uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); } + //! Returns VM address for the active buffer + uint64_t vmAddress() const { return pool_[activeBuffer_].buf->vmAddress(); } + + //! Update the timestamp for the HW operation + void pinGpuEvent(); //! Returns VirtualGPU object this managed resource associated VirtualGPU& gpu() const { return gpu_; } private: + struct TimeStampedBuffer { + Memory* buf; + GpuEvent events[AllEngines]; + }; + //!
The maximum number of the managed buffers static constexpr uint32_t MaxNumberOfBuffers = 3; @@ -58,7 +64,7 @@ class ManagedBuffer : public amd::EmbeddedObject { ManagedBuffer& operator=(const ManagedBuffer&) = delete; VirtualGPU& gpu_; //!< Virtual GPU object - std::vector buffers_; //!< Buffers for management + std::vector pool_; //!< Buffers for management uint32_t activeBuffer_; //!< Current active buffer uint32_t size_; //!< Constant buffer size uint32_t wrtOffset_; //!< Current write offset diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp index 6825c8b8c5..fbb95bcc80 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp @@ -145,6 +145,7 @@ class Memory : public device::Memory, public Resource { //! Quick view update for managed buffers. It should avoid expensive object allocations void updateView(Resource* view, size_t offset, size_t size) { size_ = size; + flags_ |= HostMemoryDirectAccess; Resource::updateView(view, offset, size); } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp index a838f2673a..3ae73d7ec6 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -378,6 +378,7 @@ class Resource : public amd::HeapObject { memRef_->retain(); desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + setBusy(*memRef()->gpu_, GpuEvent::InvalidID); } }