From caad6f5ccec769d6ea279a8daeff97a8189f329e Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 24 Aug 2017 17:45:19 -0400 Subject: [PATCH] P4 to Git Change 1451494 by gandryey@gera-w8 on 2017/08/24 17:34:58 SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% Performance drop observed on VEGA10 compared to FIJI while running BlackMagic Davinci Resolve - Remove some debug logic (resident_ field) from CL#1451293. It caused some CPU overhead - Use a template for waifForFence() to avoid some calls when unnecessary Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#53 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#29 edit --- rocclr/runtime/device/pal/palmemory.cpp | 1 - rocclr/runtime/device/pal/palresource.cpp | 11 ++++++----- rocclr/runtime/device/pal/palresource.hpp | 1 - rocclr/runtime/device/pal/palvirtual.cpp | 23 +++++++---------------- rocclr/runtime/device/pal/palvirtual.hpp | 15 +++++++++++---- 5 files changed, 24 insertions(+), 27 deletions(-) diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp index f417e072fc..6522677163 100644 --- a/rocclr/runtime/device/pal/palmemory.cpp +++ b/rocclr/runtime/device/pal/palmemory.cpp @@ -170,7 +170,6 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params } if (result) { - dev().addResource(memRef()); if (params != nullptr) { memRef()->gpu_ = params->gpu_; } diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp index 9c75bd3553..9448a1825e 100644 --- a/rocclr/runtime/device/pal/palresource.cpp +++ b/rocclr/runtime/device/pal/palresource.cpp @@ -46,6 +46,7 @@ GpuMemoryReference* 
GpuMemoryReference::Create(const Device& dev, } // Update free memory size counters const_cast(dev).updateFreeMemory(createInfo.heaps[0], createInfo.size, false); + dev.addResource(memRef); return memRef; } @@ -68,6 +69,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, } // Update free memory size counters const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); + dev.addResource(memRef); return memRef; } @@ -90,6 +92,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, // Update free memory size counters const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); + dev.addResource(memRef); return memRef; } @@ -111,6 +114,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, return nullptr; } } + dev.addResource(memRef); return memRef; } @@ -137,11 +141,12 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, return nullptr; } } + dev.addResource(memRef); return memRef; } GpuMemoryReference::GpuMemoryReference(const Device& dev) - : gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr), resident_(0) {} + : gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr) {} GpuMemoryReference::~GpuMemoryReference() { if (gpu_ == nullptr) { @@ -162,10 +167,6 @@ GpuMemoryReference::~GpuMemoryReference() { device_.vgpus()[0]->releaseMemory(this, &events_[0]); } - if (resident_ != 0) { - LogError("Residency counter isn't 0 on memory destroy!"); - } - { amd::ScopedLock lk(device_.lockPAL()); if (cpuAddress_ != nullptr) { diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp index 36156683af..e7f91440fd 100644 --- a/rocclr/runtime/device/pal/palresource.hpp +++ b/rocclr/runtime/device/pal/palresource.hpp @@ -51,7 +51,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject { //! 
@note: This field is necessary for the thread safe release only VirtualGPU* gpu_; //!< Resource will be used only on this queue std::vector events_; //!< GPU events associated with the resource - std::atomic resident_; //!< Atomic counter for residency protected: //! Default destructor diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index aea220fe4e..29d2bd0c63 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -163,7 +163,6 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) { palSdiRefs_.push_back(iMem); } residency_size_ += iMem->Desc().size; - mem->resident_++; } } @@ -172,7 +171,6 @@ void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) { if (0 != memReferences_.erase(mem)) { iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); residency_size_ -= iMem->Desc().size; - mem->resident_--; } } @@ -222,13 +220,6 @@ bool VirtualGPU::Queue::flush() { return false; } - // Validate resources - for (auto it : memReferences_) { - if (it.second == cmdBufIdSlot_) { - assert(it.first->resident_ > 0 && "Unresident resource!"); - } - } - if (palMemRefs_.size() != 0) { if (Pal::Result::Success != iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_, @@ -259,8 +250,10 @@ bool VirtualGPU::Queue::flush() { LogError("PAL failed to submit CMD!"); return false; } + // Make sure the slot isn't busy + constexpr bool IbReuse = true; if (GPU_FLUSH_ON_EXECUTION) { - waifForFence(cmdBufIdSlot_); + waifForFence(cmdBufIdSlot_); } // Reset the counter of commands @@ -271,7 +264,7 @@ bool VirtualGPU::Queue::flush() { if (cmdBufIdCurrent_ == GpuEvent::InvalidID) { // Wait for the last one - waifForFence(cmdBufIdSlot_); + waifForFence(cmdBufIdSlot_); cmdBufIdCurrent_ = 1; cmbBufIdRetired_ = 0; } @@ -279,9 +272,7 @@ bool VirtualGPU::Queue::flush() { // Wrap current slot cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers; - // Make sure the slot isn't 
busy - constexpr bool IbReuse = true; - waifForFence(cmdBufIdSlot_, IbReuse); + waifForFence(cmdBufIdSlot_); // Progress retired TS if ((cmdBufIdCurrent_ > MaxCmdBuffers) && @@ -312,7 +303,6 @@ bool VirtualGPU::Queue::flush() { if (it->second == cmdBufIdSlot_) { palMems_.push_back(it->first->iMem()); residency_size_ -= it->first->iMem()->Desc().size; - it->first->resident_--; it = memReferences_.erase(it); } else { ++it; @@ -333,7 +323,8 @@ bool VirtualGPU::Queue::waitForEvent(uint id) { } uint slotId = id % MaxCmdBuffers; - bool result = waifForFence(slotId); + constexpr bool IbReuse = true; + bool result = waifForFence(slotId); cmbBufIdRetired_ = id; return result; } diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 19f93202da..15c375a23b 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -96,15 +96,22 @@ class VirtualGPU : public device::VirtualDevice { } // ibReuse forces event wait without polling, to make sure event occured - bool waifForFence(uint cbId, bool ibReuse = false) const { + template + bool waifForFence(uint cbId) const { Pal::Result result = Pal::Result::Success; - uint64_t start = amd::Os::timeNanos(); - while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) { + uint64_t start; + uint64_t end; + if (!ibReuse) { + start = amd::Os::timeNanos(); + } + while (ibReuse || (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus()))) { if (result == Pal::Result::ErrorFenceNeverSubmitted) { result = Pal::Result::Success; break; } - uint64_t end = amd::Os::timeNanos(); + if (!ibReuse) { + end = amd::Os::timeNanos(); + } if (!ibReuse && ((end - start) < PollIntervalInNsec)) { amd::Os::yield(); continue;