P4 to Git Change 1451494 by gandryey@gera-w8 on 2017/08/24 17:34:58

SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% performance drop observed on VEGA10 compared to FIJI while running Blackmagic DaVinci Resolve
	- Remove the debug-only residency tracking (the resident_ field) introduced in CL#1451293; it added CPU overhead.
	- Make waifForFence() a template on ibReuse so that the timer reads and the fence status query are skipped when they are not needed.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#29 edit
Tento commit je obsažen v:
foreman
2017-08-24 17:45:19 -04:00
rodič ca5f30aa01
revize caad6f5cce
5 změnil soubory, kde provedl 24 přidání a 27 odebrání
-1
Zobrazit soubor
@@ -170,7 +170,6 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
}
if (result) {
dev().addResource(memRef());
if (params != nullptr) {
memRef()->gpu_ = params->gpu_;
}
+6 -5
Zobrazit soubor
@@ -46,6 +46,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
}
// Update free memory size counters
const_cast<Device&>(dev).updateFreeMemory(createInfo.heaps[0], createInfo.size, false);
dev.addResource(memRef);
return memRef;
}
@@ -68,6 +69,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
}
// Update free memory size counters
const_cast<Device&>(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false);
dev.addResource(memRef);
return memRef;
}
@@ -90,6 +92,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
// Update free memory size counters
const_cast<Device&>(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size,
false);
dev.addResource(memRef);
return memRef;
}
@@ -111,6 +114,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
return nullptr;
}
}
dev.addResource(memRef);
return memRef;
}
@@ -137,11 +141,12 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
return nullptr;
}
}
dev.addResource(memRef);
return memRef;
}
GpuMemoryReference::GpuMemoryReference(const Device& dev)
: gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr), resident_(0) {}
: gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr) {}
GpuMemoryReference::~GpuMemoryReference() {
if (gpu_ == nullptr) {
@@ -162,10 +167,6 @@ GpuMemoryReference::~GpuMemoryReference() {
device_.vgpus()[0]->releaseMemory(this, &events_[0]);
}
if (resident_ != 0) {
LogError("Residency counter isn't 0 on memory destroy!");
}
{
amd::ScopedLock lk(device_.lockPAL());
if (cpuAddress_ != nullptr) {
-1
Zobrazit soubor
@@ -51,7 +51,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
//! @note: This field is necessary for the thread safe release only
VirtualGPU* gpu_; //!< Resource will be used only on this queue
std::vector<GpuEvent> events_; //!< GPU events associated with the resource
std::atomic<int> resident_; //!< Atomic counter for residency
protected:
//! Default destructor
+7 -16
Zobrazit soubor
@@ -163,7 +163,6 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
palSdiRefs_.push_back(iMem);
}
residency_size_ += iMem->Desc().size;
mem->resident_++;
}
}
@@ -172,7 +171,6 @@ void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
if (0 != memReferences_.erase(mem)) {
iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
residency_size_ -= iMem->Desc().size;
mem->resident_--;
}
}
@@ -222,13 +220,6 @@ bool VirtualGPU::Queue::flush() {
return false;
}
// Validate resources
for (auto it : memReferences_) {
if (it.second == cmdBufIdSlot_) {
assert(it.first->resident_ > 0 && "Unresident resource!");
}
}
if (palMemRefs_.size() != 0) {
if (Pal::Result::Success !=
iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
@@ -259,8 +250,10 @@ bool VirtualGPU::Queue::flush() {
LogError("PAL failed to submit CMD!");
return false;
}
// Make sure the slot isn't busy
constexpr bool IbReuse = true;
if (GPU_FLUSH_ON_EXECUTION) {
waifForFence(cmdBufIdSlot_);
waifForFence<!IbReuse>(cmdBufIdSlot_);
}
// Reset the counter of commands
@@ -271,7 +264,7 @@ bool VirtualGPU::Queue::flush() {
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
// Wait for the last one
waifForFence(cmdBufIdSlot_);
waifForFence<!IbReuse>(cmdBufIdSlot_);
cmdBufIdCurrent_ = 1;
cmbBufIdRetired_ = 0;
}
@@ -279,9 +272,7 @@ bool VirtualGPU::Queue::flush() {
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
// Make sure the slot isn't busy
constexpr bool IbReuse = true;
waifForFence(cmdBufIdSlot_, IbReuse);
waifForFence<IbReuse>(cmdBufIdSlot_);
// Progress retired TS
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
@@ -312,7 +303,6 @@ bool VirtualGPU::Queue::flush() {
if (it->second == cmdBufIdSlot_) {
palMems_.push_back(it->first->iMem());
residency_size_ -= it->first->iMem()->Desc().size;
it->first->resident_--;
it = memReferences_.erase(it);
} else {
++it;
@@ -333,7 +323,8 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
}
uint slotId = id % MaxCmdBuffers;
bool result = waifForFence(slotId);
constexpr bool IbReuse = true;
bool result = waifForFence<!IbReuse>(slotId);
cmbBufIdRetired_ = id;
return result;
}
+11 -4
Zobrazit soubor
@@ -96,15 +96,22 @@ class VirtualGPU : public device::VirtualDevice {
}
// ibReuse forces event wait without polling, to make sure event occured
bool waifForFence(uint cbId, bool ibReuse = false) const {
template <bool ibReuse>
bool waifForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
uint64_t start = amd::Os::timeNanos();
while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) {
uint64_t start;
uint64_t end;
if (!ibReuse) {
start = amd::Os::timeNanos();
}
while (ibReuse || (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus()))) {
if (result == Pal::Result::ErrorFenceNeverSubmitted) {
result = Pal::Result::Success;
break;
}
uint64_t end = amd::Os::timeNanos();
if (!ibReuse) {
end = amd::Os::timeNanos();
}
if (!ibReuse && ((end - start) < PollIntervalInNsec)) {
amd::Os::yield();
continue;