P4 to Git Change 1451494 by gandryey@gera-w8 on 2017/08/24 17:34:58
SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% performance drop observed on VEGA10 compared to FIJI while running BlackMagic Davinci Resolve - Remove some debug logic (resident_ field) from CL#1451293. It caused some CPU overhead - Use a template for waifForFence() to avoid some calls when unnecessary Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#53 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#29 edit
Tento commit je obsažen v:
@@ -170,7 +170,6 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
|
||||
}
|
||||
|
||||
if (result) {
|
||||
dev().addResource(memRef());
|
||||
if (params != nullptr) {
|
||||
memRef()->gpu_ = params->gpu_;
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
}
|
||||
// Update free memory size counters
|
||||
const_cast<Device&>(dev).updateFreeMemory(createInfo.heaps[0], createInfo.size, false);
|
||||
dev.addResource(memRef);
|
||||
return memRef;
|
||||
}
|
||||
|
||||
@@ -68,6 +69,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
}
|
||||
// Update free memory size counters
|
||||
const_cast<Device&>(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false);
|
||||
dev.addResource(memRef);
|
||||
return memRef;
|
||||
}
|
||||
|
||||
@@ -90,6 +92,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
// Update free memory size counters
|
||||
const_cast<Device&>(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size,
|
||||
false);
|
||||
dev.addResource(memRef);
|
||||
return memRef;
|
||||
}
|
||||
|
||||
@@ -111,6 +114,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
dev.addResource(memRef);
|
||||
return memRef;
|
||||
}
|
||||
|
||||
@@ -137,11 +141,12 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
dev.addResource(memRef);
|
||||
return memRef;
|
||||
}
|
||||
|
||||
GpuMemoryReference::GpuMemoryReference(const Device& dev)
|
||||
: gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr), resident_(0) {}
|
||||
: gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr) {}
|
||||
|
||||
GpuMemoryReference::~GpuMemoryReference() {
|
||||
if (gpu_ == nullptr) {
|
||||
@@ -162,10 +167,6 @@ GpuMemoryReference::~GpuMemoryReference() {
|
||||
device_.vgpus()[0]->releaseMemory(this, &events_[0]);
|
||||
}
|
||||
|
||||
if (resident_ != 0) {
|
||||
LogError("Residency counter isn't 0 on memory destroy!");
|
||||
}
|
||||
|
||||
{
|
||||
amd::ScopedLock lk(device_.lockPAL());
|
||||
if (cpuAddress_ != nullptr) {
|
||||
|
||||
@@ -51,7 +51,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
|
||||
//! @note: This field is necessary for the thread safe release only
|
||||
VirtualGPU* gpu_; //!< Resource will be used only on this queue
|
||||
std::vector<GpuEvent> events_; //!< GPU events associated with the resource
|
||||
std::atomic<int> resident_; //!< Atomic counter for residency
|
||||
|
||||
protected:
|
||||
//! Default destructor
|
||||
|
||||
@@ -163,7 +163,6 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
|
||||
palSdiRefs_.push_back(iMem);
|
||||
}
|
||||
residency_size_ += iMem->Desc().size;
|
||||
mem->resident_++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,7 +171,6 @@ void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
|
||||
if (0 != memReferences_.erase(mem)) {
|
||||
iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
|
||||
residency_size_ -= iMem->Desc().size;
|
||||
mem->resident_--;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -222,13 +220,6 @@ bool VirtualGPU::Queue::flush() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate resources
|
||||
for (auto it : memReferences_) {
|
||||
if (it.second == cmdBufIdSlot_) {
|
||||
assert(it.first->resident_ > 0 && "Unresident resource!");
|
||||
}
|
||||
}
|
||||
|
||||
if (palMemRefs_.size() != 0) {
|
||||
if (Pal::Result::Success !=
|
||||
iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
|
||||
@@ -259,8 +250,10 @@ bool VirtualGPU::Queue::flush() {
|
||||
LogError("PAL failed to submit CMD!");
|
||||
return false;
|
||||
}
|
||||
// Make sure the slot isn't busy
|
||||
constexpr bool IbReuse = true;
|
||||
if (GPU_FLUSH_ON_EXECUTION) {
|
||||
waifForFence(cmdBufIdSlot_);
|
||||
waifForFence<!IbReuse>(cmdBufIdSlot_);
|
||||
}
|
||||
|
||||
// Reset the counter of commands
|
||||
@@ -271,7 +264,7 @@ bool VirtualGPU::Queue::flush() {
|
||||
|
||||
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
|
||||
// Wait for the last one
|
||||
waifForFence(cmdBufIdSlot_);
|
||||
waifForFence<!IbReuse>(cmdBufIdSlot_);
|
||||
cmdBufIdCurrent_ = 1;
|
||||
cmbBufIdRetired_ = 0;
|
||||
}
|
||||
@@ -279,9 +272,7 @@ bool VirtualGPU::Queue::flush() {
|
||||
// Wrap current slot
|
||||
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
|
||||
|
||||
// Make sure the slot isn't busy
|
||||
constexpr bool IbReuse = true;
|
||||
waifForFence(cmdBufIdSlot_, IbReuse);
|
||||
waifForFence<IbReuse>(cmdBufIdSlot_);
|
||||
|
||||
// Progress retired TS
|
||||
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
|
||||
@@ -312,7 +303,6 @@ bool VirtualGPU::Queue::flush() {
|
||||
if (it->second == cmdBufIdSlot_) {
|
||||
palMems_.push_back(it->first->iMem());
|
||||
residency_size_ -= it->first->iMem()->Desc().size;
|
||||
it->first->resident_--;
|
||||
it = memReferences_.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
@@ -333,7 +323,8 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
|
||||
}
|
||||
|
||||
uint slotId = id % MaxCmdBuffers;
|
||||
bool result = waifForFence(slotId);
|
||||
constexpr bool IbReuse = true;
|
||||
bool result = waifForFence<!IbReuse>(slotId);
|
||||
cmbBufIdRetired_ = id;
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -96,15 +96,22 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
}
|
||||
|
||||
// ibReuse forces event wait without polling, to make sure event occurred
|
||||
bool waifForFence(uint cbId, bool ibReuse = false) const {
|
||||
template <bool ibReuse>
|
||||
bool waifForFence(uint cbId) const {
|
||||
Pal::Result result = Pal::Result::Success;
|
||||
uint64_t start = amd::Os::timeNanos();
|
||||
while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) {
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
if (!ibReuse) {
|
||||
start = amd::Os::timeNanos();
|
||||
}
|
||||
while (ibReuse || (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus()))) {
|
||||
if (result == Pal::Result::ErrorFenceNeverSubmitted) {
|
||||
result = Pal::Result::Success;
|
||||
break;
|
||||
}
|
||||
uint64_t end = amd::Os::timeNanos();
|
||||
if (!ibReuse) {
|
||||
end = amd::Os::timeNanos();
|
||||
}
|
||||
if (!ibReuse && ((end - start) < PollIntervalInNsec)) {
|
||||
amd::Os::yield();
|
||||
continue;
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele