P4 to Git Change 1546657 by gandryey@gera-w8 on 2018/04/26 10:59:34

SWDEV-151739 - [CQE OCL][DTB][Perf][QR][DTB-BLOCKER][VEGA10] Up to 18% performance drop observed while running the Video Composition sub-test of Compubench due to faulty CL#1544622
	- Implement customized timestamp (TS) tracking for managed buffers. The common TS tracking mechanism saves only the event of the last command, assuming that SDMA and compute operations occur in order, but that is not the case for managed buffers. Also, a managed buffer doesn't have to validate the TS of the parent resource.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#22 edit


[ROCm/clr commit: 79ba5904dc]
Этот коммит содержится в:
foreman
2018-04-26 11:13:29 -04:00
родитель 7412bc74b3
Коммит dd4d5dcb94
5 изменённых файлов: 52 добавлений и 44 удалений
+1 -1
Просмотреть файл
@@ -267,7 +267,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
chunkSize = gpu().xferWrite().MaxSize();
} else {
chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
chunkSize = std::max(chunkSize, 128 * Ki);
chunkSize = std::max(chunkSize, 64 * Ki);
bool flushDMA = true;
}
+37 -37
Просмотреть файл
@@ -11,32 +11,34 @@ namespace pal {
// ================================================================================================
ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
: gpu_(gpu)
, buffers_(MaxNumberOfBuffers)
, activeBuffer_(0)
, size_(size)
, wrtOffset_(0)
, wrtAddress_(nullptr) {}
: gpu_(gpu)
, pool_(MaxNumberOfBuffers)
, activeBuffer_(0)
, size_(size)
, wrtOffset_(0)
, wrtAddress_(nullptr) {}
// ================================================================================================
void ManagedBuffer::release() {
for (auto it : buffers_) {
if ((it != nullptr) && (it->data() != nullptr)) {
it->unmap(&gpu_);
for (auto it : pool_) {
if ((it.buf != nullptr) && (it.buf->data() != nullptr)) {
it.buf->unmap(&gpu_);
}
delete it;
delete it.buf;
}
}
// ================================================================================================
bool ManagedBuffer::create(Resource::MemoryType type) {
for (uint i = 0; i < buffers_.size(); ++i) {
buffers_[i] = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
if (nullptr == buffers_[i] || !buffers_[i]->create(type)) {
for (uint i = 0; i < pool_.size(); ++i) {
pool_[i].buf = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
if (nullptr == pool_[i].buf || !pool_[i].buf->create(type)) {
LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
return false;
}
void* wrtAddress = buffers_[i]->map(&gpu_);
// Assign virtual gpu to the allocation. Buffer will be used only on a particular queue
pool_[i].buf->memRef()->gpu_ = &gpu_;
void* wrtAddress = pool_[i].buf->map(&gpu_);
if (wrtAddress == nullptr) {
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
return false;
@@ -45,9 +47,9 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
uint dummy = 0;
static constexpr bool Wait = true;
// Write 0 for the buffer paging by VidMM
buffers_[i]->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
pool_[i].buf->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
}
wrtAddress_ = buffers_[activeBuffer_]->data();
wrtAddress_ = pool_[activeBuffer_].buf->data();
return true;
}
@@ -59,18 +61,22 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
// Align reserve size on the vector's boundary
uint32_t count = amd::alignUp(size, MemAlignment);
// Save previous event
pinGpuEvent();
// Check if buffer has enough space for reservation
if ((wrtOffset_ + count) > size_) {
// Get the next buffer in the list
++activeBuffer_;
activeBuffer_ %= MaxNumberOfBuffers;
// Make sure the buffer isn't busy
buffers_[activeBuffer_]->wait(gpu_);
wrtAddress_ = buffers_[activeBuffer_]->data();
gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]);
wrtAddress_ = pool_[activeBuffer_].buf->data();
wrtOffset_ = 0;
}
*gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_;
*gpu_address = pool_[activeBuffer_].buf->vmAddress() + wrtOffset_;
address cpu_address = wrtAddress_ + wrtOffset_;
// Adjust the offset by the reserved size
@@ -80,23 +86,17 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
}
// ================================================================================================
Memory& ManagedBuffer::reserveAtTheTop(uint32_t size)
{
// Get the next buffer in the list
++activeBuffer_;
activeBuffer_ %= MaxNumberOfBuffers;
// Make sure the buffer isn't busy
buffers_[activeBuffer_]->wait(gpu_);
wrtAddress_ = buffers_[activeBuffer_]->data();
wrtOffset_ = 0;
return *buffers_[activeBuffer_];
void ManagedBuffer::pinGpuEvent() {
GpuEvent* event = activeMemory()->getGpuEvent(gpu());
pool_[activeBuffer_].events[event->engineId_] = *event;
activeMemory()->setBusy(gpu(), GpuEvent::InvalidID);
}
// ================================================================================================
ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
: mbuf_(mbuf)
, sys_mem_copy_(nullptr)
, size_(size)
: mbuf_(mbuf)
, sys_mem_copy_(nullptr)
, size_(size)
{}
// ================================================================================================
@@ -127,11 +127,11 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
// ================================================================================================
uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sysmem, size);
return vm_address;
uint64_t vm_address;
address cpu_address = mbuf_.reserve(size, &vm_address);
// Update memory with new CB data
memcpy(cpu_address, sysmem, size);
return vm_address;
}
// ================================================================================================
+12 -6
Просмотреть файл
@@ -30,9 +30,6 @@ class ManagedBuffer : public amd::EmbeddedObject {
address reserve(uint32_t size, //!< real data size for upload
uint64_t* gpu_address);
//! Reserves memory at the top of the active buffer
Memory& reserveAtTheTop(uint32_t size);
//! Returns CB size
uint32_t size() const { return size_; }
@@ -40,14 +37,23 @@ class ManagedBuffer : public amd::EmbeddedObject {
uint32_t wrtOffset() const { return wrtOffset_; }
//! Returns active GPU buffer
Memory* activeMemory() const { return buffers_[activeBuffer_]; }
Memory* activeMemory() const { return pool_[activeBuffer_].buf; }
uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); }
//! Retruns VM address for the active buffer
uint64_t vmAddress() const { return pool_[activeBuffer_].buf->vmAddress(); }
//! Update the timestamp for the HW operation
void pinGpuEvent();
//! Returns VirtualGPU object this managed resource associated
VirtualGPU& gpu() const { return gpu_; }
private:
struct TimeStampedBuffer {
Memory* buf;
GpuEvent events[AllEngines];
};
//! The maximum number of the managed buffers
static constexpr uint32_t MaxNumberOfBuffers = 3;
@@ -58,7 +64,7 @@ class ManagedBuffer : public amd::EmbeddedObject {
ManagedBuffer& operator=(const ManagedBuffer&) = delete;
VirtualGPU& gpu_; //!< Virtual GPU object
std::vector<Memory*> buffers_; //!< Buffers for management
std::vector<TimeStampedBuffer> pool_; //!< Buffers for management
uint32_t activeBuffer_; //!< Current active buffer
uint32_t size_; //!< Constant buffer size
uint32_t wrtOffset_; //!< Current write offset
+1
Просмотреть файл
@@ -145,6 +145,7 @@ class Memory : public device::Memory, public Resource {
//! Quick view update for managed buffers. It should avoid expensive object allocations
void updateView(Resource* view, size_t offset, size_t size) {
size_ = size;
flags_ |= HostMemoryDirectAccess;
Resource::updateView(view, offset, size);
}
+1
Просмотреть файл
@@ -378,6 +378,7 @@ class Resource : public amd::HeapObject {
memRef_->retain();
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
}
}