diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index e392c018f2..542028f300 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -267,7 +267,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M chunkSize = gpu().xferWrite().MaxSize(); } else { chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize()); - chunkSize = std::max(chunkSize, 128 * Ki); + chunkSize = std::max(chunkSize, 64 * Ki); bool flushDMA = true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp index c0dfd536e2..ccd6dfb583 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp @@ -11,32 +11,34 @@ namespace pal { // ================================================================================================ ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size) - : gpu_(gpu) - , buffers_(MaxNumberOfBuffers) - , activeBuffer_(0) - , size_(size) - , wrtOffset_(0) - , wrtAddress_(nullptr) {} + : gpu_(gpu) + , pool_(MaxNumberOfBuffers) + , activeBuffer_(0) + , size_(size) + , wrtOffset_(0) + , wrtAddress_(nullptr) {} // ================================================================================================ void ManagedBuffer::release() { - for (auto it : buffers_) { - if ((it != nullptr) && (it->data() != nullptr)) { - it->unmap(&gpu_); + for (auto it : pool_) { + if ((it.buf != nullptr) && (it.buf->data() != nullptr)) { + it.buf->unmap(&gpu_); } - delete it; + delete it.buf; } } // ================================================================================================ bool ManagedBuffer::create(Resource::MemoryType type) { - for (uint i = 0; i < buffers_.size(); ++i) { - buffers_[i] = new Memory(const_cast(gpu_.dev()), size_); - if 
(nullptr == buffers_[i] || !buffers_[i]->create(type)) { + for (uint i = 0; i < pool_.size(); ++i) { + pool_[i].buf = new Memory(const_cast(gpu_.dev()), size_); + if (nullptr == pool_[i].buf || !pool_[i].buf->create(type)) { LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); return false; } - void* wrtAddress = buffers_[i]->map(&gpu_); + // Assign virtual gpu to the allocation. Buffer will be used only on a particular queue + pool_[i].buf->memRef()->gpu_ = &gpu_; + void* wrtAddress = pool_[i].buf->map(&gpu_); if (wrtAddress == nullptr) { LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); return false; @@ -45,9 +47,9 @@ bool ManagedBuffer::create(Resource::MemoryType type) { uint dummy = 0; static constexpr bool Wait = true; // Write 0 for the buffer paging by VidMM - buffers_[i]->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait); + pool_[i].buf->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait); } - wrtAddress_ = buffers_[activeBuffer_]->data(); + wrtAddress_ = pool_[activeBuffer_].buf->data(); return true; } @@ -59,18 +61,22 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { // Align reserve size on the vector's boundary uint32_t count = amd::alignUp(size, MemAlignment); + // Save previous event + pinGpuEvent(); + // Check if buffer has enough space for reservation if ((wrtOffset_ + count) > size_) { // Get the next buffer in the list ++activeBuffer_; activeBuffer_ %= MaxNumberOfBuffers; // Make sure the buffer isn't busy - buffers_[activeBuffer_]->wait(gpu_); - wrtAddress_ = buffers_[activeBuffer_]->data(); + gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]); + gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]); + wrtAddress_ = pool_[activeBuffer_].buf->data(); wrtOffset_ = 0; } - *gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_; + *gpu_address = pool_[activeBuffer_].buf->vmAddress() + wrtOffset_; address cpu_address = wrtAddress_ + wrtOffset_; // Adjust the 
offset by the reserved size @@ -80,23 +86,17 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { } // ================================================================================================ -Memory& ManagedBuffer::reserveAtTheTop(uint32_t size) -{ - // Get the next buffer in the list - ++activeBuffer_; - activeBuffer_ %= MaxNumberOfBuffers; - // Make sure the buffer isn't busy - buffers_[activeBuffer_]->wait(gpu_); - wrtAddress_ = buffers_[activeBuffer_]->data(); - wrtOffset_ = 0; - return *buffers_[activeBuffer_]; +void ManagedBuffer::pinGpuEvent() { + GpuEvent* event = activeMemory()->getGpuEvent(gpu()); + pool_[activeBuffer_].events[event->engineId_] = *event; + activeMemory()->setBusy(gpu(), GpuEvent::InvalidID); } // ================================================================================================ ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size) - : mbuf_(mbuf) - , sys_mem_copy_(nullptr) - , size_(size) + : mbuf_(mbuf) + , sys_mem_copy_(nullptr) + , size_(size) {} // ================================================================================================ @@ -127,11 +127,11 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const { // ================================================================================================ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const { - uint64_t vm_address; - address cpu_address = mbuf_.reserve(size, &vm_address); - // Update memory with new CB data - memcpy(cpu_address, sysmem, size); - return vm_address; + uint64_t vm_address; + address cpu_address = mbuf_.reserve(size, &vm_address); + // Update memory with new CB data + memcpy(cpu_address, sysmem, size); + return vm_address; } // ================================================================================================ diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp index 
da1984f636..c1853b0537 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp @@ -30,9 +30,6 @@ class ManagedBuffer : public amd::EmbeddedObject { address reserve(uint32_t size, //!< real data size for upload uint64_t* gpu_address); - //! Reserves memory at the top of the active buffer - Memory& reserveAtTheTop(uint32_t size); - //! Returns CB size uint32_t size() const { return size_; } @@ -40,14 +37,23 @@ class ManagedBuffer : public amd::EmbeddedObject { uint32_t wrtOffset() const { return wrtOffset_; } //! Returns active GPU buffer - Memory* activeMemory() const { return buffers_[activeBuffer_]; } + Memory* activeMemory() const { return pool_[activeBuffer_].buf; } - uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); } + //! Returns VM address for the active buffer + uint64_t vmAddress() const { return pool_[activeBuffer_].buf->vmAddress(); } + + //! Update the timestamp for the HW operation + void pinGpuEvent(); //! Returns VirtualGPU object this managed resource associated VirtualGPU& gpu() const { return gpu_; } private: + struct TimeStampedBuffer { + Memory* buf; + GpuEvent events[AllEngines]; + }; + //! 
The maximum number of the managed buffers static constexpr uint32_t MaxNumberOfBuffers = 3; @@ -58,7 +64,7 @@ class ManagedBuffer : public amd::EmbeddedObject { ManagedBuffer& operator=(const ManagedBuffer&) = delete; VirtualGPU& gpu_; //!< Virtual GPU object - std::vector buffers_; //!< Buffers for management + std::vector pool_; //!< Buffers for management uint32_t activeBuffer_; //!< Current active buffer uint32_t size_; //!< Constant buffer size uint32_t wrtOffset_; //!< Current write offset diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp index 6825c8b8c5..fbb95bcc80 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp @@ -145,6 +145,7 @@ class Memory : public device::Memory, public Resource { //! Quick view update for managed buffers. It should avoid expensive object allocations void updateView(Resource* view, size_t offset, size_t size) { size_ = size; + flags_ |= HostMemoryDirectAccess; Resource::updateView(view, offset, size); } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp index a838f2673a..3ae73d7ec6 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -378,6 +378,7 @@ class Resource : public amd::HeapObject { memRef_->retain(); desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + setBusy(*memRef()->gpu_, GpuEvent::InvalidID); } }