From cfc07c88eeef5be596e4318831c25f0172725dd0 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 8 Dec 2023 19:02:50 -0500 Subject: [PATCH] SWDEV-436796 - Enable device memory for kernel arguments Extra CPU read back will be performed before every submission to make sure previous writes over PCIE reached GPU. HDP flush is done by CP. Change-Id: I402d28ca26c8cee4a3920feb3599af8c285d0889 --- rocclr/device/pal/palconstbuf.hpp | 4 ++++ rocclr/device/pal/palsettings.cpp | 2 +- rocclr/device/pal/palvirtual.cpp | 18 ++++++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/rocclr/device/pal/palconstbuf.hpp b/rocclr/device/pal/palconstbuf.hpp index 7acb8674c0..c9b72a4b40 100644 --- a/rocclr/device/pal/palconstbuf.hpp +++ b/rocclr/device/pal/palconstbuf.hpp @@ -65,6 +65,10 @@ class ManagedBuffer : public amd::EmbeddedObject { //! Returns VirtualGPU object this managed resource associated VirtualGPU& gpu() const { return gpu_; } + void CpuReadBack() const { + volatile auto tmp = *reinterpret_cast(pool_[activeBuffer_].buf->data()); + } + private: struct TimeStampedBuffer { Memory* buf; diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp index a4902e95f1..a5d08c50be 100644 --- a/rocclr/device/pal/palsettings.cpp +++ b/rocclr/device/pal/palsettings.cpp @@ -143,7 +143,7 @@ Settings::Settings() { alwaysResident_ = amd::IS_HIP ? true : false; prepinnedMinSize_ = 0; cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki; - useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG; + useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? 
true : HIP_FORCE_DEV_KERNARG; limit_blit_wg_ = 16; } diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index b38c210de8..56b734bced 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -370,8 +370,9 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b // ================================================================================================ bool VirtualGPU::Queue::flush() { amd::ScopedLock l(lock_); + const Settings& settings = gpu_.dev().settings(); - if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) { + if (!settings.alwaysResident_ && palMemRefs_.size() != 0) { if (Pal::Result::Success != iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_, Pal::GpuMemoryRefCantTrim)) { @@ -410,10 +411,15 @@ bool VirtualGPU::Queue::flush() { submitInfo.fenceCount = 1; submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_]; - if (amd::IS_HIP) { - // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers. - // Force CPU sync if there are pending operations on SDMA, until OS fences will be added - if (iQueue_->Type() == Pal::QueueTypeCompute) { + if (iQueue_->Type() == Pal::QueueTypeCompute) { + if (settings.useDeviceKernelArg_) { + // If runtime uses device memory for kernel arguments, then perform a CPU read back on + // submission. That will make sure NBIO pushes all previous CPU write requests through PCIE + gpu_.managedBuffer().CpuReadBack(); + } + if (amd::IS_HIP) { + // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers. 
+ // Force CPU sync if there are pending operations on SDMA, until OS fences will be added gpu_.WaitForIdleSdma(); } } @@ -487,7 +493,7 @@ bool VirtualGPU::Queue::flush() { } } } - if (!gpu_.dev().settings().alwaysResident_ && palMems_.size() != 0) { + if (!settings.alwaysResident_ && palMems_.size() != 0) { iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_); palMems_.clear(); }