diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index c77a38b55d..1b16da57eb 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -2379,9 +2379,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return false; } - // Add ISA memory object to the resource tracking list - AddKernel(kernel); - uint64_t vmDefQueue = 0; VirtualGPU* gpuDefQueue = nullptr; if (hsaKernel.dynamicParallelism()) { @@ -2400,6 +2397,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return false; } + // Add ISA memory object to the resource tracking list + AddKernel(kernel); + bool needFlush = false; // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd // are in the same cmdBuffer @@ -3507,7 +3507,11 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p const Device::ScratchBuffer* scratch = dev().scratch(hwRing()); // Validate scratch buffer to force sync mode, because // the current scratch logic is optimized for size and performance - memoryDependency().validate(*this, scratch->memObj_, IsReadOnly); + // Note: runtime can skip sync if the same kernel is used, + // since the number of scratch regs remains the same + if (!IsSameKernel(kernel)) { + memoryDependency().validate(*this, scratch->memObj_, IsReadOnly); + } addVmMemory(scratch->memObj_); } diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 006251c6b4..5d22ef0482 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -377,6 +377,10 @@ class VirtualGPU : public device::VirtualDevice { inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object ) const; + //! 
Checks if kernel is the same object (by address) as the last one stored by AddKernel()
+  inline bool IsSameKernel(const amd::Kernel& kernel  //!< AMD kernel object
+                           ) const;
+
   //! Adds a dopp desktop texture reference
   void addDoppRef(const Memory* memory,  //!< GPU memory object
                   bool lastDoopCmd,      //!< is the last submission for the pre-present primary
@@ -652,6 +656,10 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
   queues_[MainEngine]->last_kernel_ = &kernel;
 }
 
+inline bool VirtualGPU::IsSameKernel(const amd::Kernel& kernel) const {
+  return queues_[MainEngine]->last_kernel_ == &kernel;  // pointer identity, already a bool
+}
+
 template uint VirtualGPU::Queue::submit(bool forceFlush) {
   cmdCnt_++;
   uint id = cmdBufIdCurrent_;