diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index b5ce63e867..cbd929cd34 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -94,6 +94,7 @@ Settings::Settings() { fgs_kernel_arg_ = false; barrier_value_packet_ = false; + host_hdp_flush_ = true; gwsInitSupported_ = true; } @@ -155,9 +156,13 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor enableExtension(ClAmdFp64); } - if (gfxipMajor == 9 && gfxipMinor == 1 && gfxStepping == 0) { - // Barrier Value packet is only supported on MI200 for now + if ((gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10) || + ((gfxipMajor == 9 && gfxipMinor == 4 && + (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) { + // Enable Barrier Value packet is only for MI2XX/300 barrier_value_packet_ = true; + // On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed + host_hdp_flush_ = false; } if (gfxipMajor >= 10) { diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index b17154d188..ced98dc03d 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -52,7 +52,8 @@ class Settings : public device::Settings { uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 21; + uint host_hdp_flush_ : 1; //!< Host HDP flush needed + uint reserved_ : 20; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index eedae46f82..652e195838 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -3195,6 +3195,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } } + constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull; + const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG; + address argBuffer = hidden_arguments; // Find all parameters for the current kernel if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { @@ -3202,13 +3205,22 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if(vcmd != nullptr && vcmd->getCapturingState()) { argBuffer = vcmd->getKernArgOffset(); } else { - argBuffer = reinterpret_cast
(allocKernArg(gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment())); + const auto kernargSize = gpuKernel.KernargSegmentByteSize() + + sizeof(kSentinel) * pcieKernargs; + argBuffer = reinterpret_cast
(allocKernArg(kernargSize, + gpuKernel.KernargSegmentAlignment())); } // Load all kernel arguments nontemporalMemcpy(argBuffer, parameters, std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize())); + if (pcieKernargs) { + nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(), + &kSentinel, sizeof(kSentinel)); + if (dev().settings().host_hdp_flush_) { + *dev().info().hdpMemFlushCntl = 1u; + } + } } // Check for group memory overflow @@ -3269,6 +3281,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } + if (pcieKernargs) { + __builtin_ia32_mfence(); + while (*reinterpret_cast( + argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel); + } if (vcmd == nullptr) { // Dispatch the packet if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,