From ec59b1bc3ec5c8bcceeb6bbbf753c10cba3af31d Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Sat, 10 Feb 2024 00:51:14 +0000 Subject: [PATCH] SWDEV-443760 - Enable device kern args - Implement workaround to ensure HDP writes are done by writing and reading the HDP MMIO register. - Implement the same workaround for graphs, we no longer need sentinel write/readback Change-Id: I0d3027b46a1f61131ec62e3c8c669ff5184fa6b2 [ROCm/clr commit: f138e0d11380f0fa2fc5ae9a990b09743b2dc047] --- .../clr/hipamd/src/hip_graph_internal.cpp | 26 ++++--------------- .../clr/rocclr/device/rocm/rocsettings.cpp | 8 +++--- .../clr/rocclr/device/rocm/rocsettings.hpp | 2 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 23 +++++++--------- projects/clr/rocclr/utils/flags.hpp | 2 +- 5 files changed, 21 insertions(+), 40 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 7e33caa2b1..e297766928 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -359,8 +359,6 @@ hipError_t GraphExec::CaptureAQLPackets() { auto device = g_devices[ihipGetDevice()]->devices()[0]; if (kernArgSizeForGraph != 0) { if (device->info().largeBar_) { - // Pad kernel argument buffer with sentinal size bytes to do a readback later - kernArgSizeForGraph += sizeof(int); kernarg_pool_graph_ = reinterpret_cast
(device->deviceLocalAlloc(kernArgSizeForGraph)); device_kernarg_pool_ = true; @@ -391,27 +389,13 @@ hipError_t GraphExec::CaptureAQLPackets() { } } - if (device_kernarg_pool_) { - // Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem - // by readback of sentinal value at the tail end of the kernarg surface (allocated above) - // This needs to be done for PCIE connected devices only. HDP path is disabled for XGMI - // between CPU<->GPU - if (!device->isXgmi()) { - int host_val = 1; - address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int); - *(reinterpret_cast(dev_ptr)) = host_val; - if (device->info().hdpMemFlushCntl == nullptr) { - amd::Command* command = new amd::Marker(*capture_stream_, true); - if (command != nullptr) { - command->enqueue(); - command->release(); - } - } else { - *device->info().hdpMemFlushCntl = 1; - } - if (*(reinterpret_cast(dev_ptr)) != host_val); + if (device_kernarg_pool_ && !device->isXgmi()) { + *device->info().hdpMemFlushCntl = 1u; + if (*device->info().hdpMemFlushCntl != UINT32_MAX) { + LogError("Unexpected HDP Register readback value!"); } } + ResetQueueIndex(); } return status; diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index dab0e5bc47..53729ece0a 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -95,7 +95,7 @@ Settings::Settings() { fgs_kernel_arg_ = false; barrier_value_packet_ = false; - host_hdp_flush_ = true; + device_kernel_args_ = false; gwsInitSupported_ = true; limit_blit_wg_ = 16; } @@ -163,8 +163,10 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) { // Enable Barrier Value packet is only for MI2XX/300 barrier_value_packet_ = true; - // On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed - host_hdp_flush_ = false; + } + + if (gfxipMajor >= 9) { + device_kernel_args_ = HIP_FORCE_DEV_KERNARG; } if (gfxipMajor >= 10) { diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index 5ae3aaba1f..bd68da8fa8 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -52,7 +52,7 @@ class Settings : public device::Settings { uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint host_hdp_flush_ : 1; //!< Host HDP flush needed + uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory uint reserved_ : 20; }; uint value_; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 5ccd2701ff..34a9d2c580 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1376,8 +1376,9 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal; active_chunk_ = 0; - if (HIP_FORCE_DEV_KERNARG && roc_device_.info().largeBar_) { - kernarg_pool_base_ = reinterpret_cast
(roc_device_.deviceLocalAlloc(kernarg_pool_size_)); + if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) { + kernarg_pool_base_ = + reinterpret_cast
(roc_device_.deviceLocalAlloc(kernarg_pool_size_)); } else { kernarg_pool_base_ = reinterpret_cast
(roc_device_.hostAlloc(kernarg_pool_size_, 0, Device::MemorySegment::kKernArg)); @@ -3208,8 +3209,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } } - constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull; - const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG; + const auto pcieKernargs = !dev().isXgmi() && dev().settings().device_kernel_args_; address argBuffer = hidden_arguments; // Find all parameters for the current kernel @@ -3218,8 +3218,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if(vcmd != nullptr && vcmd->getCapturingState()) { argBuffer = vcmd->getKernArgOffset(); } else { - const auto kernargSize = gpuKernel.KernargSegmentByteSize() + - sizeof(kSentinel) * pcieKernargs; + const auto kernargSize = gpuKernel.KernargSegmentByteSize(); argBuffer = reinterpret_cast
(allocKernArg(kernargSize, gpuKernel.KernargSegmentAlignment())); } @@ -3228,11 +3227,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize())); if (pcieKernargs) { - nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(), - &kSentinel, sizeof(kSentinel)); - if (dev().settings().host_hdp_flush_) { - *dev().info().hdpMemFlushCntl = 1u; - } + *dev().info().hdpMemFlushCntl = 1u; } } @@ -3295,9 +3290,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } if (pcieKernargs) { - __builtin_ia32_mfence(); - while (*reinterpret_cast( - argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel); + if (*dev().info().hdpMemFlushCntl != UINT32_MAX) { + LogError("Unexpected HDP Register readback value!"); + } } if (vcmd == nullptr) { // Dispatch the packet diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 3bdc66a735..912f9d7a19 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -233,7 +233,7 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \ "Set this to true to force runtime unbundler in hiprtc.") \ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \ "Set initial heap size for device malloc.") \ -release(bool, HIP_FORCE_DEV_KERNARG, 0, \ +release(bool, HIP_FORCE_DEV_KERNARG, 1, \ "Force device mem for kernel args.") \ release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \ "Enable/Disable graph packet capturing") \