diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 7e33caa2b1..e297766928 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -359,8 +359,6 @@ hipError_t GraphExec::CaptureAQLPackets() {
auto device = g_devices[ihipGetDevice()]->devices()[0];
if (kernArgSizeForGraph != 0) {
if (device->info().largeBar_) {
- // Pad kernel argument buffer with sentinal size bytes to do a readback later
- kernArgSizeForGraph += sizeof(int);
kernarg_pool_graph_ =
reinterpret_cast
(device->deviceLocalAlloc(kernArgSizeForGraph));
device_kernarg_pool_ = true;
@@ -391,27 +389,13 @@ hipError_t GraphExec::CaptureAQLPackets() {
}
}
- if (device_kernarg_pool_) {
- // Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem
- // by readback of sentinal value at the tail end of the kernarg surface (allocated above)
- // This needs to be done for PCIE connected devices only. HDP path is disabled for XGMI
- // between CPU<->GPU
- if (!device->isXgmi()) {
- int host_val = 1;
- address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int);
- *(reinterpret_cast(dev_ptr)) = host_val;
- if (device->info().hdpMemFlushCntl == nullptr) {
- amd::Command* command = new amd::Marker(*capture_stream_, true);
- if (command != nullptr) {
- command->enqueue();
- command->release();
- }
- } else {
- *device->info().hdpMemFlushCntl = 1;
- }
- if (*(reinterpret_cast(dev_ptr)) != host_val);
+ if (device_kernarg_pool_ && !device->isXgmi()) {
+ *device->info().hdpMemFlushCntl = 1u;
+ if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
+ LogError("Unexpected HDP Register readback value!");
}
}
+
ResetQueueIndex();
}
return status;
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index dab0e5bc47..53729ece0a 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -95,7 +95,7 @@ Settings::Settings() {
fgs_kernel_arg_ = false;
barrier_value_packet_ = false;
- host_hdp_flush_ = true;
+ device_kernel_args_ = false;
gwsInitSupported_ = true;
limit_blit_wg_ = 16;
}
@@ -163,8 +163,10 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
// Enable Barrier Value packet is only for MI2XX/300
barrier_value_packet_ = true;
- // On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
- host_hdp_flush_ = false;
+ }
+
+ if (gfxipMajor >= 9) {
+ device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
}
if (gfxipMajor >= 10) {
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index 5ae3aaba1f..bd68da8fa8 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -52,7 +52,7 @@ class Settings : public device::Settings {
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
- uint host_hdp_flush_ : 1; //!< Host HDP flush needed
+ uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory
uint reserved_ : 20;
};
uint value_;
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 5ccd2701ff..34a9d2c580 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1376,8 +1376,9 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
active_chunk_ = 0;
- if (HIP_FORCE_DEV_KERNARG && roc_device_.info().largeBar_) {
- kernarg_pool_base_ = reinterpret_cast(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
+ if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
+ kernarg_pool_base_ =
+ reinterpret_cast(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
} else {
kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, 0,
Device::MemorySegment::kKernArg));
@@ -3208,8 +3209,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}
}
- constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
- const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
+ const auto pcieKernargs = !dev().isXgmi() && dev().settings().device_kernel_args_;
address argBuffer = hidden_arguments;
// Find all parameters for the current kernel
@@ -3218,8 +3218,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if(vcmd != nullptr && vcmd->getCapturingState()) {
argBuffer = vcmd->getKernArgOffset();
} else {
- const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
- sizeof(kSentinel) * pcieKernargs;
+ const auto kernargSize = gpuKernel.KernargSegmentByteSize();
argBuffer = reinterpret_cast(allocKernArg(kernargSize,
gpuKernel.KernargSegmentAlignment()));
}
@@ -3228,11 +3227,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
std::min(gpuKernel.KernargSegmentByteSize(),
signature.paramsSize()));
if (pcieKernargs) {
- nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
- &kSentinel, sizeof(kSentinel));
- if (dev().settings().host_hdp_flush_) {
- *dev().info().hdpMemFlushCntl = 1u;
- }
+ *dev().info().hdpMemFlushCntl = 1u;
}
}
@@ -3295,9 +3290,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
if (pcieKernargs) {
- __builtin_ia32_mfence();
- while (*reinterpret_cast(
- argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
+ if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
+ LogError("Unexpected HDP Register readback value!");
+ }
}
if (vcmd == nullptr) {
// Dispatch the packet
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 3bdc66a735..912f9d7a19 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -233,7 +233,7 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \
"Set this to true to force runtime unbundler in hiprtc.") \
release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
"Set initial heap size for device malloc.") \
-release(bool, HIP_FORCE_DEV_KERNARG, 0, \
+release(bool, HIP_FORCE_DEV_KERNARG, 1, \
"Force device mem for kernel args.") \
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
"Enable/Disable graph packet capturing") \