SWDEV-443760 - Enable device kern args

- Implement a workaround to ensure HDP writes have completed by writing and
reading back the HDP MMIO register.
- Implement the same workaround for graphs; the sentinel write/readback is
no longer needed.

Change-Id: I0d3027b46a1f61131ec62e3c8c669ff5184fa6b2


[ROCm/clr commit: f138e0d113]
This commit is contained in:
Saleel Kudchadker
2024-02-10 00:51:14 +00:00
parent 6383be514b
commit ec59b1bc3e
5 changed files with 21 additions and 40 deletions
+5 -21
View File
@@ -359,8 +359,6 @@ hipError_t GraphExec::CaptureAQLPackets() {
auto device = g_devices[ihipGetDevice()]->devices()[0];
if (kernArgSizeForGraph != 0) {
if (device->info().largeBar_) {
// Pad kernel argument buffer with sentinel size bytes to do a readback later
kernArgSizeForGraph += sizeof(int);
kernarg_pool_graph_ =
reinterpret_cast<address>(device->deviceLocalAlloc(kernArgSizeForGraph));
device_kernarg_pool_ = true;
@@ -391,27 +389,13 @@ hipError_t GraphExec::CaptureAQLPackets() {
}
}
if (device_kernarg_pool_) {
// Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem
// by readback of sentinel value at the tail end of the kernarg surface (allocated above)
// This needs to be done for PCIE connected devices only. HDP path is disabled for XGMI
// between CPU<->GPU
if (!device->isXgmi()) {
int host_val = 1;
address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int);
*(reinterpret_cast<int*>(dev_ptr)) = host_val;
if (device->info().hdpMemFlushCntl == nullptr) {
amd::Command* command = new amd::Marker(*capture_stream_, true);
if (command != nullptr) {
command->enqueue();
command->release();
}
} else {
*device->info().hdpMemFlushCntl = 1;
}
if (*(reinterpret_cast<volatile int*>(dev_ptr)) != host_val);
if (device_kernarg_pool_ && !device->isXgmi()) {
*device->info().hdpMemFlushCntl = 1u;
if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
LogError("Unexpected HDP Register readback value!");
}
}
ResetQueueIndex();
}
return status;
@@ -95,7 +95,7 @@ Settings::Settings() {
fgs_kernel_arg_ = false;
barrier_value_packet_ = false;
host_hdp_flush_ = true;
device_kernel_args_ = false;
gwsInitSupported_ = true;
limit_blit_wg_ = 16;
}
@@ -163,8 +163,10 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
// Enable Barrier Value packet is only for MI2XX/300
barrier_value_packet_ = true;
// On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
host_hdp_flush_ = false;
}
if (gfxipMajor >= 9) {
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
}
if (gfxipMajor >= 10) {
@@ -52,7 +52,7 @@ class Settings : public device::Settings {
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint host_hdp_flush_ : 1; //!< Host HDP flush needed
uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory
uint reserved_ : 20;
};
uint value_;
+9 -14
View File
@@ -1376,8 +1376,9 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
active_chunk_ = 0;
if (HIP_FORCE_DEV_KERNARG && roc_device_.info().largeBar_) {
kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
kernarg_pool_base_ =
reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
} else {
kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
Device::MemorySegment::kKernArg));
@@ -3208,8 +3209,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}
}
constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
const auto pcieKernargs = !dev().isXgmi() && dev().settings().device_kernel_args_;
address argBuffer = hidden_arguments;
// Find all parameters for the current kernel
@@ -3218,8 +3218,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if(vcmd != nullptr && vcmd->getCapturingState()) {
argBuffer = vcmd->getKernArgOffset();
} else {
const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
sizeof(kSentinel) * pcieKernargs;
const auto kernargSize = gpuKernel.KernargSegmentByteSize();
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
gpuKernel.KernargSegmentAlignment()));
}
@@ -3228,11 +3227,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
std::min(gpuKernel.KernargSegmentByteSize(),
signature.paramsSize()));
if (pcieKernargs) {
nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
&kSentinel, sizeof(kSentinel));
if (dev().settings().host_hdp_flush_) {
*dev().info().hdpMemFlushCntl = 1u;
}
*dev().info().hdpMemFlushCntl = 1u;
}
}
@@ -3295,9 +3290,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
if (pcieKernargs) {
__builtin_ia32_mfence();
while (*reinterpret_cast<volatile decltype(kSentinel)*>(
argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
LogError("Unexpected HDP Register readback value!");
}
}
if (vcmd == nullptr) {
// Dispatch the packet
+1 -1
View File
@@ -233,7 +233,7 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \
"Set this to true to force runtime unbundler in hiprtc.") \
release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
"Set initial heap size for device malloc.") \
release(bool, HIP_FORCE_DEV_KERNARG, 0, \
release(bool, HIP_FORCE_DEV_KERNARG, 1, \
"Force device mem for kernel args.") \
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
"Enable/Disable graph packet capturing") \