SWDEV-443760 - Enable device kern args
- Implement workaround to ensure HDP writes are done by writing and
reading the HDP MMIO register.
- Implement the same workaround for graphs, we no longer need sentinel
write/readback
Change-Id: I0d3027b46a1f61131ec62e3c8c669ff5184fa6b2
[ROCm/clr commit: f138e0d113]
This commit is contained in:
@@ -359,8 +359,6 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
auto device = g_devices[ihipGetDevice()]->devices()[0];
|
||||
if (kernArgSizeForGraph != 0) {
|
||||
if (device->info().largeBar_) {
|
||||
// Pad kernel argument buffer with sentinal size bytes to do a readback later
|
||||
kernArgSizeForGraph += sizeof(int);
|
||||
kernarg_pool_graph_ =
|
||||
reinterpret_cast<address>(device->deviceLocalAlloc(kernArgSizeForGraph));
|
||||
device_kernarg_pool_ = true;
|
||||
@@ -391,27 +389,13 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
}
|
||||
}
|
||||
|
||||
if (device_kernarg_pool_) {
|
||||
// Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem
|
||||
// by readback of sentinal value at the tail end of the kernarg surface (allocated above)
|
||||
// This needs to be done for PCIE connected devices only. HDP path is disabled for XGMI
|
||||
// between CPU<->GPU
|
||||
if (!device->isXgmi()) {
|
||||
int host_val = 1;
|
||||
address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int);
|
||||
*(reinterpret_cast<int*>(dev_ptr)) = host_val;
|
||||
if (device->info().hdpMemFlushCntl == nullptr) {
|
||||
amd::Command* command = new amd::Marker(*capture_stream_, true);
|
||||
if (command != nullptr) {
|
||||
command->enqueue();
|
||||
command->release();
|
||||
}
|
||||
} else {
|
||||
*device->info().hdpMemFlushCntl = 1;
|
||||
}
|
||||
if (*(reinterpret_cast<volatile int*>(dev_ptr)) != host_val);
|
||||
if (device_kernarg_pool_ && !device->isXgmi()) {
|
||||
*device->info().hdpMemFlushCntl = 1u;
|
||||
if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
|
||||
LogError("Unexpected HDP Register readback value!");
|
||||
}
|
||||
}
|
||||
|
||||
ResetQueueIndex();
|
||||
}
|
||||
return status;
|
||||
|
||||
@@ -95,7 +95,7 @@ Settings::Settings() {
|
||||
fgs_kernel_arg_ = false;
|
||||
barrier_value_packet_ = false;
|
||||
|
||||
host_hdp_flush_ = true;
|
||||
device_kernel_args_ = false;
|
||||
gwsInitSupported_ = true;
|
||||
limit_blit_wg_ = 16;
|
||||
}
|
||||
@@ -163,8 +163,10 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
|
||||
// Enable Barrier Value packet is only for MI2XX/300
|
||||
barrier_value_packet_ = true;
|
||||
// On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
|
||||
host_hdp_flush_ = false;
|
||||
}
|
||||
|
||||
if (gfxipMajor >= 9) {
|
||||
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
|
||||
}
|
||||
|
||||
if (gfxipMajor >= 10) {
|
||||
|
||||
@@ -52,7 +52,7 @@ class Settings : public device::Settings {
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint host_hdp_flush_ : 1; //!< Host HDP flush needed
|
||||
uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory
|
||||
uint reserved_ : 20;
|
||||
};
|
||||
uint value_;
|
||||
|
||||
@@ -1376,8 +1376,9 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
active_chunk_ = 0;
|
||||
if (HIP_FORCE_DEV_KERNARG && roc_device_.info().largeBar_) {
|
||||
kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
|
||||
if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
|
||||
kernarg_pool_base_ =
|
||||
reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
|
||||
} else {
|
||||
kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
|
||||
Device::MemorySegment::kKernArg));
|
||||
@@ -3208,8 +3209,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
}
|
||||
}
|
||||
|
||||
constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
|
||||
const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
|
||||
const auto pcieKernargs = !dev().isXgmi() && dev().settings().device_kernel_args_;
|
||||
|
||||
address argBuffer = hidden_arguments;
|
||||
// Find all parameters for the current kernel
|
||||
@@ -3218,8 +3218,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
if(vcmd != nullptr && vcmd->getCapturingState()) {
|
||||
argBuffer = vcmd->getKernArgOffset();
|
||||
} else {
|
||||
const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
|
||||
sizeof(kSentinel) * pcieKernargs;
|
||||
const auto kernargSize = gpuKernel.KernargSegmentByteSize();
|
||||
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
|
||||
gpuKernel.KernargSegmentAlignment()));
|
||||
}
|
||||
@@ -3228,11 +3227,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
std::min(gpuKernel.KernargSegmentByteSize(),
|
||||
signature.paramsSize()));
|
||||
if (pcieKernargs) {
|
||||
nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
|
||||
&kSentinel, sizeof(kSentinel));
|
||||
if (dev().settings().host_hdp_flush_) {
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
}
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3295,9 +3290,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
}
|
||||
if (pcieKernargs) {
|
||||
__builtin_ia32_mfence();
|
||||
while (*reinterpret_cast<volatile decltype(kSentinel)*>(
|
||||
argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
|
||||
if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
|
||||
LogError("Unexpected HDP Register readback value!");
|
||||
}
|
||||
}
|
||||
if (vcmd == nullptr) {
|
||||
// Dispatch the packet
|
||||
|
||||
@@ -233,7 +233,7 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \
|
||||
"Set this to true to force runtime unbundler in hiprtc.") \
|
||||
release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
|
||||
"Set initial heap size for device malloc.") \
|
||||
release(bool, HIP_FORCE_DEV_KERNARG, 0, \
|
||||
release(bool, HIP_FORCE_DEV_KERNARG, 1, \
|
||||
"Force device mem for kernel args.") \
|
||||
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
|
||||
"Enable/Disable graph packet capturing") \
|
||||
|
||||
Reference in New Issue
Block a user