SWDEV-301667 - Support device kernel args for PCIE

Change-Id: I5e51602bea5a68734227fd62e11ab68eb1ad81c1


[ROCm/clr commit: 5c591b5877]
Bu işleme şunda yer alıyor:
Saleel Kudchadker
2023-11-03 22:27:34 +00:00
ebeveyn d3bcc29cd6
işleme 153bb15f46
3 değiştirilmiş dosya ile 28 ekleme ve 5 silme
+7 -2
Dosyayı Görüntüle
@@ -94,6 +94,7 @@ Settings::Settings() {
fgs_kernel_arg_ = false;
barrier_value_packet_ = false;
host_hdp_flush_ = true;
gwsInitSupported_ = true;
}
@@ -155,9 +156,13 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
enableExtension(ClAmdFp64);
}
if (gfxipMajor == 9 && gfxipMinor == 1 && gfxStepping == 0) {
// Barrier Value packet is only supported on MI200 for now
if ((gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10) ||
((gfxipMajor == 9 && gfxipMinor == 4 &&
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
// Barrier Value packet is enabled only for MI2XX/MI300
barrier_value_packet_ = true;
// On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
host_hdp_flush_ = false;
}
if (gfxipMajor >= 10) {
+2 -1
Dosyayı Görüntüle
@@ -52,7 +52,8 @@ class Settings : public device::Settings {
uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint reserved_ : 21;
uint host_hdp_flush_ : 1; //!< Host HDP flush needed
uint reserved_ : 20;
};
uint value_;
};
+19 -2
Dosyayı Görüntüle
@@ -3195,6 +3195,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}
}
constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
address argBuffer = hidden_arguments;
// Find all parameters for the current kernel
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
@@ -3202,13 +3205,22 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if(vcmd != nullptr && vcmd->getCapturingState()) {
argBuffer = vcmd->getKernArgOffset();
} else {
argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment()));
const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
sizeof(kSentinel) * pcieKernargs;
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
gpuKernel.KernargSegmentAlignment()));
}
// Load all kernel arguments
nontemporalMemcpy(argBuffer, parameters,
std::min(gpuKernel.KernargSegmentByteSize(),
signature.paramsSize()));
if (pcieKernargs) {
nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
&kSentinel, sizeof(kSentinel));
if (dev().settings().host_hdp_flush_) {
*dev().info().hdpMemFlushCntl = 1u;
}
}
}
// Check for group memory overflow
@@ -3269,6 +3281,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
if (pcieKernargs) {
__builtin_ia32_mfence();
while (*reinterpret_cast<volatile decltype(kSentinel)*>(
argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
}
if (vcmd == nullptr) {
// Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,