SWDEV-301667 - Support device kernel args for PCIE
Change-Id: I5e51602bea5a68734227fd62e11ab68eb1ad81c1
[ROCm/clr commit: 5c591b5877]
This commit is contained in:
@@ -94,6 +94,7 @@ Settings::Settings() {
|
||||
fgs_kernel_arg_ = false;
|
||||
barrier_value_packet_ = false;
|
||||
|
||||
host_hdp_flush_ = true;
|
||||
gwsInitSupported_ = true;
|
||||
}
|
||||
|
||||
@@ -155,9 +156,13 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
|
||||
enableExtension(ClAmdFp64);
|
||||
}
|
||||
|
||||
if (gfxipMajor == 9 && gfxipMinor == 1 && gfxStepping == 0) {
|
||||
// Barrier Value packet is only supported on MI200 for now
|
||||
if ((gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10) ||
|
||||
((gfxipMajor == 9 && gfxipMinor == 4 &&
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
|
||||
// Barrier Value packet is only enabled on MI200 and MI300
|
||||
barrier_value_packet_ = true;
|
||||
// On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
|
||||
host_hdp_flush_ = false;
|
||||
}
|
||||
|
||||
if (gfxipMajor >= 10) {
|
||||
|
||||
@@ -52,7 +52,8 @@ class Settings : public device::Settings {
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint reserved_ : 21;
|
||||
uint host_hdp_flush_ : 1; //!< Host HDP flush needed
|
||||
uint reserved_ : 20;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -3195,6 +3195,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
}
|
||||
}
|
||||
|
||||
constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
|
||||
const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
|
||||
|
||||
address argBuffer = hidden_arguments;
|
||||
// Find all parameters for the current kernel
|
||||
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
|
||||
@@ -3202,13 +3205,22 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
if(vcmd != nullptr && vcmd->getCapturingState()) {
|
||||
argBuffer = vcmd->getKernArgOffset();
|
||||
} else {
|
||||
argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(),
|
||||
gpuKernel.KernargSegmentAlignment()));
|
||||
const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
|
||||
sizeof(kSentinel) * pcieKernargs;
|
||||
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
|
||||
gpuKernel.KernargSegmentAlignment()));
|
||||
}
|
||||
// Load all kernel arguments
|
||||
nontemporalMemcpy(argBuffer, parameters,
|
||||
std::min(gpuKernel.KernargSegmentByteSize(),
|
||||
signature.paramsSize()));
|
||||
if (pcieKernargs) {
|
||||
nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
|
||||
&kSentinel, sizeof(kSentinel));
|
||||
if (dev().settings().host_hdp_flush_) {
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for group memory overflow
|
||||
@@ -3269,6 +3281,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
}
|
||||
if (pcieKernargs) {
|
||||
__builtin_ia32_mfence();
|
||||
while (*reinterpret_cast<volatile decltype(kSentinel)*>(
|
||||
argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
|
||||
}
|
||||
if (vcmd == nullptr) {
|
||||
// Dispatch the packet
|
||||
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
|
||||
|
||||
Reference in a new issue
Block a user