SWDEV-451594 - Implement Readback and Avoid HDP Flush workaround for device kernel args

Change-Id: I6d41a089a17f55306e7ff402588a1e831b20a7a7
Этот коммит содержится в:
Ioannis Assiouras
2024-03-15 19:59:29 +00:00
родитель e829ef68e4
Коммит bf74ef4025
12 изменённых файлов: 147 добавлений и 66 удалений
+20 -11
Просмотреть файл
@@ -395,19 +395,28 @@ hipError_t GraphExec::CaptureAQLPackets() {
}
}
if (device_kernarg_pool_ && !device->isXgmi()) {
if (device->info().hdpMemFlushCntl != nullptr) {
auto kernArgImpl = device->settings().kernel_arg_impl_;
const auto applyMemOrderingWA =
((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
(kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
kernarg_pool_size_graph_ > 0;
if (device_kernarg_pool_ && applyMemOrderingWA) {
address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_;
volatile char kSentinel = *(dev_ptr - 1);
// Memory ordering workaround for pcie: execute sfence followed by
// write the last byte of kernarg.
_mm_sfence();
*(dev_ptr - 1) = kSentinel;
// HDP flush is required to guarantee ordering in Navi and MI100
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
*device->info().hdpMemFlushCntl = 1u;
if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
LogError("Unexpected HDP Register readback value!");
}
} else {
amd::Command* command = new amd::Marker(*capture_stream_, true);
if (command != nullptr) {
command->enqueue();
command->release();
}
}
// Memory ordering workaround for pcie: execute mfence followed by
// read of the last byte of kernarg.
_mm_mfence();
kSentinel = *(dev_ptr - 1);
}
ResetQueueIndex();
+2
Просмотреть файл
@@ -647,6 +647,8 @@ struct GraphExec {
// Capture GPU Packets from graph commands
hipError_t CaptureAQLPackets();
hipError_t UpdateAQLPacket(hip::GraphKernelNode* node);
using KernelArgImpl = device::Settings::KernelArgImpl;
};
struct ChildGraphNode : public GraphNode {
+15 -1
Просмотреть файл
@@ -651,6 +651,19 @@ struct Info : public amd::EmbeddedObject {
//! Device settings
class Settings : public amd::HeapObject {
public:
//! Kernel argument placement, together with the memory ordering workaround
//! (if any) that accompanies device memory placement.
enum KernelArgImpl {
  //! Kernel arguments are put into host memory
  HostKernelArgs = 0,

  //! Device memory kernel arguments with no memory ordering workaround
  //! (e.g. XGMI)
  DeviceKernelArgs = 1,

  //! Device memory kernel arguments with the kernel argument readback
  //! workaround (works only in ASICs >= MI200)
  DeviceKernelArgsReadback = 2,

  //! Device memory kernel arguments with the kernel argument readback plus
  //! HDP flush workaround. Works in all ASICs. Requires a valid HDP flush
  //! register
  DeviceKernelArgsHDP = 3
};
uint64_t extensions_; //!< Supported OCL extensions
union {
struct {
@@ -675,7 +688,8 @@ class Settings : public amd::HeapObject {
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine.
uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels
uint reserved_ : 9;
uint kernel_arg_impl_ : 2; //!< Kernel argument implementation
uint reserved_ : 7;
};
uint value_;
};
+3 -1
Просмотреть файл
@@ -143,7 +143,9 @@ Settings::Settings() {
alwaysResident_ = amd::IS_HIP ? true : false;
prepinnedMinSize_ = 0;
cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG;
kernel_arg_impl_ = flagIsDefault(HIP_FORCE_DEV_KERNARG)
? KernelArgImpl::DeviceKernelArgs
: HIP_FORCE_DEV_KERNARG;
limit_blit_wg_ = 16;
DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL
+3 -2
Просмотреть файл
@@ -79,8 +79,7 @@ class Settings : public device::Settings {
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint disableSdma_ : 1; //!< Disable SDMA support
uint alwaysResident_ : 1; //!< Make resources resident at allocation time
uint useDeviceKernelArg_ : 1; //!< Use persistent memory for kernel arguments
uint reserved_ : 9;
uint reserved_ : 10;
};
uint value_;
};
@@ -139,6 +138,8 @@ class Settings : public device::Settings {
//! Overrides current settings based on registry/environment
void override();
using KernelArgImpl = device::Settings::KernelArgImpl;
};
/*@}*/ // namespace pal
+6 -4
Просмотреть файл
@@ -408,7 +408,7 @@ bool VirtualGPU::Queue::flush() {
submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];
if (iQueue_->Type() == Pal::QueueTypeCompute) {
if (settings.useDeviceKernelArg_) {
if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
// If runtime uses device memory for kernel arguments, then perform a CPU read back on
// submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
gpu_.managedBuffer().CpuReadBack();
@@ -955,10 +955,12 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
}
// Create buffers for kernel arg management
if (!managedBuffer_.create(
dev().settings().useDeviceKernelArg_ ? Resource::Persistent : Resource::RemoteUSWC)) {
if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ ==
KernelArgImpl::DeviceKernelArgs
? Resource::Persistent
: Resource::RemoteUSWC)) {
// Try just USWC if persistent memory failed
if (dev().settings().useDeviceKernelArg_) {
if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
if (!managedBuffer_.create(Resource::RemoteUSWC)) {
return false;
}
+2
Просмотреть файл
@@ -721,6 +721,8 @@ class VirtualGPU : public device::VirtualDevice {
MemoryRange sdmaRange_; //!< SDMA memory range for write access
void* hostcallBuffer_; //!< Hostcall buffer
using KernelArgImpl = device::Settings::KernelArgImpl;
};
inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) {
+5 -11
Просмотреть файл
@@ -141,7 +141,7 @@ bool NullDevice::create(const amd::Isa &isa) {
roc::Settings* hsaSettings = new roc::Settings();
settings_ = hsaSettings;
if (!hsaSettings ||
!hsaSettings->create(false, isa.versionMajor(), isa.versionMinor(), isa.versionStepping(),
!hsaSettings->create(false, isa,
isa.xnack() == amd::Isa::Feature::Enabled)) {
LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId());
return false;
@@ -734,23 +734,17 @@ bool Device::create() {
info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL;
info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL;
bool device_kernel_args = true;
if (!isXgmi_ && ((info_.hdpMemFlushCntl == nullptr) || (info_.hdpRegFlushCntl == nullptr))) {
LogWarning("Unable to determine HDP flush register address. "
"Device kernel arguments are not supported");
device_kernel_args = false;
}
bool hasValidHDPFlush =
(info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr);
// Create HSA settings
assert(!settings_);
roc::Settings* hsaSettings = new roc::Settings();
settings_ = hsaSettings;
if (!hsaSettings ||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), isa->versionMajor(),
isa->versionMinor(), isa->versionStepping(),
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
isa->xnack() == amd::Isa::Feature::Enabled,
coop_groups, device_kernel_args)) {
coop_groups, isXgmi_, hasValidHDPFlush)) {
LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
pciDeviceId_);
return false;
+51 -14
Просмотреть файл
@@ -95,15 +95,20 @@ Settings::Settings() {
fgs_kernel_arg_ = false;
barrier_value_packet_ = false;
device_kernel_args_ = false;
kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
gwsInitSupported_ = true;
limit_blit_wg_ = 16;
}
// ================================================================================================
bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor,
uint32_t gfxStepping, bool enableXNACK, bool coop_groups,
bool device_kernel_args) {
bool Settings::create(bool fullProfile, const amd::Isa& isa,
bool enableXNACK, bool coop_groups,
bool isXgmi, bool hasValidHDPFlush) {
uint32_t gfxipMajor = isa.versionMajor();
uint32_t gfxipMinor = isa.versionMinor();
uint32_t gfxStepping = isa.versionStepping();
customHostAllocator_ = false;
if (fullProfile) {
@@ -166,12 +171,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
barrier_value_packet_ = true;
}
// Enable device kernel args for MI300* for now
if (gfxipMajor == 9 && gfxipMinor == 4 &&
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) {
device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args;
kernel_arg_opt_ = true;
}
setKernelArgImpl(isa, isXgmi, hasValidHDPFlush);
if (gfxipMajor >= 10) {
enableWave32Mode_ = true;
@@ -235,14 +235,51 @@ void Settings::override() {
fgs_kernel_arg_ = ROC_USE_FGS_KERNARG;
}
if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
}
if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) {
kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT;
}
}
// ================================================================================================
//! Chooses the kernel argument implementation for the given ISA and link type.
//!
//! A candidate implementation is derived from the ASIC family, the XGMI flag
//! and the availability of the HDP flush register. The candidate is written to
//! kernel_arg_impl_ only for MI300, or when HIP_FORCE_DEV_KERNARG has been set
//! explicitly; otherwise kernel_arg_impl_ is left untouched.
//!
//! \param isa               ISA whose major/minor/stepping identify the ASIC
//! \param isXgmi            true if the device is XGMI-connected
//! \param hasValidHDPFlush  true if the HDP flush register is usable
void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush) {
  const uint32_t major = isa.versionMajor();
  const uint32_t minor = isa.versionMinor();
  const uint32_t stepping = isa.versionStepping();

  // ASIC family classification from the gfx IP version triple
  const bool gfx9 = (major == 9);
  const bool mi300 = gfx9 && (minor == 4) && (stepping <= 2);
  const bool mi200 = gfx9 && (minor == 0) && (stepping == 10);
  const bool mi100 = gfx9 && (minor == 0) && (stepping == 8);
  const bool navi = (major >= 10);

  const auto candidate = [&]() {
    if (isXgmi) {
      // The XGMI-connected path does not require the manual memory ordering
      // workarounds that the PCIe-connected path requires
      return KernelArgImpl::DeviceKernelArgs;
    }
    if (mi300 || mi200) {
      // Kernel argument readback workaround. It works only on MI200 and MI300
      // because of the strict guarantee on ordering of stores in those ASICs
      return KernelArgImpl::DeviceKernelArgsReadback;
    }
    if (hasValidHDPFlush && (navi || mi100)) {
      // For gfx10+ and MI100 use the HDP flush to MMIO, provided the HDP
      // flush register is valid
      return KernelArgImpl::DeviceKernelArgsHDP;
    }
    // Everything else keeps kernel arguments in host memory
    return KernelArgImpl::HostKernelArgs;
  }();

  // Enable device kernel args for MI300* for now
  if (mi300) {
    kernel_arg_impl_ = candidate;
    kernel_arg_opt_ = true;
  }

  // An explicitly set HIP_FORCE_DEV_KERNARG overrides the default: a non-zero
  // value applies the candidate on any ASIC, zero forces host kernel arguments
  if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
    kernel_arg_impl_ = HIP_FORCE_DEV_KERNARG ? candidate : KernelArgImpl::HostKernelArgs;
  }
}
} // namespace roc
#endif // WITHOUT_HSA_BACKEND
+8 -5
Просмотреть файл
@@ -52,8 +52,7 @@ class Settings : public device::Settings {
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory
uint reserved_ : 20;
uint reserved_ : 21;
};
uint value_;
};
@@ -83,9 +82,9 @@ class Settings : public device::Settings {
Settings();
//! Creates settings
bool create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, uint32_t gfxStepping,
bool enableXNACK, bool coop_groups = false,
bool device_kernel_args = true);
bool create(bool fullProfile, const amd::Isa &isa, bool enableXNACK,
bool coop_groups = false, bool isXgmi = false,
bool hasValidHDPFlush = true);
private:
//! Disable copy constructor
@@ -96,6 +95,10 @@ class Settings : public device::Settings {
//! Overrides current settings based on registry/environment
void override();
//! Determine how kernel arguments should be implemented given ASIC (host
//! memory, device memory, device memory with memory ordering workaround)
void setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush);
};
/*@}*/} // namespace roc
+30 -17
Просмотреть файл
@@ -1363,7 +1363,8 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
active_chunk_ = 0;
if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
roc_device_.info().largeBar_) {
kernarg_pool_base_ =
reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
} else {
@@ -3201,11 +3202,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}
}
const auto pcieKernargs = !dev().isXgmi() &&
dev().settings().device_kernel_args_ &&
roc_device_.info().largeBar_;
address argBuffer = hidden_arguments;
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
const auto kernArgImpl = dev().settings().kernel_arg_impl_;
const auto applyMemOrderingWA =
((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
(kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
roc_device_.info().largeBar_ && argSize > 0 && !isGraphCapture;
// Find all parameters for the current kernel
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
@@ -3213,16 +3218,23 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if (isGraphCapture) {
argBuffer = vcmd->getKernArgOffset();
} else {
const auto kernargSize = gpuKernel.KernargSegmentByteSize();
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
gpuKernel.KernargSegmentAlignment()));
argBuffer = reinterpret_cast<address>(
allocKernArg(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment()));
}
// Load all kernel arguments
nontemporalMemcpy(argBuffer, parameters,
std::min(gpuKernel.KernargSegmentByteSize(),
signature.paramsSize()));
if (pcieKernargs && !isGraphCapture) {
*dev().info().hdpMemFlushCntl = 1u;
nontemporalMemcpy(argBuffer, parameters, argSize);
if (applyMemOrderingWA) {
// Memory ordering workaround for pcie: execute sfence followed by
// write the last byte of kernarg
_mm_sfence();
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
// HDP flush is required to guarantee ordering in Navi and MI100
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
*dev().info().hdpMemFlushCntl = 1u;
}
}
}
@@ -3284,10 +3296,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
if (pcieKernargs && !isGraphCapture) {
if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
LogError("Unexpected HDP Register readback value!");
}
if (applyMemOrderingWA) {
// Memory ordering workaround for pcie: execute mfence followed by
// read of the last byte of kernarg
_mm_mfence();
volatile char kSentinel = *(argBuffer + argSize - 1);
}
if (vcmd == nullptr) {
// Dispatch the packet
+2
Просмотреть файл
@@ -570,5 +570,7 @@ class VirtualGPU : public device::VirtualDevice {
bool fence_dirty_; //!< Fence modified flag
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
using KernelArgImpl = device::Settings::KernelArgImpl;
};
}