SWDEV-451594 - Implement Readback and Avoid HDP Flush workaround for device kernel args
Change-Id: I6d41a089a17f55306e7ff402588a1e831b20a7a7
Этот коммит содержится в:
@@ -395,19 +395,28 @@ hipError_t GraphExec::CaptureAQLPackets() {
|
||||
}
|
||||
}
|
||||
|
||||
if (device_kernarg_pool_ && !device->isXgmi()) {
|
||||
if (device->info().hdpMemFlushCntl != nullptr) {
|
||||
auto kernArgImpl = device->settings().kernel_arg_impl_;
|
||||
|
||||
const auto applyMemOrderingWA =
|
||||
((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
|
||||
(kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
|
||||
kernarg_pool_size_graph_ > 0;
|
||||
|
||||
if (device_kernarg_pool_ && applyMemOrderingWA) {
|
||||
address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_;
|
||||
volatile char kSentinel = *(dev_ptr - 1);
|
||||
// Memory ordering workaround for pcie: execute sfence followed by
|
||||
// write the last byte of kernarg.
|
||||
_mm_sfence();
|
||||
*(dev_ptr - 1) = kSentinel;
|
||||
// HDP flush is required to guarantee ordering in Navi and MI100
|
||||
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
|
||||
*device->info().hdpMemFlushCntl = 1u;
|
||||
if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
|
||||
LogError("Unexpected HDP Register readback value!");
|
||||
}
|
||||
} else {
|
||||
amd::Command* command = new amd::Marker(*capture_stream_, true);
|
||||
if (command != nullptr) {
|
||||
command->enqueue();
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
// Memory ordering workaround for pcie: execute mfence followed by
|
||||
// read of the last byte of kernarg.
|
||||
_mm_mfence();
|
||||
kSentinel = *(dev_ptr - 1);
|
||||
}
|
||||
|
||||
ResetQueueIndex();
|
||||
|
||||
@@ -647,6 +647,8 @@ struct GraphExec {
|
||||
// Capture GPU Packets from graph commands
|
||||
hipError_t CaptureAQLPackets();
|
||||
hipError_t UpdateAQLPacket(hip::GraphKernelNode* node);
|
||||
|
||||
using KernelArgImpl = device::Settings::KernelArgImpl;
|
||||
};
|
||||
|
||||
struct ChildGraphNode : public GraphNode {
|
||||
|
||||
@@ -651,6 +651,19 @@ struct Info : public amd::EmbeddedObject {
|
||||
//! Device settings
|
||||
class Settings : public amd::HeapObject {
|
||||
public:
|
||||
|
||||
//! Selects where kernel arguments are placed and which memory ordering
//! workaround (if any) accompanies them
enum KernelArgImpl {
  //! Kernel arguments are put into host memory
  HostKernelArgs = 0,
  //! Kernel arguments are put into device memory and no memory ordering
  //! workaround is applied (e.g. XGMI)
  DeviceKernelArgs = 1,
  //! Kernel arguments are put into device memory and the kernel argument
  //! readback workaround is applied (works only on ASICs >= MI200)
  DeviceKernelArgsReadback = 2,
  //! Kernel arguments are put into device memory and the kernel argument
  //! readback plus HDP flush workaround is applied. Works on all ASICs, but
  //! requires a valid HDP flush register
  DeviceKernelArgsHDP = 3
};
|
||||
|
||||
uint64_t extensions_; //!< Supported OCL extensions
|
||||
union {
|
||||
struct {
|
||||
@@ -675,7 +688,8 @@ class Settings : public amd::HeapObject {
|
||||
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
|
||||
uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine.
|
||||
uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels
|
||||
uint reserved_ : 9;
|
||||
uint kernel_arg_impl_ : 2; //!< Kernel argument implementation
|
||||
uint reserved_ : 7;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -143,7 +143,9 @@ Settings::Settings() {
|
||||
alwaysResident_ = amd::IS_HIP ? true : false;
|
||||
prepinnedMinSize_ = 0;
|
||||
cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
|
||||
useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG;
|
||||
kernel_arg_impl_ = flagIsDefault(HIP_FORCE_DEV_KERNARG)
|
||||
? KernelArgImpl::DeviceKernelArgs
|
||||
: HIP_FORCE_DEV_KERNARG;
|
||||
|
||||
limit_blit_wg_ = 16;
|
||||
DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL
|
||||
|
||||
@@ -79,8 +79,7 @@ class Settings : public device::Settings {
|
||||
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
|
||||
uint disableSdma_ : 1; //!< Disable SDMA support
|
||||
uint alwaysResident_ : 1; //!< Make resources resident at allocation time
|
||||
uint useDeviceKernelArg_ : 1; //!< Use persistent memory for kernel arguments
|
||||
uint reserved_ : 9;
|
||||
uint reserved_ : 10;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -139,6 +138,8 @@ class Settings : public device::Settings {
|
||||
|
||||
//! Overrides current settings based on registry/environment
|
||||
void override();
|
||||
|
||||
using KernelArgImpl = device::Settings::KernelArgImpl;
|
||||
};
|
||||
|
||||
/*@}*/ // namespace pal
|
||||
|
||||
@@ -408,7 +408,7 @@ bool VirtualGPU::Queue::flush() {
|
||||
submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];
|
||||
|
||||
if (iQueue_->Type() == Pal::QueueTypeCompute) {
|
||||
if (settings.useDeviceKernelArg_) {
|
||||
if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
|
||||
// If runtime uses device memory for kernel arguments, then perform a CPU read back on
|
||||
// submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
|
||||
gpu_.managedBuffer().CpuReadBack();
|
||||
@@ -955,10 +955,12 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
}
|
||||
|
||||
// Create buffers for kernel arg management
|
||||
if (!managedBuffer_.create(
|
||||
dev().settings().useDeviceKernelArg_ ? Resource::Persistent : Resource::RemoteUSWC)) {
|
||||
if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ ==
|
||||
KernelArgImpl::DeviceKernelArgs
|
||||
? Resource::Persistent
|
||||
: Resource::RemoteUSWC)) {
|
||||
// Try just USWC if persistent memory failed
|
||||
if (dev().settings().useDeviceKernelArg_) {
|
||||
if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
|
||||
if (!managedBuffer_.create(Resource::RemoteUSWC)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -721,6 +721,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
MemoryRange sdmaRange_; //!< SDMA memory range for write access
|
||||
|
||||
void* hostcallBuffer_; //!< Hostcall buffer
|
||||
|
||||
using KernelArgImpl = device::Settings::KernelArgImpl;
|
||||
};
|
||||
|
||||
inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) {
|
||||
|
||||
@@ -141,7 +141,7 @@ bool NullDevice::create(const amd::Isa &isa) {
|
||||
roc::Settings* hsaSettings = new roc::Settings();
|
||||
settings_ = hsaSettings;
|
||||
if (!hsaSettings ||
|
||||
!hsaSettings->create(false, isa.versionMajor(), isa.versionMinor(), isa.versionStepping(),
|
||||
!hsaSettings->create(false, isa,
|
||||
isa.xnack() == amd::Isa::Feature::Enabled)) {
|
||||
LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId());
|
||||
return false;
|
||||
@@ -734,23 +734,17 @@ bool Device::create() {
|
||||
|
||||
info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL;
|
||||
info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL;
|
||||
|
||||
bool device_kernel_args = true;
|
||||
if (!isXgmi_ && ((info_.hdpMemFlushCntl == nullptr) || (info_.hdpRegFlushCntl == nullptr))) {
|
||||
LogWarning("Unable to determine HDP flush register address. "
|
||||
"Device kernel arguments are not supported");
|
||||
device_kernel_args = false;
|
||||
}
|
||||
bool hasValidHDPFlush =
|
||||
(info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr);
|
||||
|
||||
// Create HSA settings
|
||||
assert(!settings_);
|
||||
roc::Settings* hsaSettings = new roc::Settings();
|
||||
settings_ = hsaSettings;
|
||||
if (!hsaSettings ||
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), isa->versionMajor(),
|
||||
isa->versionMinor(), isa->versionStepping(),
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
|
||||
isa->xnack() == amd::Isa::Feature::Enabled,
|
||||
coop_groups, device_kernel_args)) {
|
||||
coop_groups, isXgmi_, hasValidHDPFlush)) {
|
||||
LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
|
||||
@@ -95,15 +95,20 @@ Settings::Settings() {
|
||||
fgs_kernel_arg_ = false;
|
||||
barrier_value_packet_ = false;
|
||||
|
||||
device_kernel_args_ = false;
|
||||
kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
|
||||
gwsInitSupported_ = true;
|
||||
limit_blit_wg_ = 16;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor,
|
||||
uint32_t gfxStepping, bool enableXNACK, bool coop_groups,
|
||||
bool device_kernel_args) {
|
||||
bool Settings::create(bool fullProfile, const amd::Isa& isa,
|
||||
bool enableXNACK, bool coop_groups,
|
||||
bool isXgmi, bool hasValidHDPFlush) {
|
||||
|
||||
uint32_t gfxipMajor = isa.versionMajor();
|
||||
uint32_t gfxipMinor = isa.versionMinor();
|
||||
uint32_t gfxStepping = isa.versionStepping();
|
||||
|
||||
customHostAllocator_ = false;
|
||||
|
||||
if (fullProfile) {
|
||||
@@ -166,12 +171,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
|
||||
barrier_value_packet_ = true;
|
||||
}
|
||||
|
||||
// Enable device kernel args for MI300* for now
|
||||
if (gfxipMajor == 9 && gfxipMinor == 4 &&
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) {
|
||||
device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args;
|
||||
kernel_arg_opt_ = true;
|
||||
}
|
||||
setKernelArgImpl(isa, isXgmi, hasValidHDPFlush);
|
||||
|
||||
if (gfxipMajor >= 10) {
|
||||
enableWave32Mode_ = true;
|
||||
@@ -235,14 +235,51 @@ void Settings::override() {
|
||||
fgs_kernel_arg_ = ROC_USE_FGS_KERNARG;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
|
||||
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) {
|
||||
kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT;
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Determines how kernel arguments are implemented for the given ASIC.
//!
//! \param isa              ISA of the device; only its major/minor/stepping
//!                         version numbers are consulted
//! \param isXgmi           the device is reached through XGMI rather than PCIe
//! \param hasValidHDPFlush the HDP flush register addresses are known
//!
//! The implementation chosen here is stored in kernel_arg_impl_ only for
//! MI300, or when HIP_FORCE_DEV_KERNARG was explicitly set; otherwise
//! kernel_arg_impl_ is left untouched.
void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush) {
  const uint32_t major = isa.versionMajor();
  const uint32_t minor = isa.versionMinor();
  const uint32_t stepping = isa.versionStepping();

  // Classify the ASIC from its gfxip version triple
  const bool isGfx9 = (major == 9);
  const bool isMI300 = isGfx9 && (minor == 4) && (stepping <= 2);
  const bool isMI200 = isGfx9 && (minor == 0) && (stepping == 10);
  const bool isMI100 = isGfx9 && (minor == 0) && (stepping == 8);
  const bool isNavi = (major >= 10);

  // Pick the implementation this ASIC / link combination can support
  const KernelArgImpl supported = [&]() {
    if (isXgmi) {
      // The XGMI-connected path needs none of the manual memory ordering
      // workarounds that the PCIe-connected path requires
      return KernelArgImpl::DeviceKernelArgs;
    }
    if (isMI300 || isMI200) {
      // Kernel argument readback workaround. It works only on MI200 and
      // MI300 because of the strict store ordering guarantee on those ASICs
      return KernelArgImpl::DeviceKernelArgsReadback;
    }
    if (hasValidHDPFlush && (isNavi || isMI100)) {
      // gfx10+ and MI100 use the HDP flush to MMIO, provided that the HDP
      // flush register is valid
      return KernelArgImpl::DeviceKernelArgsHDP;
    }
    // Everything else keeps kernel arguments in host memory
    return KernelArgImpl::HostKernelArgs;
  }();

  // Enable device kernel args for MI300* for now
  if (isMI300) {
    kernel_arg_impl_ = supported;
    kernel_arg_opt_ = true;
  }

  // An explicit HIP_FORCE_DEV_KERNARG overrides the default: a non-zero value
  // applies the supported implementation, zero falls back to host memory
  if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
    kernel_arg_impl_ = HIP_FORCE_DEV_KERNARG ? supported : KernelArgImpl::HostKernelArgs;
  }
}
|
||||
} // namespace roc
|
||||
|
||||
#endif // WITHOUT_HSA_BACKEND
|
||||
|
||||
@@ -52,8 +52,7 @@ class Settings : public device::Settings {
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory
|
||||
uint reserved_ : 20;
|
||||
uint reserved_ : 21;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -83,9 +82,9 @@ class Settings : public device::Settings {
|
||||
Settings();
|
||||
|
||||
//! Creates settings
|
||||
bool create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, uint32_t gfxStepping,
|
||||
bool enableXNACK, bool coop_groups = false,
|
||||
bool device_kernel_args = true);
|
||||
bool create(bool fullProfile, const amd::Isa &isa, bool enableXNACK,
|
||||
bool coop_groups = false, bool isXgmi = false,
|
||||
bool hasValidHDPFlush = true);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
@@ -96,6 +95,10 @@ class Settings : public device::Settings {
|
||||
|
||||
//! Overrides current settings based on registry/environment
|
||||
void override();
|
||||
|
||||
//! Determine how kernel arguments should be implemented given ASIC (host
|
||||
//! memory, device memory, device memory with memory ordering workaround)
|
||||
void setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush);
|
||||
};
|
||||
|
||||
/*@}*/} // namespace roc
|
||||
|
||||
@@ -1363,7 +1363,8 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
active_chunk_ = 0;
|
||||
if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
|
||||
if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
|
||||
roc_device_.info().largeBar_) {
|
||||
kernarg_pool_base_ =
|
||||
reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
|
||||
} else {
|
||||
@@ -3201,11 +3202,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
}
|
||||
}
|
||||
|
||||
const auto pcieKernargs = !dev().isXgmi() &&
|
||||
dev().settings().device_kernel_args_ &&
|
||||
roc_device_.info().largeBar_;
|
||||
address argBuffer = hidden_arguments;
|
||||
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
|
||||
size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
|
||||
|
||||
const auto kernArgImpl = dev().settings().kernel_arg_impl_;
|
||||
const auto applyMemOrderingWA =
|
||||
((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
|
||||
(kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
|
||||
roc_device_.info().largeBar_ && argSize > 0 && !isGraphCapture;
|
||||
|
||||
// Find all parameters for the current kernel
|
||||
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
|
||||
@@ -3213,16 +3218,23 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
if (isGraphCapture) {
|
||||
argBuffer = vcmd->getKernArgOffset();
|
||||
} else {
|
||||
const auto kernargSize = gpuKernel.KernargSegmentByteSize();
|
||||
argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
|
||||
gpuKernel.KernargSegmentAlignment()));
|
||||
|
||||
argBuffer = reinterpret_cast<address>(
|
||||
allocKernArg(gpuKernel.KernargSegmentByteSize(),
|
||||
gpuKernel.KernargSegmentAlignment()));
|
||||
}
|
||||
// Load all kernel arguments
|
||||
nontemporalMemcpy(argBuffer, parameters,
|
||||
std::min(gpuKernel.KernargSegmentByteSize(),
|
||||
signature.paramsSize()));
|
||||
if (pcieKernargs && !isGraphCapture) {
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
|
||||
nontemporalMemcpy(argBuffer, parameters, argSize);
|
||||
|
||||
if (applyMemOrderingWA) {
|
||||
// Memory ordering workaround for pcie: execute sfence followed by
|
||||
// write the last byte of kernarg
|
||||
_mm_sfence();
|
||||
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
|
||||
// HDP flush is required to guarantee ordering in Navi and MI100
|
||||
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3284,10 +3296,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
}
|
||||
if (pcieKernargs && !isGraphCapture) {
|
||||
if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
|
||||
LogError("Unexpected HDP Register readback value!");
|
||||
}
|
||||
if (applyMemOrderingWA) {
|
||||
// Memory ordering workaround for pcie: execute mfence followed by
|
||||
// read of the last byte of kernarg
|
||||
_mm_mfence();
|
||||
volatile char kSentinel = *(argBuffer + argSize - 1);
|
||||
}
|
||||
if (vcmd == nullptr) {
|
||||
// Dispatch the packet
|
||||
|
||||
@@ -570,5 +570,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool fence_dirty_; //!< Fence modified flag
|
||||
|
||||
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
|
||||
|
||||
using KernelArgImpl = device::Settings::KernelArgImpl;
|
||||
};
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user