diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index ec42414f8d..dccf7044a7 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -395,19 +395,28 @@ hipError_t GraphExec::CaptureAQLPackets() { } } - if (device_kernarg_pool_ && !device->isXgmi()) { - if (device->info().hdpMemFlushCntl != nullptr) { + auto kernArgImpl = device->settings().kernel_arg_impl_; + + const auto applyMemOrderingWA = + ((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) || + (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) && + kernarg_pool_size_graph_ > 0; + + if (device_kernarg_pool_ && applyMemOrderingWA) { + address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_; + volatile char kSentinel = *(dev_ptr - 1); + // Memory ordering workaround for PCIe: execute sfence followed by + // a write of the last byte of kernarg. + _mm_sfence(); + *(dev_ptr - 1) = kSentinel; + // HDP flush is required to guarantee ordering in Navi and MI100 + if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) { *device->info().hdpMemFlushCntl = 1u; - if (*device->info().hdpMemFlushCntl != UINT32_MAX) { - LogError("Unexpected HDP Register readback value!"); - } - } else { - amd::Command* command = new amd::Marker(*capture_stream_, true); - if (command != nullptr) { - command->enqueue(); - command->release(); - } } + // Memory ordering workaround for PCIe: execute mfence followed by + // a read of the last byte of kernarg. 
+ _mm_mfence(); + kSentinel = *(dev_ptr - 1); } ResetQueueIndex(); diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index c2b7298722..2e8a90c441 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -647,6 +647,8 @@ struct GraphExec { // Capture GPU Packets from graph commands hipError_t CaptureAQLPackets(); hipError_t UpdateAQLPacket(hip::GraphKernelNode* node); + + using KernelArgImpl = device::Settings::KernelArgImpl; }; struct ChildGraphNode : public GraphNode { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index b4b1afd1af..edea84f6f8 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -651,6 +651,19 @@ struct Info : public amd::EmbeddedObject { //! Device settings class Settings : public amd::HeapObject { public: + + enum KernelArgImpl { + HostKernelArgs = 0, //!< Kernel Arguments are put into host memory + DeviceKernelArgs, //!< Device memory kernel arguments with no memory + //!< ordering workaround (e.g. XGMI) + DeviceKernelArgsReadback, //!< Device memory kernel arguments with kernel + //!< argument readback workaround (works only in + //!< ASICS >= MI200) + DeviceKernelArgsHDP //!< Device memory kernel arguments with kernel + //!< argument readback plus HDP flush workaround. + //!< Works in all ASICS. Requires a valid hdp flush register + }; + uint64_t extensions_; //!< Supported OCL extensions union { struct { @@ -675,7 +688,8 @@ class Settings : public amd::HeapObject { uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine. 
uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels - uint reserved_ : 9; + uint kernel_arg_impl_ : 2; //!< Kernel argument implementation + uint reserved_ : 7; }; uint value_; }; diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index c68eaff6f8..dd01e758d8 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -143,7 +143,9 @@ Settings::Settings() { alwaysResident_ = amd::IS_HIP ? true : false; prepinnedMinSize_ = 0; cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki; - useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG; + kernel_arg_impl_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) + ? KernelArgImpl::DeviceKernelArgs + : HIP_FORCE_DEV_KERNARG; limit_blit_wg_ = 16; DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index 942c5c91bd..e3cd2c7f39 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -79,8 +79,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint useDeviceKernelArg_ : 1; //!< Use persistent memory for kernel arguments - uint reserved_ : 9; + uint reserved_ : 10; }; uint value_; }; @@ -139,6 +138,8 @@ class Settings : public device::Settings { //! 
Overrides current settings based on registry/environment void override(); + + using KernelArgImpl = device::Settings::KernelArgImpl; }; /*@}*/ // namespace pal diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index a20577b24c..a5228f9b08 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -408,7 +408,7 @@ bool VirtualGPU::Queue::flush() { submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_]; if (iQueue_->Type() == Pal::QueueTypeCompute) { - if (settings.useDeviceKernelArg_) { + if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) { // If runtime uses device memory for kernel arguments, then perform a CPU read back on // submission. That will make sure NBIO puches all previous CPU write requests through PCIE gpu_.managedBuffer().CpuReadBack(); @@ -955,10 +955,12 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, } // Create buffers for kernel arg management - if (!managedBuffer_.create( - dev().settings().useDeviceKernelArg_ ? Resource::Persistent : Resource::RemoteUSWC)) { + if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ == + KernelArgImpl::DeviceKernelArgs + ? 
Resource::Persistent + : Resource::RemoteUSWC)) { // Try just USWC if persistent memory failed - if (dev().settings().useDeviceKernelArg_) { + if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) { if (!managedBuffer_.create(Resource::RemoteUSWC)) { return false; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index bf46db9272..58bdb307bc 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -721,6 +721,8 @@ class VirtualGPU : public device::VirtualDevice { MemoryRange sdmaRange_; //!< SDMA memory range for write access void* hostcallBuffer_; //!< Hostcall buffer + + using KernelArgImpl = device::Settings::KernelArgImpl; }; inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index ab0d00015d..e046700e92 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -141,7 +141,7 @@ bool NullDevice::create(const amd::Isa &isa) { roc::Settings* hsaSettings = new roc::Settings(); settings_ = hsaSettings; if (!hsaSettings || - !hsaSettings->create(false, isa.versionMajor(), isa.versionMinor(), isa.versionStepping(), + !hsaSettings->create(false, isa, isa.xnack() == amd::Isa::Feature::Enabled)) { LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId()); return false; @@ -734,23 +734,17 @@ bool Device::create() { info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL; info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL; - - bool device_kernel_args = true; - if (!isXgmi_ && ((info_.hdpMemFlushCntl == nullptr) || (info_.hdpRegFlushCntl == nullptr))) { - LogWarning("Unable to determine HDP flush register address. 
" - "Device kernel arguments are not supported"); - device_kernel_args = false; - } + bool hasValidHDPFlush = + (info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr); // Create HSA settings assert(!settings_); roc::Settings* hsaSettings = new roc::Settings(); settings_ = hsaSettings; if (!hsaSettings || - !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), isa->versionMajor(), - isa->versionMinor(), isa->versionStepping(), + !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa, isa->xnack() == amd::Isa::Feature::Enabled, - coop_groups, device_kernel_args)) { + coop_groups, isXgmi_, hasValidHDPFlush)) { LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index fff1133f60..19bac547fd 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -95,15 +95,20 @@ Settings::Settings() { fgs_kernel_arg_ = false; barrier_value_packet_ = false; - device_kernel_args_ = false; + kernel_arg_impl_ = KernelArgImpl::HostKernelArgs; gwsInitSupported_ = true; limit_blit_wg_ = 16; } // ================================================================================================ -bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, - uint32_t gfxStepping, bool enableXNACK, bool coop_groups, - bool device_kernel_args) { +bool Settings::create(bool fullProfile, const amd::Isa& isa, + bool enableXNACK, bool coop_groups, + bool isXgmi, bool hasValidHDPFlush) { + + uint32_t gfxipMajor = isa.versionMajor(); + uint32_t gfxipMinor = isa.versionMinor(); + uint32_t gfxStepping = isa.versionStepping(); + customHostAllocator_ = false; if (fullProfile) { @@ -166,12 +171,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor barrier_value_packet_ = true; } - // Enable 
device kernel args for MI300* for now - if (gfxipMajor == 9 && gfxipMinor == 4 && - (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) { - device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args; - kernel_arg_opt_ = true; - } + setKernelArgImpl(isa, isXgmi, hasValidHDPFlush); if (gfxipMajor >= 10) { enableWave32Mode_ = true; @@ -235,14 +235,51 @@ void Settings::override() { fgs_kernel_arg_ = ROC_USE_FGS_KERNARG; } - if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) { - device_kernel_args_ = HIP_FORCE_DEV_KERNARG; - } - if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) { kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT; } } + +// ================================================================================================ +void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush) { + + const uint32_t gfxipMajor = isa.versionMajor(); + const uint32_t gfxipMinor = isa.versionMinor(); + const uint32_t gfxStepping = isa.versionStepping(); + + const bool isMI300 = gfxipMajor == 9 && gfxipMinor == 4 && + (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2); + const bool isMI200 = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10); + const bool isMI100 = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 8); + const bool isNavi = (gfxipMajor >= 10); + + auto kernelArgImpl = KernelArgImpl::HostKernelArgs; + + if (isXgmi) { + // The XGMI-connected path does not require the manual memory ordering + // workarounds that the PCIe connected path requires + kernelArgImpl = KernelArgImpl::DeviceKernelArgs; + } else if (isMI300 || isMI200) { + // Implement the kernel argument readback workaround. 
It works only on + // MI200, MI300 because of the strict guarantee on ordering of + // stores in those ASICS + kernelArgImpl = KernelArgImpl::DeviceKernelArgsReadback; + } else if (hasValidHDPFlush && (isNavi || isMI100)) { + // For dev >= gfx10 and MI100 ASICS implement the HDP flush to MMIO if the + // HDP flush register is valid + kernelArgImpl = KernelArgImpl::DeviceKernelArgsHDP; + } + + // Enable device kernel args for MI300* for now + if (isMI300) { + kernel_arg_impl_ = kernelArgImpl; + kernel_arg_opt_ = true; + } + + if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) { + kernel_arg_impl_ = kernelArgImpl & (HIP_FORCE_DEV_KERNARG ? 0xF : 0x0); + } +} } // namespace roc #endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index 4b6e384c48..7a250bb7be 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -52,8 +52,7 @@ class Settings : public device::Settings { uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory - uint reserved_ : 20; + uint reserved_ : 21; }; uint value_; }; @@ -83,9 +82,9 @@ class Settings : public device::Settings { Settings(); //! Creates settings - bool create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, uint32_t gfxStepping, - bool enableXNACK, bool coop_groups = false, - bool device_kernel_args = true); + bool create(bool fullProfile, const amd::Isa &isa, bool enableXNACK, + bool coop_groups = false, bool isXgmi = false, + bool hasValidHDPFlush = true); private: //! Disable copy constructor @@ -96,6 +95,10 @@ class Settings : public device::Settings { //! Overrides current settings based on registry/environment void override(); + + //! 
Determine how kernel arguments should be implemented for the given ASIC (host + //! memory, device memory, device memory with memory ordering workaround) + void setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush); }; /*@}*/} // namespace roc diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 53a8451f9c..f9f74313e2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1363,7 +1363,8 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal; active_chunk_ = 0; - if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) { + if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) && + roc_device_.info().largeBar_) { kernarg_pool_base_ = reinterpret_cast
(roc_device_.deviceLocalAlloc(kernarg_pool_size_)); } else { @@ -3201,11 +3202,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } } - const auto pcieKernargs = !dev().isXgmi() && - dev().settings().device_kernel_args_ && - roc_device_.info().largeBar_; address argBuffer = hidden_arguments; bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); + size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize()); + + const auto kernArgImpl = dev().settings().kernel_arg_impl_; + const auto applyMemOrderingWA = + ((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) || + (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) && + roc_device_.info().largeBar_ && argSize > 0 && !isGraphCapture; // Find all parameters for the current kernel if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { @@ -3213,16 +3218,23 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (isGraphCapture) { argBuffer = vcmd->getKernArgOffset(); } else { - const auto kernargSize = gpuKernel.KernargSegmentByteSize(); - argBuffer = reinterpret_cast(allocKernArg(kernargSize, - gpuKernel.KernargSegmentAlignment())); + + argBuffer = reinterpret_cast( + allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment())); } - // Load all kernel arguments - nontemporalMemcpy(argBuffer, parameters, - std::min(gpuKernel.KernargSegmentByteSize(), - signature.paramsSize())); - if (pcieKernargs && !isGraphCapture) { - *dev().info().hdpMemFlushCntl = 1u; + + nontemporalMemcpy(argBuffer, parameters, argSize); + + if (applyMemOrderingWA) { + // Memory ordering workaround for PCIe: execute sfence followed by + // a write of the last byte of kernarg + _mm_sfence(); + *(argBuffer + argSize - 1) = *(parameters + argSize - 1); + // HDP flush is required to guarantee ordering in Navi and MI100 + if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) { + *dev().info().hdpMemFlushCntl = 
1u; + } } } @@ -3284,10 +3296,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } - if (pcieKernargs && !isGraphCapture) { - if (*dev().info().hdpMemFlushCntl != UINT32_MAX) { - LogError("Unexpected HDP Register readback value!"); - } + if (applyMemOrderingWA) { + // Memory ordering workaround for PCIe: execute mfence followed by + // a read of the last byte of kernarg + _mm_mfence(); + volatile char kSentinel = *(argBuffer + argSize - 1); } if (vcmd == nullptr) { // Dispatch the packet diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index ea4e9943c9..f9f86207dc 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -570,5 +570,7 @@ class VirtualGPU : public device::VirtualDevice { bool fence_dirty_; //!< Fence modified flag std::atomic