From 6a8bc3c71866e8cfc163b99305e4c38c5507b36d Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 11 Sep 2023 15:54:46 -0400 Subject: [PATCH] SWDEV-419688 - Do not run GWS init kernel for targets > gfx12 and MI300. Change-Id: I8e7441268978be71ab8a5a33e7f8bcf69660e500 (cherry picked from commit 36d37ef614909c0f215512aac0c133408d787080) --- rocclr/device/blitcl.cpp | 17 +++++++++++++++++ rocclr/device/device.hpp | 3 ++- rocclr/device/rocm/rocblit.cpp | 5 +++++ rocclr/device/rocm/rocdevice.cpp | 8 +++++++- rocclr/device/rocm/rocsettings.cpp | 13 ++++++------- rocclr/device/rocm/rocsettings.hpp | 3 +-- rocclr/device/rocm/rocvirtual.cpp | 2 +- 7 files changed, 39 insertions(+), 12 deletions(-) diff --git a/rocclr/device/blitcl.cpp b/rocclr/device/blitcl.cpp index 6d6a507092..dbcd8e475a 100644 --- a/rocclr/device/blitcl.cpp +++ b/rocclr/device/blitcl.cpp @@ -118,6 +118,23 @@ const char* HipExtraSourceCode = BLIT_KERNELS( } ); +const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS( + __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, + ulong value, ulong sizeBytes) { + __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes); + } + + __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, + ulong value, ulong flags, ulong mask) { + __amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask); + } + + __kernel void __amd_rocclr_initHeap(ulong heap_to_initialize, ulong initial_blocks, + uint heap_size, uint number_of_initial_blocks) { + __ockl_dm_init_v1(heap_to_initialize, initial_blocks, heap_size, number_of_initial_blocks); + } +); + const char* BlitImageSourceCode = BLIT_KERNELS( // Extern extern void __amd_fillImage(__write_only image2d_array_t, float4, int4, uint4, int4, int4, diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index d8dc24a22e..dc147aa775 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -659,7 +659,8 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 14; + uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine. + uint reserved_ : 10; }; uint value_; }; diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index d470a1f997..4c4549f5a3 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -2785,6 +2785,11 @@ bool KernelBlitManager::RunGwsInit( uint32_t value) const { amd::ScopedLock k(lockXferOps_); + if (dev().settings().gwsInitSupported_ == false) { + LogError("GWS Init is not supported on this target"); + return false; + } + size_t globalWorkOffset[1] = { 0 }; size_t globalWorkSize[1] = { 1 }; size_t localWorkSize[1] = { 1 }; diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 0248d234ce..2d17a71c84 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -96,6 +96,7 @@ bool getValueFromIsaMeta(amd_comgr_metadata_node_t& isaMeta, const char* key, namespace device { extern const char* HipExtraSourceCode; +extern const char* HipExtraSourceCodeNoGWS; } // namespace device namespace roc { @@ -847,7 +848,11 @@ bool Device::createBlitProgram() { #if defined(USE_COMGR_LIBRARY) if (settings().useLightning_) { if (amd::IS_HIP) { - extraKernel = device::HipExtraSourceCode; + if (settings().gwsInitSupported_) { + extraKernel = device::HipExtraSourceCode; + } else { + extraKernel = device::HipExtraSourceCodeNoGWS; + } } else { extraKernel = SchedulerSourceCode; } @@ -1752,6 +1757,7 @@ bool Device::populateOCLDeviceConstants() { info_.sgprsPerSimd_ = std::numeric_limits::max(); // gfx10+ does not share SGPRs between waves } + return true; } diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp index ab59b07b85..b5ce63e867 100644 --- a/rocclr/device/rocm/rocsettings.cpp +++ b/rocclr/device/rocm/rocsettings.cpp @@ -92,9 +92,9 @@ Settings::Settings() { // Use coarse grain system memory for kernel arguments by default (to keep GPU cache) fgs_kernel_arg_ = false; - // by default for asics < gfx940 old single grid sync path is followed - coop_sync_ = false; barrier_value_packet_ = false; + + gwsInitSupported_ = true; } // ================================================================================================ @@ -173,13 +173,12 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE; } - // No GWS init kernel necessary for these archs - if ((gfxipMajor == 9 && gfxipMinor == 4) || gfxipMajor >= 11) { - coop_sync_ = true; - } - lcWavefrontSize64_ = !enableWave32Mode_; + if (gfxipMajor > 10 || (gfxipMajor == 9 && gfxipMinor == 4)) { + gwsInitSupported_ = false; + } + // Override current device settings override(); diff --git a/rocclr/device/rocm/rocsettings.hpp b/rocclr/device/rocm/rocsettings.hpp index 4f745f8521..b17154d188 100644 --- a/rocclr/device/rocm/rocsettings.hpp +++ b/rocclr/device/rocm/rocsettings.hpp @@ -51,9 +51,8 @@ class Settings : public device::Settings { uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment - uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 20; + uint reserved_ : 21; }; uint value_; }; diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 7f0f433e76..95e07f3b79 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -3284,7 +3284,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // Add a dependency into the device queue on the current queue queue->Barriers().AddExternalSignal(Barriers().GetLastSignal()); - if (!dev().settings().coop_sync_) { + if (dev().settings().gwsInitSupported_ == true) { uint32_t workgroups = vcmd.numWorkgroups(); static_cast(queue->blitMgr()).RunGwsInit(workgroups - 1); }