diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index 3202ca2fb8..a2c3532f5a 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -455,6 +455,9 @@ bool Device::BlitProgram::create(amd::Device* device, const std::string& extraKe if (!GPU_DUMP_BLIT_KERNELS) { opt += " -fno-enable-dump"; } + if (device->settings().kernel_arg_opt_) { + opt += " -Wb,-amdgpu-kernarg-preload-count=8 "; + } if ((retval = program_->build(devices, opt.c_str(), nullptr, nullptr, GPU_DUMP_BLIT_KERNELS)) != CL_SUCCESS) { DevLogPrintfError("Build failed for Kernel: %s with error code %d\n", @@ -775,11 +778,11 @@ bool Device::disableP2P(amd::Device* ptrDev) { } bool Device::UpdateStackSize(uint64_t stackSize) { - // Amount of space used by each wave is in units of 256 dwords. + // Amount of space used by each wave is in units of 256 dwords. // As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12 - // The field size supports a range of 0->(2M-256) dwords per wave64. + // The field size supports a range of 0->(2M-256) dwords per wave64. // Per lane this works out to 131056 bytes or 128K - 16 - uint64_t kStackSize = ((128 * Ki) - 16); + uint64_t kStackSize = ((128 * Ki) - 16); if (stackSize > kStackSize) { return false; } diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index c73b2b527a..401c6613d8 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -674,7 +674,8 @@ class Settings : public amd::HeapObject { uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine. - uint reserved_ : 10; + uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels + uint reserved_ : 9; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index a1688794ac..fff1133f60 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -170,6 +170,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor if (gfxipMajor == 9 && gfxipMinor == 4 && (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) { device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args; + kernel_arg_opt_ = true; } if (gfxipMajor >= 10) { @@ -237,6 +238,10 @@ void Settings::override() { if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) { device_kernel_args_ = HIP_FORCE_DEV_KERNARG; } + + if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) { + kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT; + } } } // namespace roc diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 0230eddb25..a5f43c1179 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -223,6 +223,8 @@ release(uint, ROC_SIGNAL_POOL_SIZE, 64, \ "Initial size of HSA signal pool") \ release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \ "Limit the number of workgroups in blit operations") \ +release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \ + "Enable blit kernel arguments optimization") \ release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \