SWDEV-455254 - Add kernel arg optimization

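Add kernel argument optimization to the blit path.
Enabled by default on MI300 (gfxip 9.4, steppings 0-2); the default can be
overridden with the DEBUG_CLR_BLIT_KERNARG_OPT flag.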

Change-Id: I2694a81b90d48ad07d86dfe4c0c64fe187bada8e


[ROCm/clr commit: f0c7ecf617]
Этот коммит содержится в:
German Andryeyev
2024-04-09 18:21:37 -04:00
родитель ed9c629ad6
Коммит f29d608ca3
4 изменённых файлов: 15 добавлений и 4 удалений
+6 -3
Просмотреть файл
@@ -455,6 +455,9 @@ bool Device::BlitProgram::create(amd::Device* device, const std::string& extraKe
if (!GPU_DUMP_BLIT_KERNELS) {
opt += " -fno-enable-dump";
}
if (device->settings().kernel_arg_opt_) {
opt += " -Wb,-amdgpu-kernarg-preload-count=8 ";
}
if ((retval = program_->build(devices, opt.c_str(), nullptr, nullptr, GPU_DUMP_BLIT_KERNELS))
!= CL_SUCCESS) {
DevLogPrintfError("Build failed for Kernel: %s with error code %d\n",
@@ -775,11 +778,11 @@ bool Device::disableP2P(amd::Device* ptrDev) {
}
bool Device::UpdateStackSize(uint64_t stackSize) {
// Amount of space used by each wave is in units of 256 dwords.
// Amount of space used by each wave is in units of 256 dwords.
// As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12
// The field size supports a range of 0->(2M-256) dwords per wave64.
// The field size supports a range of 0->(2M-256) dwords per wave64.
// Per lane this works out to 131056 bytes or 128K - 16
uint64_t kStackSize = ((128 * Ki) - 16);
uint64_t kStackSize = ((128 * Ki) - 16);
if (stackSize > kStackSize) {
return false;
}
+2 -1
Просмотреть файл
@@ -674,7 +674,8 @@ class Settings : public amd::HeapObject {
uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine.
uint reserved_ : 10;
uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels
uint reserved_ : 9;
};
uint value_;
};
+5
Просмотреть файл
@@ -170,6 +170,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
if (gfxipMajor == 9 && gfxipMinor == 4 &&
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) {
device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args;
kernel_arg_opt_ = true;
}
if (gfxipMajor >= 10) {
@@ -237,6 +238,10 @@ void Settings::override() {
if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
}
if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) {
kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT;
}
}
} // namespace roc
+2
Просмотреть файл
@@ -223,6 +223,8 @@ release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
"Initial size of HSA signal pool") \
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
"Limit the number of workgroups in blit operations") \
release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \
"Enable blit kernel arguments optimization") \
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
"If true, then runtime can skip kernel arg copy") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \