SWDEV-455254 - Add kernel arg optimization
Add kernel argument optimization to the blit path.
It is enabled by default on MI300.
Change-Id: I2694a81b90d48ad07d86dfe4c0c64fe187bada8e
[ROCm/clr commit: f0c7ecf617]
This commit is contained in:
@@ -455,6 +455,9 @@ bool Device::BlitProgram::create(amd::Device* device, const std::string& extraKe
|
||||
if (!GPU_DUMP_BLIT_KERNELS) {
|
||||
opt += " -fno-enable-dump";
|
||||
}
|
||||
if (device->settings().kernel_arg_opt_) {
|
||||
opt += " -Wb,-amdgpu-kernarg-preload-count=8 ";
|
||||
}
|
||||
if ((retval = program_->build(devices, opt.c_str(), nullptr, nullptr, GPU_DUMP_BLIT_KERNELS))
|
||||
!= CL_SUCCESS) {
|
||||
DevLogPrintfError("Build failed for Kernel: %s with error code %d\n",
|
||||
@@ -775,11 +778,11 @@ bool Device::disableP2P(amd::Device* ptrDev) {
|
||||
}
|
||||
|
||||
bool Device::UpdateStackSize(uint64_t stackSize) {
|
||||
// Amount of space used by each wave is in units of 256 dwords.
|
||||
// Amount of space used by each wave is in units of 256 dwords.
|
||||
// As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12
|
||||
// The field size supports a range of 0->(2M-256) dwords per wave64.
|
||||
// The field size supports a range of 0->(2M-256) dwords per wave64.
|
||||
// Per lane this works out to 131056 bytes or 128K - 16
|
||||
uint64_t kStackSize = ((128 * Ki) - 16);
|
||||
uint64_t kStackSize = ((128 * Ki) - 16);
|
||||
if (stackSize > kStackSize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -674,7 +674,8 @@ class Settings : public amd::HeapObject {
|
||||
uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet
|
||||
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
|
||||
uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine.
|
||||
uint reserved_ : 10;
|
||||
uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels
|
||||
uint reserved_ : 9;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -170,6 +170,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
|
||||
if (gfxipMajor == 9 && gfxipMinor == 4 &&
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) {
|
||||
device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args;
|
||||
kernel_arg_opt_ = true;
|
||||
}
|
||||
|
||||
if (gfxipMajor >= 10) {
|
||||
@@ -237,6 +238,10 @@ void Settings::override() {
|
||||
if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
|
||||
device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) {
|
||||
kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT;
|
||||
}
|
||||
}
|
||||
} // namespace roc
|
||||
|
||||
|
||||
@@ -223,6 +223,8 @@ release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
|
||||
"Initial size of HSA signal pool") \
|
||||
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
|
||||
"Limit the number of workgroups in blit operations") \
|
||||
release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \
|
||||
"Enable blit kernel arguments optimization") \
|
||||
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
|
||||
"If true, then runtime can skip kernel arg copy") \
|
||||
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
|
||||
|
||||
Reference in new issue
Block a user