SWDEV-270773 - Fix workgroup calculation logic for GWS initialization
The existing workgroup calculation logic for GWS initialization is
incorrect. It tries to add together workgroups across dimensions,
leading to a major under-count in 2D and 3D kernels. An (x,y,z) kernel
uses x * y * z blocks, not x + y + z.
In addition, the previous logic was incorrect when launching a
single-threaded kernel. It skipped every dimension whose global size
was 1, so it calculated 0 workgroups, which led to GWS being
initialized to -1.
Change-Id: I1bb20a0d5b6e0cc10ac55901c28d8f93aac61c09
[ROCm/clr commit: 54d1d69c0a]
Этот коммит содержится в:
@@ -2331,10 +2331,10 @@ void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel&
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
if (vcmd.cooperativeGroups()) {
|
||||
uint32_t workgroups = 0;
|
||||
uint32_t workgroups = 1;
|
||||
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
|
||||
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
|
||||
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
if (vcmd.sizes().local()[i] != 0) {
|
||||
workgroups *= (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2595,10 +2595,10 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
if (vcmd.cooperativeGroups()) {
|
||||
// Initialize GWS if it's cooperative groups launch
|
||||
uint32_t workgroups = 0;
|
||||
uint32_t workgroups = 1;
|
||||
for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
|
||||
if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
|
||||
workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
if (vcmd.sizes().local()[i] != 0) {
|
||||
workgroups *= (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user