SWDEV-270773 - Fix workgroup calculation logic for GWS initialization

The existing workgroup calculation logic for GWS initialization is incorrect. It adds the per-dimension workgroup counts together, leading to a major under-count for 2D and 3D kernels: a kernel launched with (x, y, z) workgroups per dimension uses x * y * z workgroups in total, not x + y + z. In addition, the previous logic was incorrect when launching a single-threaded kernel. Because dimensions whose global size is 1 were skipped, it calculated 0 workgroups, which caused GWS to be initialized to -1. Change-Id: I1bb20a0d5b6e0cc10ac55901c28d8f93aac61c09 [ROCm/clr commit: 54d1d69c0a]
2021-01-31 01:16:48 -06:00
@@ -2331,10 +2331,10 @@ void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel&
 // ================================================================================================
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (vcmd.cooperativeGroups()) {
-    uint32_t workgroups = 0;
+    uint32_t workgroups = 1;
    for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
-      if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
-        workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
+      if (vcmd.sizes().local()[i] != 0) {
+        workgroups *= (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
      }
    }

@@ -2595,10 +2595,10 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

    if (vcmd.cooperativeGroups()) {
      // Initialize GWS if it's cooperative groups launch
-      uint32_t workgroups = 0;
+      uint32_t workgroups = 1;
      for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
-        if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
-          workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
+        if (vcmd.sizes().local()[i] != 0) {
+          workgroups *= (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
        }
      }