SWDEV-493805 - Cleaning up launch parameters arguments. (#241)

[ROCm/clr commit: fa55557f46]
2025-05-06 15:06:13 -04:00
@@ -41,11 +41,9 @@ void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& s

 bool IsHtoHMemcpy(void* dst, const void* src);

-hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
-                                     uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
-                                     uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
-                                     uint32_t sharedMemBytes, void** kernelParams, void** extra,
-                                     int deviceId, uint32_t params);
+hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
+                                     void** kernelParams, void** extra, int deviceId,
+                                     uint32_t params);

 hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize, size_t sizeBytes);

@@ -53,9 +51,7 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte
                                 size_t sizeBytes);

 hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
-                                   uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
-                                   uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
-                                   uint32_t blockDimZ, uint32_t sharedMemBytes, hip::Stream* stream,
+                                   amd::LaunchParams& launch_params, hip::Stream* stream,
                                   void** kernelParams, void** extra, hipEvent_t startEvent,
                                   hipEvent_t stopEvent, uint32_t flags, uint32_t params,
                                   uint32_t gridId, uint32_t numGrids, uint64_t prevGridSum,
@@ -1176,13 +1176,19 @@ class GraphKernelNode : public GraphNode {
        }
      }
    }
+
+    amd::HIPLaunchParams launch_params(kernelParams_.gridDim.x, kernelParams_.gridDim.y,
+                                       kernelParams_.gridDim.z, kernelParams_.blockDim.x,
+                                       kernelParams_.blockDim.y, kernelParams_.blockDim.z,
+                                       kernelParams_.sharedMemBytes);
+
+    if (!launch_params.IsValidConfig()) {
+      return hipErrorInvalidConfiguration;
+    }
+
    status = ihipLaunchKernelCommand(
-        command, func, kernelParams_.gridDim.x * kernelParams_.blockDim.x,
-        kernelParams_.gridDim.y * kernelParams_.blockDim.y,
-        kernelParams_.gridDim.z * kernelParams_.blockDim.z, kernelParams_.blockDim.x,
-        kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes, stream,
-        kernelParams_.kernelParams, kernelParams_.extra, kernelEvents_.startEvent_,
-        kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
+        command, func, launch_params, stream, kernelParams_.kernelParams, kernelParams_.extra,
+        kernelEvents_.startEvent_, kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
    if (signal_is_required_) {
      // Optimize the barriers by adding a signal into the dispatch packet directly
      command->SetProfiling();
@@ -1315,15 +1321,18 @@ class GraphKernelNode : public GraphNode {

  static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
                                         hipFunction_t func, int devId) {
-    size_t globalWorkSizeX = static_cast<size_t>(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
-    size_t globalWorkSizeY = static_cast<size_t>(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
-    size_t globalWorkSizeZ = static_cast<size_t>(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;

-    hipError_t status = ihipLaunchKernel_validate(
-        func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
-        static_cast<uint32_t>(globalWorkSizeZ), pNodeParams->blockDim.x, pNodeParams->blockDim.y,
-        pNodeParams->blockDim.z, pNodeParams->sharedMemBytes, pNodeParams->kernelParams,
-        pNodeParams->extra, devId, 0);
+    amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y,
+                                       pNodeParams->gridDim.z, pNodeParams->blockDim.x,
+                                       pNodeParams->blockDim.y, pNodeParams->blockDim.z,
+                                       pNodeParams->sharedMemBytes);
+
+    if (!launch_params.IsValidConfig()) {
+      HIP_RETURN(hipErrorInvalidConfiguration);
+    }
+
+    hipError_t status = ihipLaunchKernel_validate(func, launch_params, pNodeParams->kernelParams,
+                                                  pNodeParams->extra, devId, 0);
    if (status != hipSuccess) {
      return status;
    }
@@ -23,6 +23,7 @@
 #include <fstream>

 #include "hip_internal.hpp"
+#include "platform/ndrange.hpp"
 #include "platform/program.hpp"
 #include "hip_event.hpp"
 #include "hip_platform.hpp"
@@ -255,11 +256,9 @@ hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config
  HIP_RETURN(hipSuccess);
 }

-hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
-                                     uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
-                                     uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
-                                     uint32_t sharedMemBytes, void** kernelParams, void** extra,
-                                     int deviceId, uint32_t params = 0) {
+hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
+                                     void** kernelParams, void** extra, int deviceId,
+                                     uint32_t params = 0) {
  if (f == nullptr) {
    LogPrintfError("%s", "Function passed is null");
    return hipErrorInvalidImage;
@@ -270,21 +269,23 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
    return hipErrorInvalidValue;
  }

-  if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
+                                     || launch_params.global_[2] == 0) {
    return hipErrorInvalidValue;
  }

-  if (blockDimX == 0 || blockDimY == 0 || blockDimZ == 0) {
+  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
+                                    || launch_params.local_[2] == 0) {
    return hipErrorInvalidConfiguration;
  }

  const amd::Device* device = g_devices[deviceId]->devices()[0];
  const auto& info = device->info();
-  if (sharedMemBytes > info.localMemSizePerCU_) { //sharedMemPerBlock
+  if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { //sharedMemPerBlock
    return hipErrorInvalidValue;
  }
  // Make sure dispatch doesn't exceed max workgroup size limit
-  if (blockDimX * blockDimY * blockDimZ > info.maxWorkGroupSize_) {
+  if (launch_params.local_.product() > info.maxWorkGroupSize_) {
    return (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_LAUNCH_KERNEL) ?
                                  hipErrorInvalidConfiguration :
                                  hipErrorInvalidValue;
@@ -301,10 +302,9 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
  }
  // Make sure the launch params are not larger than if specified launch_bounds
  // If it exceeds, then return a failure
-  if (blockDimX * blockDimY * blockDimZ >
-      kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
+  if (launch_params.local_.product() > kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
    LogPrintfError("Launch params (%u, %u, %u) are larger than launch bounds (%lu) for kernel %s",
-                   blockDimX, blockDimY, blockDimZ,
+                   launch_params.local_[0], launch_params.local_[1], launch_params.local_[2],
                   kernel->getDeviceKernel(*device)->workGroupInfo()->size_,
                   function->name().c_str());
    return hipErrorLaunchFailure;
@@ -317,15 +317,14 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
    int num_blocks = 0;
    int max_blocks_per_grid = 0;
    int best_block_size = 0;
-    int block_size = blockDimX * blockDimY * blockDimZ;
+    int block_size = launch_params.local_.product();
    hipError_t err = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, sharedMemBytes,
-        true);
+        &num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size,
+        launch_params.sharedMemBytes_, true);
    if (err != hipSuccess) {
      return err;
    }
-    if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) >
-        unsigned(max_blocks_per_grid)) {
+    if (((launch_params.global_.product()) / block_size) > unsigned(max_blocks_per_grid)) {
      return hipErrorCooperativeLaunchTooLarge;
    }
  }
@@ -338,10 +337,8 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
 }

 hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
-                                   uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
-                                   uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
-                                   uint32_t blockDimZ, uint32_t sharedMemBytes,
-                                   hip::Stream* stream, void** kernelParams, void** extra,
+                                   amd::LaunchParams& launch_params, hip::Stream* stream,
+                                   void** kernelParams, void** extra,
                                   hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr,
                                   uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
                                   uint32_t numGrids = 0, uint64_t prevGridSum = 0,
@@ -350,9 +347,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
  amd::Kernel* kernel = function->kernel();

  size_t globalWorkOffset[3] = {0};
-  size_t globalWorkSize[3] = {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ};
-  size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
-  amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
+  amd::NDRangeContainer ndrange(3, globalWorkOffset, launch_params.global_.Data(),
+                                launch_params.local_.Data());
  amd::Command::EventWaitList waitList;

  bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
@@ -362,9 +358,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
    params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
  }

-  amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
-      *stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum,
-      allGridSum, firstDevice, profileNDRange);
+  amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList,
+    *kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum,
+    allGridSum, firstDevice, profileNDRange);
  if (!kernelCommand) {
    return hipErrorOutOfMemory;
  }
@@ -414,14 +410,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
  return hipSuccess;
 }

-hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
-                                  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
-                                  uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
-                                  uint32_t sharedMemBytes, hipStream_t hStream, void** kernelParams,
-                                  void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
-                                  uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
-                                  uint32_t numGrids = 0, uint64_t prevGridSum = 0,
-                                  uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
+hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params,
+                                  hipStream_t hStream, void** kernelParams, void** extra,
+                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
+                                  uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+                                  uint64_t prevGridSum = 0, uint64_t allGridSum = 0,
+                                  uint32_t firstDevice = 0) {
  int deviceId = hip::Stream::DeviceId(hStream);
  HIP_RETURN_ONFAIL(PlatformState::instance().initStatManagedVarDevicePtr(deviceId));

@@ -433,32 +427,37 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
  amd::Kernel* kernel = function->kernel();
  amd::ScopedLock lock (DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);

-  hipError_t status = ihipLaunchKernel_validate(
-      f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
-      sharedMemBytes, kernelParams, extra, deviceId, params);
+  hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId,
+                                                params);
  if (status != hipSuccess) {
    return status;
  }
+
  // Make sure the app doesn't launch a workgroup bigger than the global size
-  if (globalWorkSizeX < blockDimX) blockDimX = globalWorkSizeX;
-  if (globalWorkSizeY < blockDimY) blockDimY = globalWorkSizeY;
-  if (globalWorkSizeZ < blockDimZ) blockDimZ = globalWorkSizeZ;
+  if (launch_params.global_[0] < launch_params.local_[0]) {
+    launch_params.local_[0] = launch_params.global_[0];
+  }
+  if (launch_params.global_[1] < launch_params.local_[1]) {
+    launch_params.local_[1] = launch_params.global_[1];
+  }
+  if (launch_params.global_[2] < launch_params.local_[2]) {
+    launch_params.local_[2] = launch_params.global_[2];
+  }

  auto device = g_devices[deviceId]->devices()[0];
  // Check if it's a uniform kernel and validate dimensions
  if (kernel->getDeviceKernel(*device)->getUniformWorkGroupSize()) {
-    if (((globalWorkSizeX % blockDimX) != 0) ||
-        ((globalWorkSizeY % blockDimY) != 0) ||
-        ((globalWorkSizeZ % blockDimZ) != 0)) {
+    if (((launch_params.global_[0] % launch_params.local_[0]) != 0) ||
+        ((launch_params.global_[1] % launch_params.local_[1]) != 0) ||
+        ((launch_params.global_[2] % launch_params.local_[2]) != 0)) {
      return hipErrorInvalidValue;
    }
  }
  amd::Command* command = nullptr;
  hip::Stream* hip_stream = hip::getStream(hStream);
-  status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
-                                   blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream,
-                                   kernelParams, extra, startEvent, stopEvent, flags, params,
-                                   gridId, numGrids, prevGridSum, allGridSum, firstDevice);
+  status = ihipLaunchKernelCommand(command, f, launch_params, hip_stream, kernelParams, extra,
+                                   startEvent, stopEvent, flags, params, gridId, numGrids,
+                                   prevGridSum, allGridSum, firstDevice);
  if (status != hipSuccess) {
    return status;
  }
@@ -514,16 +513,15 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
  if (gridDimX > int32_max || gridDimY > uint16_max || gridDimZ > uint16_max) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
-  size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
-  size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
-  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max()) {
+
+  amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+                                     sharedMemBytes);
+  if (!launch_params.IsValidConfig()) {
    HIP_RETURN(hipErrorInvalidConfiguration);
  }
-  HIP_RETURN(ihipModuleLaunchKernel(
-      f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
-      static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes,
-      hStream, kernelParams, extra, nullptr, nullptr));
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr,
+                                    nullptr));
 }

 hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
@@ -544,9 +542,12 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                 globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
                 kernelParams, extra, startEvent, stopEvent, flags);

-  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
-                                    localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
-                                    hStream, kernelParams, extra, startEvent, stopEvent, flags));
+  amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+                                  localWorkSizeX, localWorkSizeY, localWorkSizeZ,
+                                  sharedMemBytes);
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
+                                    stopEvent, flags));
 }


@@ -559,9 +560,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
               blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra,
               startEvent, stopEvent);

-  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
-                                    blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams,
-                                    extra, startEvent, stopEvent));
+  amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
+                                  blockDimY, blockDimZ, sharedMemBytes);
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
+                                    stopEvent));
 }

 hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX,
@@ -579,19 +582,15 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
  STREAM_CAPTURE(hipModuleLaunchCooperativeKernel, stream, f, gridDimX, gridDimY, gridDimZ,
                 blockDimX, blockDimY, blockDimZ, sharedMemBytes, kernelParams);

-  size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
-  size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
-  size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
-  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+  amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+                                     sharedMemBytes);
+
+  if (!launch_params.IsValidConfig()) {
    HIP_RETURN(hipErrorInvalidConfiguration);
  }
-  HIP_RETURN(ihipModuleLaunchKernel(f, static_cast<uint32_t>(globalWorkSizeX),
-                  static_cast<uint32_t>(globalWorkSizeY),
-                  static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, stream, kernelParams, nullptr, nullptr,
-                  nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
+
+  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, stream, kernelParams, nullptr, nullptr,
+                                    nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
 }

 hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
@@ -682,25 +681,22 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
      }
    }

-    size_t globalWorkSizeX = static_cast<size_t>(launch.gridDimX) * launch.blockDimX;
-    size_t globalWorkSizeY = static_cast<size_t>(launch.gridDimY) * launch.blockDimY;
-    size_t globalWorkSizeZ = static_cast<size_t>(launch.gridDimZ) * launch.blockDimZ;
-    if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
-        globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
-        globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    amd::HIPLaunchParams launch_params(launch.gridDimX, launch.gridDimY, launch.gridDimZ,
+                                       launch.blockDimX, launch.blockDimY, launch.blockDimZ,
+                                       launch.sharedMemBytes);
+
+    if (!launch_params.IsValidConfig()) {
      return hipErrorInvalidConfiguration;
    }
+
    result = ihipModuleLaunchKernel(
-        launch.function, static_cast<uint32_t>(globalWorkSizeX),
-        static_cast<uint32_t>(globalWorkSizeY),
-        static_cast<uint32_t>(globalWorkSizeZ), launch.blockDimX, launch.blockDimY,
-        launch.blockDimZ, launch.sharedMemBytes, launch.hStream, launch.kernelParams,
+        launch.function, launch_params, launch.hStream, launch.kernelParams,
        nullptr, nullptr, nullptr, flags, extFlags,
        i, numDevices, prevGridSize, allGridSize, firstDevice);
    if (result != hipSuccess) {
      break;
    }
-    prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
+    prevGridSize += launch_params.global_.product();
  }

  // Sync the execution streams on all devices
@@ -810,13 +806,12 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
    return getStatFuncError;
  }
  const amd::Device* device = g_devices[deviceId]->devices()[0];
-  size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
-  size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
-  size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
-  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeZ > std::numeric_limits<uint32_t>::max() ||
-      (blockDim.x * blockDim.y * blockDim.z > device->info().maxWorkGroupSize_)) {
+
+  amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
+                                     blockDim.z, sharedMemBytes);
+
+  if (!launch_params.IsValidConfig() ||
+      launch_params.local_.product() > device->info().maxWorkGroupSize_) {
    return hipErrorInvalidConfiguration;
  }

@@ -824,15 +819,14 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
    return hipErrorCooperativeLaunchTooLarge;
  }

-  if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
+  //if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
+      || launch_params.global_[2] == 0) {
    return hipErrorInvalidConfiguration;
  }

-  return ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
-                                static_cast<uint32_t>(globalWorkSizeY),
-                                static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y,
-                                blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr,
-                                nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
+  return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr,
+                                nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
 }

 hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
@@ -1132,35 +1126,31 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f
    HIP_RETURN(hipErrorInvalidValue);
  }

-  size_t globalWorkSizeX = static_cast<size_t>(config->gridDimX) * config->blockDimX;
-  size_t globalWorkSizeY = static_cast<size_t>(config->gridDimY) * config->blockDimY;
-  size_t globalWorkSizeZ = static_cast<size_t>(config->gridDimZ) * config->blockDimZ;
-  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+  amd::HIPLaunchParams launch_params(config->gridDimX, config->gridDimY, config->gridDimZ,
+                                     config->blockDimX, config->blockDimY, config->blockDimZ,
+                                     config->sharedMemBytes);
+
+  if (!launch_params.IsValidConfig()) {
    HIP_RETURN(hipErrorInvalidConfiguration);
  }

  if (config->numAttrs == 0) {
-    HIP_RETURN(ihipModuleLaunchKernel(
-      f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
-      static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
-      config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
-      nullptr, nullptr, 0));
+    HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams, nullptr,
+                                      nullptr, nullptr, 0));
  }

  for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
    hipLaunchAttribute& attr = config->attrs[attr_idx];
    switch (attr.id) {
      case hipLaunchAttributeCooperative:
+      {
        if (attr.value.cooperative != 0) {
-          HIP_RETURN(ihipModuleLaunchKernel(
-              f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
-              static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
-              config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
-              nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
+          HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams,
+                                            nullptr, nullptr, nullptr, 0,
+                                            amd::NDRangeKernelCommand::CooperativeGroups));
        }
        break;
+      }
      default:
        LogPrintfError("Attribute %u not supported", attr.id);
        break;
@@ -50,11 +50,10 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
                                  hipDeviceptr_t* dptr, size_t* bytes);

 extern hipError_t ihipModuleLaunchKernel(
-    hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX,
-    uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream,
-    void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
-    uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
-    uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
+    hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams,
+    void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
+    uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0,
+    uint64_t allGridSum = 0, uint32_t firstDevice = 0);
 static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
  // Workaround for device name mismatch.
  // Device name may contain feature strings delimited by '+', e.g.
@@ -641,18 +640,14 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi
    return hipErrorInvalidConfiguration;
  }

-  size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
-  size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
-  size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
-  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
-      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+  amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
+                                     blockDim.z, sharedMemBytes);
+  if (!launch_params.IsValidConfig()) {
    return hipErrorInvalidConfiguration;
  }
-  return ihipModuleLaunchKernel(
-      func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
-      static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
-      stream, args, nullptr, startEvent, stopEvent, flags);
+
+  return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr,
+                                startEvent, stopEvent, flags);
 }

 // conversion routines between float and half precision
@@ -56,6 +56,10 @@ class NDRange : public EmbeddedObject {
  //! Construct a new index space of the given dimensions.
  explicit NDRange(size_t dimensions);

+  NDRange(size_t dataX, size_t dataY, size_t dataZ) : dimensions_(3) {
+    data_[0] = dataX; data_[1] = dataY; data_[2] = dataZ;
+  }
+
  //! Copy constructor.
  NDRange(const NDRange& space);

@@ -112,6 +116,40 @@ class NDRange : public EmbeddedObject {
  //! Print this index space on the given stream.
  void printOn(FILE* file) const;
 #endif  // DEBUG
+
+  const size_t* Data() const { return data_; }
+};
+
+//! Stucture to store launch parameters.
+struct LaunchParams {
+  NDRange global_;           //!< Total number of work-items in N-dims
+  NDRange local_;            //!< Number of work-items in N-dims in a workgroup.
+  uint32_t sharedMemBytes_;  //!< Shared Memory bytes
+  bool validConfig_;         //!< Flag will be set to false when config is not correct.
+
+  LaunchParams(uint32_t globalX, uint32_t globalY, uint32_t globalZ, uint32_t localX,
+               uint32_t localY, uint32_t localZ, uint32_t sharedMemBytes)
+               : global_(globalX, globalY, globalZ), local_(localX, localY, localZ),
+                 sharedMemBytes_ (sharedMemBytes), validConfig_(true) {}
+
+  bool IsValidConfig() const { return validConfig_; }
+};
+
+//! Structure to store launch parameters in HIP Style (global and local size needs computation).
+struct HIPLaunchParams : public LaunchParams {
+ public:
+  HIPLaunchParams(uint32_t gridX, uint32_t gridY, uint32_t gridZ, uint32_t blockX,
+                  uint32_t blockY, uint32_t blockZ, uint32_t sharedMemBytes)
+                  : LaunchParams(static_cast<uint32_t>(gridX) * blockX,
+                                 static_cast<uint32_t>(gridY) * blockY,
+                                 static_cast<uint32_t>(gridZ) * blockZ,
+                                 blockX, blockY, blockZ, sharedMemBytes) {
+    if (global_[0] > std::numeric_limits<uint32_t>::max() ||
+        global_[1] > std::numeric_limits<uint32_t>::max() ||
+        global_[2] > std::numeric_limits<uint32_t>::max()) {
+      validConfig_ = false;
+    }
+  }
 };

 //! A container for the local and global worksizes.