From 5ee211e801f1e20eb1e972213fb2d98c8bb47b03 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Wed, 29 Aug 2018 12:35:08 -0400
Subject: [PATCH] P4 to Git Change 1599472 by gandryey@gera-w8 on 2018/08/29
 12:25:34

	SWDEV-79445 - OCL generic changes and code clean-up
	- Move FindLocalWorkSize() logic to the abstraction layer
	- Replace the ROCr path with the common FindLocalWorkSize() functionality

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#227 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#314 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#330 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#67 edit
---
 rocclr/runtime/device/device.cpp           |  31 ++++
 rocclr/runtime/device/device.hpp           |   1 +
 rocclr/runtime/device/devkernel.cpp        | 179 ++++++++++++++++++---
 rocclr/runtime/device/devkernel.hpp        |  19 ++-
 rocclr/runtime/device/gpu/gpukernel.cpp    | 103 +-----------
 rocclr/runtime/device/gpu/gpukernel.hpp    |   7 -
 rocclr/runtime/device/pal/palkernel.cpp    | 120 +-------------
 rocclr/runtime/device/pal/palkernel.hpp    |   7 -
 rocclr/runtime/device/rocm/rockernel.cpp   |   2 +-
 rocclr/runtime/device/rocm/rocsettings.cpp |  11 +-
 rocclr/runtime/device/rocm/rocsettings.hpp |   3 +-
 rocclr/runtime/device/rocm/rocvirtual.cpp  | 141 +---------------
 12 files changed, 216 insertions(+), 408 deletions(-)

diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp
index acb52b0142..edacb8557c 100644
--- a/rocclr/runtime/device/device.cpp
+++ b/rocclr/runtime/device/device.cpp
@@ -585,6 +585,37 @@ Settings::Settings() {
                          //!< concurrent Virtual GPUs for default
 }
 
+void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
+  const amd::Coord3D region, uint mapFlags, bool entire,
+  amd::Image* baseMip) {
+  // Map/Unmap must be serialized.
+  amd::ScopedLock lock(owner()->lockMemoryOps());
+
+  WriteMapInfo info = {};
+  WriteMapInfo* pInfo = &info;
+  auto it = writeMapInfo_.find(mapAddress);
+  if (it != writeMapInfo_.end()) {
+    LogWarning("Double map of the same or overlapped region!");
+    pInfo = &it->second;
+  }
+
+  if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
+    pInfo->origin_ = origin;
+    pInfo->region_ = region;
+    pInfo->entire_ = entire;
+    pInfo->unmapWrite_ = true;
+  }
+  if (mapFlags & CL_MAP_READ) {
+    pInfo->unmapRead_ = true;
+  }
+  pInfo->baseMip_ = baseMip;
+
+  // Insert into the map if it's the first region
+  if (++pInfo->count_ == 1) {
+    writeMapInfo_.insert({ mapAddress, info });
+  }
+}
+
 Program::Program(amd::Device& device)
     : device_(device),
       type_(TYPE_NONE),
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index 5a30609600..ec89e63f5b 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -517,6 +517,7 @@ class Settings : public amd::HeapObject {
 
   uint commandQueues_;  //!< Field value for maximum number
                         //!< concurrent Virtual GPUs for each backend
+
   //! Default constructor
   Settings();
 
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index e1cfc0c42a..9c4b43f960 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -3,6 +3,7 @@
 //
 #include "platform/runtime.hpp"
 #include "platform/program.hpp"
+#include "platform/ndrange.hpp"
 #include "devkernel.hpp"
 #include "utils/macros.hpp"
 #include "utils/options.hpp"
@@ -22,6 +23,7 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
 
 namespace device {
 
+ // ================================================================================================
 bool Kernel::createSignature(
   const parameters_t& params, uint32_t numParameters,
   uint32_t version) {
@@ -63,45 +65,139 @@ bool Kernel::createSignature(
   return false;
 }
 
+// ================================================================================================
 Kernel::~Kernel() { delete signature_; }
 
+// ================================================================================================
 std::string Kernel::openclMangledName(const std::string& name) {
   const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
   assert(bifSym && "symbol not found");
   return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
 }
 
-void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
-  const amd::Coord3D region, uint mapFlags, bool entire,
-  amd::Image* baseMip) {
-  // Map/Unmap must be serialized.
-  amd::ScopedLock lock(owner()->lockMemoryOps());
+// ================================================================================================
+void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
+  amd::NDRange& lclWorkSize) const {
+  // Initialize the default workgroup info
+  // Check if the kernel has the compiled sizes
+  if (workGroupInfo()->compileSize_[0] == 0) {
+    // Find the default local workgroup size, if it wasn't specified
+    if (lclWorkSize[0] == 0) {
+      bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
+      bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
+        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
+      bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
+        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
+        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
 
-  WriteMapInfo info = {};
-  WriteMapInfo* pInfo = &info;
-  auto it = writeMapInfo_.find(mapAddress);
-  if (it != writeMapInfo_.end()) {
-    LogWarning("Double map of the same or overlapped region!");
-    pInfo = &it->second;
-  }
+      bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
+        ((workDim == 3) && b3DOverrideSet);
+      if (!overrideSet) {
+        // Find threads per group
+        size_t thrPerGrp = workGroupInfo()->size_;
 
-  if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
-    pInfo->origin_ = origin;
-    pInfo->region_ = region;
-    pInfo->entire_ = entire;
-    pInfo->unmapWrite_ = true;
-  }
-  if (mapFlags & CL_MAP_READ) {
-    pInfo->unmapRead_ = true;
-  }
-  pInfo->baseMip_ = baseMip;
+        // Check if kernel uses images
+        if (flags_.imageEna_ &&
+          // and the thread group size is a multiple of the wavefront size
+          ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+          // and it's 2 or 3-dimensional workload
+          (workDim > 1) && ((dev().settings().partialDispatch_) ||
+          (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
+          // Use 8x8 workgroup size if kernel has image writes
+          if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
+            lclWorkSize[0] = 8;
+            lclWorkSize[1] = 8;
+          }
+          else {
+            lclWorkSize[0] = 16;
+            lclWorkSize[1] = 16;
+          }
+          if (workDim == 3) {
+            lclWorkSize[2] = 1;
+          }
+        }
+        else {
+          size_t tmp = thrPerGrp;
+          // Split the local workgroup in the most efficient way
+          for (uint d = 0; d < workDim; ++d) {
+            size_t div = tmp;
+            for (; (gblWorkSize[d] % div) != 0; div--)
+              ;
+            lclWorkSize[d] = div;
+            tmp /= div;
+          }
 
-  // Insert into the map if it's the first region
-  if (++pInfo->count_ == 1) {
-    writeMapInfo_.insert({ mapAddress, info });
+          // Assuming DWORD access
+          const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2;
+
+          // Check if partial dispatch is enabled and
+          if (dev().settings().partialDispatch_ &&
+            // we couldn't find optimal workload
+            (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+              // or size is too small for the cache line
+            (lclWorkSize[0] < cacheLineMatch))) {
+            size_t maxSize = 0;
+            size_t maxDim = 0;
+            for (uint d = 0; d < workDim; ++d) {
+              if (maxSize < gblWorkSize[d]) {
+                maxSize = gblWorkSize[d];
+                maxDim = d;
+              }
+            }
+            // Use X dimension as high priority. Runtime will assume that
+            // X dimension is more important for the address calculation
+            if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+              lclWorkSize[0] = cacheLineMatch;
+              thrPerGrp /= cacheLineMatch;
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 1; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+            else {
+              // Check if a local workgroup has the most optimal size
+              if (thrPerGrp > maxSize) {
+                thrPerGrp = maxSize;
+              }
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 0; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+          }
+        }
+      }
+      else {
+        // Use overrides when app doesn't provide workgroup dimensions
+        if (workDim == 1) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
+        }
+        else if (workDim == 2) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
+          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+        }
+        else if (workDim == 3) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
+          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+        }
+        else {
+          assert(0 && "Invalid workDim!");
+        }
+      }
+    }
+  }
+  else {
+    for (uint d = 0; d < workDim; ++d) {
+      lclWorkSize[d] = workGroupInfo()->compileSize_[d];
+    }
   }
 }
-
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 using llvm::AMDGPU::HSAMD::AccessQualifier;
 using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
@@ -145,6 +241,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
   }
 }
 #endif
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
   if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
@@ -189,6 +286,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
 }
 #endif
 
+// ================================================================================================
 static const clk_value_type_t ClkValueMapType[6][6] = {
   { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
   { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
@@ -198,6 +296,7 @@ static const clk_value_type_t ClkValueMapType[6][6] = {
   { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
 };
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
   uint sizeType;
@@ -274,6 +373,7 @@ static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t si
   return T_VOID;
 }
 #endif
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
   uint sizeType;
@@ -351,9 +451,12 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
   switch (argInfo->type) {
@@ -392,6 +495,7 @@ static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
   if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -405,6 +509,8 @@ static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
   return 1;
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
   if (argInfo->type == ARG_TYPE_POINTER) {
@@ -414,6 +520,7 @@ static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
   if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
@@ -429,6 +536,8 @@ static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
   return false;
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
   if (argInfo->type == ARG_TYPE_POINTER) {
@@ -441,9 +550,12 @@ static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 inline static int GetArgSizeOCL(const aclArgData* argInfo) {
   switch (argInfo->type) {
@@ -481,6 +593,7 @@ inline static int GetArgSizeOCL(const aclArgData* argInfo) {
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
   if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -504,6 +617,8 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgM
   return CL_KERNEL_ARG_ADDRESS_PRIVATE;
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
   if (argInfo->type == ARG_TYPE_POINTER) {
@@ -534,6 +649,7 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
   if (lcArg.mValueKind == ValueKind::Image) {
@@ -550,6 +666,8 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArg
   return CL_KERNEL_ARG_ACCESS_NONE;
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
   if (argInfo->type == ARG_TYPE_IMAGE) {
@@ -566,6 +684,7 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgDat
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
   cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -588,6 +707,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD&
   return rv;
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
   cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -618,6 +739,7 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
   // Iterate through the arguments and insert into parameterList
@@ -689,6 +811,8 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
   createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
 }
 #endif
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
   // Iterate through the arguments and insert into parameterList
@@ -769,6 +893,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
 }
 #endif
 
+// ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER)
 void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
   for (auto str : printfInfoStrings) {
@@ -860,6 +985,8 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
   }
 }
 #endif  // defined(WITH_LIGHTNING_COMPILER)
+
+// ================================================================================================
 #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
 void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) {
   PrintfInfo info;
diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp
index 9140ae28d7..94200a92b0 100644
--- a/rocclr/runtime/device/devkernel.hpp
+++ b/rocclr/runtime/device/devkernel.hpp
@@ -35,6 +35,7 @@ namespace amd {
 
 class Device;
 class KernelSignature;
+class NDRange;
 
 struct ProfilingCallback : public amd::HeapObject {
   virtual void callback(ulong duration, uint32_t waves) = 0;
@@ -123,7 +124,10 @@ class Kernel : public amd::HeapObject {
   };
 
   //! Default constructor
-  Kernel(const std::string& name) : name_(name), signature_(NULL) {
+  Kernel(const amd::Device& dev, const std::string& name)
+    : dev_(dev)
+    , name_(name)
+    , signature_(nullptr) {
     // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
     // Due to std::string not being able to be memset to 0
     workGroupInfo_.size_ = 0;
@@ -193,13 +197,16 @@ class Kernel : public amd::HeapObject {
 
   //! Get profiling callback object
   virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
-    return NULL;
+    return nullptr;
   }
 
   virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
       return 0;
   }
 
+  //! Returns GPU device object, associated with this kernel
+  const amd::Device& dev() const { return dev_; }
+
   void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
 
   void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
@@ -237,6 +244,13 @@ class Kernel : public amd::HeapObject {
   //! Return printf info array
   const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
 
+  //! Finds local workgroup size
+  void FindLocalWorkSize(
+    size_t workDim,                   //!< Work dimension
+    const amd::NDRange& gblWorkSize,  //!< Global work size
+    amd::NDRange& lclWorkSize         //!< Calculated local work size
+  ) const;
+
  protected:
   //! Initializes the abstraction layer kernel parameters
 #if defined(WITH_LIGHTNING_COMPILER)
@@ -252,6 +266,7 @@ class Kernel : public amd::HeapObject {
   //! Initializes HSAIL Printf metadata and info
   void InitPrintf(const aclPrintfFmt* aclPrintf);
 #endif
+  const amd::Device& dev_;          //!< GPU device object
   std::string name_;                //!< kernel name
   WorkGroupInfo workGroupInfo_;     //!< device kernel info structure
   amd::KernelSignature* signature_; //!< kernel signature
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 26879b9639..fe88dbd499 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -515,7 +515,7 @@ clk_value_type_t KernelArg::type() const {
 
 NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
                        const NullProgram& nullprog)
-    : device::Kernel(name),
+    : device::Kernel(gpuNullDev, name),
       buildError_(CL_BUILD_PROGRAM_FAILURE),
       gpuDev_(gpuNullDev),
       prog_(nullprog),
@@ -3047,9 +3047,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
 
 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
                          uint extraArgsNum)
-    : device::Kernel(name),
+    : device::Kernel(prog->dev(), name),
       compileOptions_(compileOptions),
-      dev_(prog->dev()),
       prog_(*prog),
       index_(0),
       code_(NULL),
@@ -3241,102 +3240,6 @@ const HSAILProgram& HSAILKernel::prog() const {
   return reinterpret_cast<const HSAILProgram&>(prog_);
 }
 
-void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
-                                    amd::NDRange& lclWorkSize) const {
-  // Initialize the default workgoup info
-  // Check if the kernel has the compiled sizes
-  if (workGroupInfo()->compileSize_[0] == 0) {
-    // Find the default local workgroup size, if it wasn't specified
-    if (lclWorkSize[0] == 0) {
-      bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
-      bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
-      bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
-      bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
-          ((workDim == 3) && b3DOverrideSet);
-      if (!overrideSet) {
-        // Find threads per group
-        size_t thrPerGrp = workGroupInfo()->size_;
-
-        // Check if kernel uses images
-        if (flags_.imageEna_ &&
-            // and thread group is a multiple value of wavefronts
-            ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
-            // and it's 2 or 3-dimensional workload
-            (workDim > 1) && ((dev().settings().partialDispatch_) ||
-                              (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
-          // Use 8x8 workgroup size if kernel has image writes
-          if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
-            lclWorkSize[0] = 8;
-            lclWorkSize[1] = 8;
-          } else {
-            lclWorkSize[0] = 16;
-            lclWorkSize[1] = 16;
-          }
-          if (workDim == 3) {
-            lclWorkSize[2] = 1;
-          }
-        } else {
-          size_t tmp = thrPerGrp;
-          // Split the local workgroup into the most efficient way
-          for (uint d = 0; d < workDim; ++d) {
-            size_t div = tmp;
-            for (; (gblWorkSize[d] % div) != 0; div--)
-              ;
-            lclWorkSize[d] = div;
-            tmp /= div;
-          }
-
-          // Check if partial dispatch is enabled and
-          if (dev().settings().partialDispatch_ &&
-              // we couldn't find optimal workload
-              (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
-            size_t maxSize = 0;
-            size_t maxDim = 0;
-            for (uint d = 0; d < workDim; ++d) {
-              if (maxSize < gblWorkSize[d]) {
-                maxSize = gblWorkSize[d];
-                maxDim = d;
-              }
-            }
-            // Check if a local workgroup has the most optimal size
-            if (thrPerGrp > maxSize) {
-              thrPerGrp = maxSize;
-            }
-            lclWorkSize[maxDim] = thrPerGrp;
-            for (uint d = 0; d < workDim; ++d) {
-              if (d != maxDim) {
-                lclWorkSize[d] = 1;
-              }
-            }
-          }
-        }
-      } else {
-        // Use overrides when app doesn't provide workgroup dimensions
-        if (workDim == 1) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
-        } else if (workDim == 2) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-        } else if (workDim == 3) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-        } else {
-          assert(0 && "Invalid workDim!");
-        }
-      }
-    }
-  } else {
-    for (uint d = 0; d < workDim; ++d) {
-      lclWorkSize[d] = workGroupInfo()->compileSize_[d];
-    }
-  }
-}
-
 inline static void WriteAqlArg(
     unsigned char** dst,  //!< The write pointer to the buffer
     const void* src,      //!< The source pointer
@@ -3576,7 +3479,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
   const amd::NDRange& global = sizes.global();
 
   // Check if runtime has to find local workgroup size
-  findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+  FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
 
   hsaDisp->header = kDispatchPacketHeader;
   hsaDisp->setup = sizes.dimensions();
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index d1543ee496..544cc9e9e7 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel {
   //! Returns spill reg size per workitem
   int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; }
 
-  //! Finds local workgroup size
-  void findLocalWorkSize(size_t workDim,                   //!< Work dimension
-                         const amd::NDRange& gblWorkSize,  //!< Global work size
-                         amd::NDRange& lclWorkSize         //!< Local work size
-                         ) const;
-
   //! Returns AQL packet in CPU memory
   //! if the kerenl arguments were successfully loaded, otherwise NULL
   hsa_kernel_dispatch_packet_t* loadArguments(
@@ -870,7 +864,6 @@ class HSAILKernel : public device::Kernel {
   std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
   std::string compileOptions_;        //!< compile used for finalizing this kernel
   amd_kernel_code_t* cpuAqlCode_;     //!< AQL kernel code on CPU
-  const NullDevice& dev_;             //!< GPU device object
   const HSAILProgram& prog_;          //!< Reference to the parent program
   uint index_;                        //!< Kernel index in the program
 
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index a646e0c3e5..7c330fbbd4 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -69,9 +69,8 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
 }
 
 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
-    : device::Kernel(name),
+    : device::Kernel(prog->dev(), name),
       compileOptions_(compileOptions),
-      dev_(prog->dev()),
       prog_(*prog),
       index_(0),
       code_(0),
@@ -253,121 +252,6 @@ const HSAILProgram& HSAILKernel::prog() const {
   return reinterpret_cast<const HSAILProgram&>(prog_);
 }
 
-void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
-                                    amd::NDRange& lclWorkSize) const {
-  // Initialize the default workgoup info
-  // Check if the kernel has the compiled sizes
-  if (workGroupInfo()->compileSize_[0] == 0) {
-    // Find the default local workgroup size, if it wasn't specified
-    if (lclWorkSize[0] == 0) {
-      bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
-      bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
-      bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
-          !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
-      bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
-          ((workDim == 3) && b3DOverrideSet);
-      if (!overrideSet) {
-        // Find threads per group
-        size_t thrPerGrp = workGroupInfo()->size_;
-
-        // Check if kernel uses images
-        if (flags_.imageEna_ &&
-            // and thread group is a multiple value of wavefronts
-            ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
-            // and it's 2 or 3-dimensional workload
-            (workDim > 1) && ((dev().settings().partialDispatch_) ||
-                              (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
-          // Use 8x8 workgroup size if kernel has image writes
-          if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
-            lclWorkSize[0] = 8;
-            lclWorkSize[1] = 8;
-          } else {
-            lclWorkSize[0] = 16;
-            lclWorkSize[1] = 16;
-          }
-          if (workDim == 3) {
-            lclWorkSize[2] = 1;
-          }
-        } else {
-          size_t tmp = thrPerGrp;
-          // Split the local workgroup into the most efficient way
-          for (uint d = 0; d < workDim; ++d) {
-            size_t div = tmp;
-            for (; (gblWorkSize[d] % div) != 0; div--)
-              ;
-            lclWorkSize[d] = div;
-            tmp /= div;
-          }
-
-          // Assuming DWORD access
-          const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
-
-          // Check if partial dispatch is enabled and
-          if (dev().settings().partialDispatch_ &&
-            // we couldn't find optimal workload
-            (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
-                // or size is too small for the cache line
-            (lclWorkSize[0] < cacheLineMatch))) {
-            size_t maxSize = 0;
-            size_t maxDim = 0;
-            for (uint d = 0; d < workDim; ++d) {
-              if (maxSize < gblWorkSize[d]) {
-                maxSize = gblWorkSize[d];
-                maxDim = d;
-              }
-            }
-            // Use X dimension as high priority. Runtime will assume that
-            // X dimension is more important for the address calculation
-            if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
-              lclWorkSize[0] = cacheLineMatch;
-              thrPerGrp /= cacheLineMatch;
-              lclWorkSize[maxDim] = thrPerGrp;
-              for (uint d = 1; d < workDim; ++d) {
-                if (d != maxDim) {
-                  lclWorkSize[d] = 1;
-                }
-              }
-            }
-            else {
-              // Check if a local workgroup has the most optimal size
-              if (thrPerGrp > maxSize) {
-                thrPerGrp = maxSize;
-              }
-              lclWorkSize[maxDim] = thrPerGrp;
-              for (uint d = 0; d < workDim; ++d) {
-                if (d != maxDim) {
-                  lclWorkSize[d] = 1;
-                }
-              }
-            }
-          }
-        }
-      } else {
-        // Use overrides when app doesn't provide workgroup dimensions
-        if (workDim == 1) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
-        } else if (workDim == 2) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-        } else if (workDim == 3) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-        } else {
-          assert(0 && "Invalid workDim!");
-        }
-      }
-    }
-  } else {
-    for (uint d = 0; d < workDim; ++d) {
-      lclWorkSize[d] = workGroupInfo()->compileSize_[d];
-    }
-  }
-}
-
 hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
     VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
     const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
@@ -450,7 +334,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
   const amd::NDRange& global = sizes.global();
 
   // Check if runtime has to find local workgroup size
-  findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+  FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
 
   constexpr uint16_t kDispatchPacketHeader =
     (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp
index 525bc517e3..43fc6ff185 100644
--- a/rocclr/runtime/device/pal/palkernel.hpp
+++ b/rocclr/runtime/device/pal/palkernel.hpp
@@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel {
   //! Returns spill reg size per workitem
   int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); }
 
-  //! Finds local workgroup size
-  void findLocalWorkSize(size_t workDim,                   //!< Work dimension
-                         const amd::NDRange& gblWorkSize,  //!< Global work size
-                         amd::NDRange& lclWorkSize         //!< Local work size
-                         ) const;
-
   //! Returns AQL packet in CPU memory
   //! if the kernel arguments were successfully loaded, otherwise NULL
   hsa_kernel_dispatch_packet_t* loadArguments(
@@ -127,7 +121,6 @@ class HSAILKernel : public device::Kernel {
 
   std::string compileOptions_;        //!< compile used for finalizing this kernel
   amd_kernel_code_t* cpuAqlCode_;     //!< AQL kernel code on CPU
-  const NullDevice& dev_;             //!< GPU device object
   const HSAILProgram& prog_;          //!< Reference to the parent program
   uint index_;                        //!< Kernel index in the program
 
diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp
index 227758da2c..a63c9f9767 100644
--- a/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/rocclr/runtime/device/rocm/rockernel.cpp
@@ -15,7 +15,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
                const uint32_t workgroupGroupSegmentByteSize,
                const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
                const uint32_t kernargSegmentAlignment)
-    : device::Kernel(name),
+    : device::Kernel(prog->dev(), name),
       program_(prog),
       kernelCodeHandle_(kernelCodeHandle),
       workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp
index 039016781d..31e235eeb1 100644
--- a/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -43,16 +43,7 @@ Settings::Settings() {
   nonCoherentMode = getenv("OPENCL_USE_NC_MEMORY_POLICY");
   enableNCMode_ = (nonCoherentMode) ? true : false;
 
-  // Determine if user wishes to disable support for
-  // partial dispatch. By default support for partial
-  // dispatch is enabled. Users can turn it off for
-  // devices that do not support this feature.
-  //
-  // @note Update appropriate field of device::Settings
-  char* partialDispatch = nullptr;
-  partialDispatch = getenv("OPENCL_DISABLE_PARTIAL_DISPATCH");
-  enablePartialDispatch_ = (partialDispatch) ? false : true;
-  partialDispatch_ = (partialDispatch) ? false : true;
+  partialDispatch_ = GPU_PARTIAL_DISPATCH;
   commandQueues_ = 100;  //!< Field value set to maximum number
                          //!< concurrent Virtual GPUs for ROCm backend
 
diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp
index d7d6dd11ef..4462907694 100644
--- a/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -24,13 +24,12 @@ class Settings : public device::Settings {
       uint enableLocalMemory_ : 1;      //!< Enable GPUVM memory
       uint enableCoarseGrainSVM_ : 1;   //!< Enable device memory for coarse grain SVM allocations
       uint enableNCMode_ : 1;           //!< Enable Non Coherent mode for system memory
-      uint enablePartialDispatch_ : 1;  //!< Enable support for Partial Dispatch
       uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
       uint stagedXferRead_ : 1;         //!< Uses a staged buffer read
       uint stagedXferWrite_ : 1;        //!< Uses a staged buffer write
       uint singleFpDenorm_ : 1;         //!< Support Single FP Denorm
       uint apuSystem_ : 1;              //!< APU system
-      uint reserved_ : 21;
+      uint reserved_ : 22;
     };
     uint value_;
   };
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index f9dae19ac1..4af1354d6e 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1764,132 +1764,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
   profilingEnd(vcmd);
 }
 
-// Over rides the workgroup size fields in the packet with runtime/compiler set sizes
-void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
-                                 amd::NDRangeContainer sizes, device::Kernel* devKernel,
-                                 const roc::Device& dev) {
-
-  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
-  const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
-
-  // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
-  // else this error check is incorrect
-  if (compile_size[0] || compile_size[1] || compile_size[2]) {
-    dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? compile_size[0] : 1;
-    dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? compile_size[1] : 1;
-    dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? compile_size[2] : 1;
-  } else {
-    size_t thrPerGrp;
-    bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
-    bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
-        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
-    bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
-        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
-        !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
-    bool overrideSet = ((sizes.dimensions() == 1) && b1DOverrideSet) ||
-                       ((sizes.dimensions() == 2) && b2DOverrideSet) ||
-                       ((sizes.dimensions() == 3) && b3DOverrideSet);
-    if (!overrideSet) {
-      // Find threads per group
-      thrPerGrp = devKernel->workGroupInfo()->size_;
-
-      if (gpuKernel.imageEnable() &&
-          // and thread group is a multiple value of wavefronts
-          ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
-          // and it's 2 or 3-dimensional workload
-          (sizes.dimensions() > 1) &&
-          ((dev.settings().partialDispatch_) ||
-           (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
-          // Use 8x8 workgroup size if kernel has image writes)
-        if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
-          sizes.local()[0] = 8;
-          sizes.local()[1] = 8;
-        }
-        else {
-          sizes.local()[0] = 16;
-          sizes.local()[1] = 16;
-        }
-        if (sizes.dimensions() == 3)  {
-          sizes.local()[2] = 1;
-        }
-      }
-      else {
-        size_t tmp = thrPerGrp;
-        // Split the local workgroup into the most efficient way
-        for (uint d = 0; d < sizes.dimensions(); ++d) {
-            size_t div = tmp;
-            for (; (sizes.global()[d] % div) != 0; div--)
-              ;
-            sizes.local()[d] = div;
-            tmp /= div;
-        }
-
-        // Assuming DWORD access
-        const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
-
-        // Check if partial dispatch is enabled and
-        if (dev.settings().partialDispatch_ &&
-            // we couldn't find optimal workload
-            ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
-                  // or size is too small for the cache line
-             (sizes.local()[0] < cacheLineMatch))) {
-          size_t maxSize = 0;
-          size_t maxDim = 0;
-          for (uint d = 0; d < sizes.dimensions(); ++d) {
-            if (maxSize < sizes.global()[d]) {
-              maxSize = sizes.global()[d];
-              maxDim = d;
-            }
-          }
-
-          if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
-            sizes.local()[0] = cacheLineMatch;
-            thrPerGrp /= cacheLineMatch;
-            sizes.local()[maxDim] = thrPerGrp;
-            for (uint d = 1; d < sizes.dimensions(); ++d) {
-              if (d != maxDim) {
-                sizes.local()[d] = 1;
-              }
-            }
-          }
-          else {
-            // Check if a local workgroup has the most optimal size
-            if (thrPerGrp > maxSize) {
-              thrPerGrp = maxSize;
-            }
-            sizes.local()[maxDim] = thrPerGrp;
-            for (uint d = 0; d < sizes.dimensions(); ++d) {
-              if (d != maxDim) {
-                sizes.local()[d] = 1;
-              }
-            }
-          }
-        }
-      }
-      dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1;
-      dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1;
-      dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1;
-    } else {
-      // Runtime must set the group size
-      dispatchPacket.workgroup_size_x = 1;
-      dispatchPacket.workgroup_size_y = 1;
-      dispatchPacket.workgroup_size_z = 1;
-
-      if (sizes.dimensions() == 1) {
-        dispatchPacket.workgroup_size_x = dev.settings().preferredWorkGroupSize_;
-      } else if (sizes.dimensions() == 2) {
-        dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_;
-        dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_;
-      } else if (sizes.dimensions() == 3) {
-        dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_;
-        dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_;
-        dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_;
-      }
-    }
-  }
-}
-
 bool VirtualGPU::createSchedulerParam()
 {
   if (nullptr != schedulerParam_) {
@@ -2235,15 +2109,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
     dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1;
 
-    if (sizes.local().product() != 0) {
-      dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1;
-      dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1;
-      dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1;
-    } else {
-      amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
-                                     &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
-      setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, devKernel, dev());
-    }
+    amd::NDRange local(sizes.local());
+    devKernel->FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+    dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? local[0] : 1;
+    dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? local[1] : 1;
+    dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
+
     dispatchPacket.kernarg_address = argBuffer;
     dispatchPacket.group_segment_size = ldsUsage;
     dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;