From 2fd27167df9c67ef971804b056099be5cc2815ea Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Fri, 19 Jan 2018 13:33:55 -0500
Subject: [PATCH] P4 to Git Change 1505728 by wchau@wchau_OCL_boltzmann on
 2018/01/19 13:13:53

	SWDEV-137270 - Add findLocalWorkSize to OpenCL runtime ROC device
	- Add support for kernel using image argument
	- fix the issue of not using the max workgroup size specified by the environment variables "maxWorkGroupSize*"

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#33 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#50 edit


[ROCm/clr commit: 87ad3f9692fc1570a3a9c4a4d789f7b12e9329c8]
---
 .../rocclr/runtime/device/rocm/rockernel.cpp  |  18 +++
 .../rocclr/runtime/device/rocm/rockernel.hpp  |   8 ++
 .../runtime/device/rocm/rocsettings.cpp       |  17 +++
 .../rocclr/runtime/device/rocm/rocvirtual.cpp | 111 +++++++++++-------
 4 files changed, 110 insertions(+), 44 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index 01aa905f6f..99573a8da2 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
     desc.typeQualifier_ = GetOclTypeQual(aclArg);
     desc.typeName_ = arg->typeName_.c_str();
 
+    // set image related flags
+    if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+      flags_.imageEnable_ = true;
+      if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+          desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+        flags_.imageWrite_ = true;
+      }
+    }
+
     // Make a check if it is local or global
     if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
       desc.size_ = 0;
@@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
     desc.typeQualifier_ = GetOclTypeQual(lcArg);
     desc.typeName_ = lcArg.mTypeName.c_str();
 
+    // set image related flags
+    if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+      flags_.imageEnable_ = true;
+      if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+          desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+        flags_.imageWrite_ = true;
+      }
+    }
+
     // Make a check if it is local or global
     if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
       desc.size_ = 0;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
index 782ca66c4e..926b304d00 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
@@ -127,10 +127,18 @@ class Kernel : public device::Kernel {
   //! set internal kernel flag
   void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
 
+  //! Return TRUE if the kernel uses images (i.e. the imageEnable_ flag is set)
+  bool imageEnable() const { return (flags_.imageEnable_) ? true : false; }
+
+  //! Return TRUE if the kernel writes images (i.e. the imageWrite_ flag is set)
+  bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
+
  protected:
   union Flags {
     struct {
       uint internalKernel_ : 1;  //!< Is a blit kernel?
+      uint imageEnable_    : 1;  //!< Kernel uses images
+      uint imageWrite_     : 1;  //!< Kernel writes images
     };
     uint value_;
     Flags() : value_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
index 72423a67dc..442871edde 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -142,6 +142,23 @@ void Settings::override() {
     preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
   }
 
+  if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
+    maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
+    maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+  }
+
+  if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
+    maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
+    maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
+    maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+  }
+
   if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
     commandQueues_ = GPU_MAX_COMMAND_QUEUES;
   }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 947eaa4c27..70dbf3bf8a 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
                                  amd::NDRangeContainer sizes, device::Kernel* devKernel,
                                  const roc::Device& dev) {
 
+  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
 
   // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
@@ -1425,53 +1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
       // Find threads per group
       thrPerGrp = devKernel->workGroupInfo()->size_;
 
-      size_t tmp = thrPerGrp;
-      // Split the local workgroup into the most efficient way
-      for (uint d = 0; d < sizes.dimensions(); ++d) {
-          size_t div = tmp;
-          for (; (sizes.global()[d] % div) != 0; div--)
-            ;
-          sizes.local()[d] = div;
-          tmp /= div;
-      }
-
-      // Assuming DWORD access
-      const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
-
-      // Check if partial dispatch is enabled and
-      if (dev.settings().partialDispatch_ &&
-          // we couldn't find optimal workload
-          ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
-                // or size is too small for the cache line
-           (sizes.local()[0] < cacheLineMatch))) {
-        size_t maxSize = 0;
-        size_t maxDim = 0;
-        for (uint d = 0; d < sizes.dimensions(); ++d) {
-          if (maxSize < sizes.global()[d]) {
-            maxSize = sizes.global()[d];
-            maxDim = d;
-          }
-        }
-
-        if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
-          sizes.local()[0] = cacheLineMatch;
-          thrPerGrp /= cacheLineMatch;
-          sizes.local()[maxDim] = thrPerGrp;
-          for (uint d = 1; d < sizes.dimensions(); ++d) {
-            if (d != maxDim) {
-              sizes.local()[d] = 1;
-            }
-          }
+      if (gpuKernel.imageEnable() &&
+          // and the thread group size is a multiple of the wavefront size
+          ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
+          // and it's a 2- or 3-dimensional workload
+          (sizes.dimensions() > 1) &&
+          ((dev.settings().partialDispatch_) ||
+           (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
+        // Use 8x8 workgroup size if the kernel writes images or the group size is not the preferred one
+        if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
+          sizes.local()[0] = 8;
+          sizes.local()[1] = 8;
         }
         else {
-          // Check if a local workgroup has the most optimal size
-          if (thrPerGrp > maxSize) {
-            thrPerGrp = maxSize;
-          }
-          sizes.local()[maxDim] = thrPerGrp;
+          sizes.local()[0] = 16;
+          sizes.local()[1] = 16;
+        }
+        if (sizes.dimensions() == 3)  {
+          sizes.local()[2] = 1;
+        }
+      }
+      else {
+        size_t tmp = thrPerGrp;
+        // Split the local workgroup into the most efficient way
+        for (uint d = 0; d < sizes.dimensions(); ++d) {
+            size_t div = tmp;
+            for (; (sizes.global()[d] % div) != 0; div--)
+              ;
+            sizes.local()[d] = div;
+            tmp /= div;
+        }
+
+        // Assuming DWORD access
+        const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
+
+        // Check if partial dispatch is enabled and
+        if (dev.settings().partialDispatch_ &&
+            // we couldn't find optimal workload
+            ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
+                  // or size is too small for the cache line
+             (sizes.local()[0] < cacheLineMatch))) {
+          size_t maxSize = 0;
+          size_t maxDim = 0;
           for (uint d = 0; d < sizes.dimensions(); ++d) {
-            if (d != maxDim) {
-              sizes.local()[d] = 1;
+            if (maxSize < sizes.global()[d]) {
+              maxSize = sizes.global()[d];
+              maxDim = d;
+            }
+          }
+
+          if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
+            sizes.local()[0] = cacheLineMatch;
+            thrPerGrp /= cacheLineMatch;
+            sizes.local()[maxDim] = thrPerGrp;
+            for (uint d = 1; d < sizes.dimensions(); ++d) {
+              if (d != maxDim) {
+                sizes.local()[d] = 1;
+              }
+            }
+          }
+          else {
+            // Check if a local workgroup has the most optimal size
+            if (thrPerGrp > maxSize) {
+              thrPerGrp = maxSize;
+            }
+            sizes.local()[maxDim] = thrPerGrp;
+            for (uint d = 0; d < sizes.dimensions(); ++d) {
+              if (d != maxDim) {
+                sizes.local()[d] = 1;
+              }
             }
           }
         }