P4 to Git Change 1505728 by wchau@wchau_OCL_boltzmann on 2018/01/19 13:13:53

SWDEV-137270 - Add findLocalWorkSize to OpenCL runtime ROC device - Add support for kernel using image argument - fix the issue of not using the max workgroup size specified by the environment variables "maxWorkGroupSize*" Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#33 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#50 edit [ROCm/clr commit: 87ad3f9692]
2018-01-19 13:33:55 -05:00
@@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
    desc.typeQualifier_ = GetOclTypeQual(aclArg);
    desc.typeName_ = arg->typeName_.c_str();

+    // set image related flags
+    if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+      flags_.imageEnable_ = true;
+      if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+          desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+        flags_.imageWrite_ = true;
+      }
+    }
+
    // Make a check if it is local or global
    if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
      desc.size_ = 0;
@@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
    desc.typeQualifier_ = GetOclTypeQual(lcArg);
    desc.typeName_ = lcArg.mTypeName.c_str();

+    // set image related flags
+    if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+      flags_.imageEnable_ = true;
+      if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+          desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+        flags_.imageWrite_ = true;
+      }
+    }
+
    // Make a check if it is local or global
    if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
      desc.size_ = 0;
@@ -127,10 +127,18 @@ class Kernel : public device::Kernel {
  //! set internal kernel flag
  void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }

+  //! Return TRUE if kernel uses images
+  bool imageEnable() const { return (flags_.imageEnable_) ? true : false; }
+
+  //! Return TRUE if kernel writes images
+  bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
+
 protected:
  union Flags {
    struct {
      uint internalKernel_ : 1;  //!< Is a blit kernel?
+      uint imageEnable_    : 1;  //!< Kernel uses images
+      uint imageWrite_     : 1;  //!< Kernel writes images
    };
    uint value_;
    Flags() : value_(0) {}
@@ -142,6 +142,23 @@ void Settings::override() {
    preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
  }

+  if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
+    maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
+    maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+  }
+
+  if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
+    maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
+    maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
+    maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+  }
+
  if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
    commandQueues_ = GPU_MAX_COMMAND_QUEUES;
  }
@@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
                                 amd::NDRangeContainer sizes, device::Kernel* devKernel,
                                 const roc::Device& dev) {

+  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
  const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;

  // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
@@ -1425,53 +1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
      // Find threads per group
      thrPerGrp = devKernel->workGroupInfo()->size_;

-      size_t tmp = thrPerGrp;
-      // Split the local workgroup into the most efficient way
-      for (uint d = 0; d < sizes.dimensions(); ++d) {
-          size_t div = tmp;
-          for (; (sizes.global()[d] % div) != 0; div--)
-            ;
-          sizes.local()[d] = div;
-          tmp /= div;
-      }
-
-      // Assuming DWORD access
-      const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
-
-      // Check if partial dispatch is enabled and
-      if (dev.settings().partialDispatch_ &&
-          // we couldn't find optimal workload
-          ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
-                // or size is too small for the cache line
-           (sizes.local()[0] < cacheLineMatch))) {
-        size_t maxSize = 0;
-        size_t maxDim = 0;
-        for (uint d = 0; d < sizes.dimensions(); ++d) {
-          if (maxSize < sizes.global()[d]) {
-            maxSize = sizes.global()[d];
-            maxDim = d;
-          }
-        }
-
-        if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
-          sizes.local()[0] = cacheLineMatch;
-          thrPerGrp /= cacheLineMatch;
-          sizes.local()[maxDim] = thrPerGrp;
-          for (uint d = 1; d < sizes.dimensions(); ++d) {
-            if (d != maxDim) {
-              sizes.local()[d] = 1;
-            }
-          }
+      if (gpuKernel.imageEnable() &&
+          // and thread group is a multiple value of wavefronts
+          ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
+          // and it's 2 or 3-dimensional workload
+          (sizes.dimensions() > 1) &&
+          ((dev.settings().partialDispatch_) ||
+           (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
+          // Use 8x8 workgroup size if kernel writes images or group size is not the preferred one
+        if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
+          sizes.local()[0] = 8;
+          sizes.local()[1] = 8;
        }
        else {
-          // Check if a local workgroup has the most optimal size
-          if (thrPerGrp > maxSize) {
-            thrPerGrp = maxSize;
-          }
-          sizes.local()[maxDim] = thrPerGrp;
+          sizes.local()[0] = 16;
+          sizes.local()[1] = 16;
+        }
+        if (sizes.dimensions() == 3)  {
+          sizes.local()[2] = 1;
+        }
+      }
+      else {
+        size_t tmp = thrPerGrp;
+        // Split the local workgroup into the most efficient way
+        for (uint d = 0; d < sizes.dimensions(); ++d) {
+            size_t div = tmp;
+            for (; (sizes.global()[d] % div) != 0; div--)
+              ;
+            sizes.local()[d] = div;
+            tmp /= div;
+        }
+
+        // Assuming DWORD access
+        const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
+
+        // Check if partial dispatch is enabled and
+        if (dev.settings().partialDispatch_ &&
+            // we couldn't find optimal workload
+            ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
+                  // or size is too small for the cache line
+             (sizes.local()[0] < cacheLineMatch))) {
+          size_t maxSize = 0;
+          size_t maxDim = 0;
          for (uint d = 0; d < sizes.dimensions(); ++d) {
-            if (d != maxDim) {
-              sizes.local()[d] = 1;
+            if (maxSize < sizes.global()[d]) {
+              maxSize = sizes.global()[d];
+              maxDim = d;
+            }
+          }
+
+          if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
+            sizes.local()[0] = cacheLineMatch;
+            thrPerGrp /= cacheLineMatch;
+            sizes.local()[maxDim] = thrPerGrp;
+            for (uint d = 1; d < sizes.dimensions(); ++d) {
+              if (d != maxDim) {
+                sizes.local()[d] = 1;
+              }
+            }
+          }
+          else {
+            // Check if a local workgroup has the most optimal size
+            if (thrPerGrp > maxSize) {
+              thrPerGrp = maxSize;
+            }
+            sizes.local()[maxDim] = thrPerGrp;
+            for (uint d = 0; d < sizes.dimensions(); ++d) {
+              if (d != maxDim) {
+                sizes.local()[d] = 1;
+              }
            }
          }
        }