From 2fd27167df9c67ef971804b056099be5cc2815ea Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 19 Jan 2018 13:33:55 -0500
Subject: [PATCH] P4 to Git Change 1505728 by wchau@wchau_OCL_boltzmann on
2018/01/19 13:13:53
SWDEV-137270 - Add findLocalWorkSize to OpenCL runtime ROC device
- Add support for kernel using image argument
- fix the issue of not using the max workgroup size specified by the environment variables "maxWorkGroupSize*"
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#33 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#50 edit
[ROCm/clr commit: 87ad3f9692fc1570a3a9c4a4d789f7b12e9329c8]
---
.../rocclr/runtime/device/rocm/rockernel.cpp | 18 +++
.../rocclr/runtime/device/rocm/rockernel.hpp | 8 ++
.../runtime/device/rocm/rocsettings.cpp | 17 +++
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 111 +++++++++++-------
4 files changed, 110 insertions(+), 44 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index 01aa905f6f..99573a8da2 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arg->typeName_.c_str();
+ // set image related flags
+ if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+ flags_.imageEnable_ = true;
+ if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+ desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+ flags_.imageWrite_ = true;
+ }
+ }
+
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
@@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
+ // set image related flags
+ if (arg->type_ == ROC_ARGTYPE_IMAGE) {
+ flags_.imageEnable_ = true;
+ if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
+ desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
+ flags_.imageWrite_ = true;
+ }
+ }
+
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
index 782ca66c4e..926b304d00 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
@@ -127,10 +127,18 @@ class Kernel : public device::Kernel {
//! set internal kernel flag
void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
+ //! Return TRUE if kernel uses images
+ bool imageEnable() const { return (flags_.imageEnable_) ? true : false; }
+
+ //! Return TRUE if kernel writes images
+ bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
+
protected:
union Flags {
struct {
uint internalKernel_ : 1; //!< Is a blit kernel?
+ uint imageEnable_ : 1; //!< Kernel uses images
+ uint imageWrite_ : 1; //!< Kernel writes images
};
uint value_;
Flags() : value_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
index 72423a67dc..442871edde 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -142,6 +142,23 @@ void Settings::override() {
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
}
+ if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
+ maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
+ }
+ if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
+ maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+ }
+
+ if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
+ maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
+ }
+ if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
+ maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+ }
+ if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
+ maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+ }
+
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 947eaa4c27..70dbf3bf8a 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
amd::NDRangeContainer sizes, device::Kernel* devKernel,
const roc::Device& dev) {
+ Kernel& gpuKernel = static_cast(*devKernel);
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
// Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
@@ -1425,53 +1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
// Find threads per group
thrPerGrp = devKernel->workGroupInfo()->size_;
- size_t tmp = thrPerGrp;
- // Split the local workgroup into the most efficient way
- for (uint d = 0; d < sizes.dimensions(); ++d) {
- size_t div = tmp;
- for (; (sizes.global()[d] % div) != 0; div--)
- ;
- sizes.local()[d] = div;
- tmp /= div;
- }
-
- // Assuming DWORD access
- const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
-
- // Check if partial dispatch is enabled and
- if (dev.settings().partialDispatch_ &&
- // we couldn't find optimal workload
- ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
- // or size is too small for the cache line
- (sizes.local()[0] < cacheLineMatch))) {
- size_t maxSize = 0;
- size_t maxDim = 0;
- for (uint d = 0; d < sizes.dimensions(); ++d) {
- if (maxSize < sizes.global()[d]) {
- maxSize = sizes.global()[d];
- maxDim = d;
- }
- }
-
- if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
- sizes.local()[0] = cacheLineMatch;
- thrPerGrp /= cacheLineMatch;
- sizes.local()[maxDim] = thrPerGrp;
- for (uint d = 1; d < sizes.dimensions(); ++d) {
- if (d != maxDim) {
- sizes.local()[d] = 1;
- }
- }
+ if (gpuKernel.imageEnable() &&
+ // and thread group size is a multiple of the wavefront size
+ ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
+ // and it's 2 or 3-dimensional workload
+ (sizes.dimensions() > 1) &&
+ ((dev.settings().partialDispatch_) ||
+ (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
+ // Use 8x8 workgroup size if kernel has image writes or the thread group size is not the preferred one
+ if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
+ sizes.local()[0] = 8;
+ sizes.local()[1] = 8;
}
else {
- // Check if a local workgroup has the most optimal size
- if (thrPerGrp > maxSize) {
- thrPerGrp = maxSize;
- }
- sizes.local()[maxDim] = thrPerGrp;
+ sizes.local()[0] = 16;
+ sizes.local()[1] = 16;
+ }
+ if (sizes.dimensions() == 3) {
+ sizes.local()[2] = 1;
+ }
+ }
+ else {
+ size_t tmp = thrPerGrp;
+ // Split the local workgroup into the most efficient way
+ for (uint d = 0; d < sizes.dimensions(); ++d) {
+ size_t div = tmp;
+ for (; (sizes.global()[d] % div) != 0; div--)
+ ;
+ sizes.local()[d] = div;
+ tmp /= div;
+ }
+
+ // Assuming DWORD access
+ const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
+
+ // Check if partial dispatch is enabled and
+ if (dev.settings().partialDispatch_ &&
+ // we couldn't find optimal workload
+ ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
+ // or size is too small for the cache line
+ (sizes.local()[0] < cacheLineMatch))) {
+ size_t maxSize = 0;
+ size_t maxDim = 0;
for (uint d = 0; d < sizes.dimensions(); ++d) {
- if (d != maxDim) {
- sizes.local()[d] = 1;
+ if (maxSize < sizes.global()[d]) {
+ maxSize = sizes.global()[d];
+ maxDim = d;
+ }
+ }
+
+ if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
+ sizes.local()[0] = cacheLineMatch;
+ thrPerGrp /= cacheLineMatch;
+ sizes.local()[maxDim] = thrPerGrp;
+ for (uint d = 1; d < sizes.dimensions(); ++d) {
+ if (d != maxDim) {
+ sizes.local()[d] = 1;
+ }
+ }
+ }
+ else {
+ // Check if a local workgroup has the most optimal size
+ if (thrPerGrp > maxSize) {
+ thrPerGrp = maxSize;
+ }
+ sizes.local()[maxDim] = thrPerGrp;
+ for (uint d = 0; d < sizes.dimensions(); ++d) {
+ if (d != maxDim) {
+ sizes.local()[d] = 1;
+ }
}
}
}