diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index 01aa905f6f..99573a8da2 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) { desc.typeQualifier_ = GetOclTypeQual(aclArg); desc.typeName_ = arg->typeName_.c_str(); + // set image related flags + if (arg->type_ == ROC_ARGTYPE_IMAGE) { + flags_.imageEnable_ = true; + if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY || + desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) { + flags_.imageWrite_ = true; + } + } + // Make a check if it is local or global if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { desc.size_ = 0; @@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) { desc.typeQualifier_ = GetOclTypeQual(lcArg); desc.typeName_ = lcArg.mTypeName.c_str(); + // set image related flags + if (arg->type_ == ROC_ARGTYPE_IMAGE) { + flags_.imageEnable_ = true; + if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY || + desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) { + flags_.imageWrite_ = true; + } + } + // Make a check if it is local or global if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { desc.size_ = 0; diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp index 782ca66c4e..926b304d00 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp @@ -127,10 +127,18 @@ class Kernel : public device::Kernel { //! set internal kernel flag void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } + //! Return TRUE if kernel uses images + bool imageEnable() const { return (flags_.imageEnable_) ? true : false; } + + //! 
Return TRUE if kernel writes images + bool imageWrite() const { return (flags_.imageWrite_) ? true : false; } + protected: union Flags { struct { uint internalKernel_ : 1; //!< Is a blit kernel? + uint imageEnable_ : 1; //!< Kernel uses images + uint imageWrite_ : 1; //!< Kernel writes images }; uint value_; Flags() : value_(0) {} diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp index 72423a67dc..442871edde 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp @@ -142,6 +142,23 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { + maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { + maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + + if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { + maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { + maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { + maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) { commandQueues_ = GPU_MAX_COMMAND_QUEUES; } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 947eaa4c27..70dbf3bf8a 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, amd::NDRangeContainer sizes, device::Kernel* devKernel, const roc::Device& dev) { + Kernel& gpuKernel = static_cast(*devKernel); const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid @@ -1425,53
+1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, // Find threads per group thrPerGrp = devKernel->workGroupInfo()->size_; - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < sizes.dimensions(); ++d) { - size_t div = tmp; - for (; (sizes.global()[d] % div) != 0; div--) - ; - sizes.local()[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2; - - // Check if partial dispatch is enabled and - if (dev.settings().partialDispatch_ && - // we couldn't find optimal workload - ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 || - // or size is too small for the cache line - (sizes.local()[0] < cacheLineMatch))) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < sizes.dimensions(); ++d) { - if (maxSize < sizes.global()[d]) { - maxSize = sizes.global()[d]; - maxDim = d; - } - } - - if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) { - sizes.local()[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - sizes.local()[maxDim] = thrPerGrp; - for (uint d = 1; d < sizes.dimensions(); ++d) { - if (d != maxDim) { - sizes.local()[d] = 1; - } - } + if (gpuKernel.imageEnable() && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (sizes.dimensions() > 1) && + ((dev.settings().partialDispatch_) || + (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) { + sizes.local()[0] = 8; + sizes.local()[1] = 8; } else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - sizes.local()[maxDim] = thrPerGrp; + sizes.local()[0] = 16; +
sizes.local()[1] = 16; + } + if (sizes.dimensions() == 3) { + sizes.local()[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < sizes.dimensions(); ++d) { + size_t div = tmp; + for (; (sizes.global()[d] % div) != 0; div--) + ; + sizes.local()[d] = div; + tmp /= div; + } + + // Assuming DWORD access + const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2; + + // Check if partial dispatch is enabled and + if (dev.settings().partialDispatch_ && + // we couldn't find optimal workload + ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 || + // or size is too small for the cache line + (sizes.local()[0] < cacheLineMatch))) { + size_t maxSize = 0; + size_t maxDim = 0; for (uint d = 0; d < sizes.dimensions(); ++d) { - if (d != maxDim) { - sizes.local()[d] = 1; + if (maxSize < sizes.global()[d]) { + maxSize = sizes.global()[d]; + maxDim = d; + } + } + + if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) { + sizes.local()[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + sizes.local()[maxDim] = thrPerGrp; + for (uint d = 1; d < sizes.dimensions(); ++d) { + if (d != maxDim) { + sizes.local()[d] = 1; + } + } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + sizes.local()[maxDim] = thrPerGrp; + for (uint d = 0; d < sizes.dimensions(); ++d) { + if (d != maxDim) { + sizes.local()[d] = 1; + } } } }