From 5ee211e801f1e20eb1e972213fb2d98c8bb47b03 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 29 Aug 2018 12:35:08 -0400 Subject: [PATCH] P4 to Git Change 1599472 by gandryey@gera-w8 on 2018/08/29 12:25:34 SWDEV-79445 - OCL generic changes and code clean-up - Move FindLocalWorkSize() logic to the abstraction layer - Replace the ROCr path with the common FindLocalWorkSize() functionality Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#227 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#314 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#330 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#132 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#22 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#36 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#67 edit --- rocclr/runtime/device/device.cpp | 31 ++++ rocclr/runtime/device/device.hpp | 1 + rocclr/runtime/device/devkernel.cpp | 179 ++++++++++++++++++--- rocclr/runtime/device/devkernel.hpp | 19 ++- rocclr/runtime/device/gpu/gpukernel.cpp | 103 +----------- rocclr/runtime/device/gpu/gpukernel.hpp | 7 - rocclr/runtime/device/pal/palkernel.cpp | 120 +------------- rocclr/runtime/device/pal/palkernel.hpp | 7 - rocclr/runtime/device/rocm/rockernel.cpp | 2 +- rocclr/runtime/device/rocm/rocsettings.cpp | 11 +- rocclr/runtime/device/rocm/rocsettings.hpp | 3 +- rocclr/runtime/device/rocm/rocvirtual.cpp | 141 +--------------- 12 files changed, 216 insertions(+), 408 deletions(-) diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp index acb52b0142..edacb8557c 100644 --- a/rocclr/runtime/device/device.cpp +++ b/rocclr/runtime/device/device.cpp @@ -585,6 +585,37 @@ Settings::Settings() { //!< concurrent Virtual GPUs for default } +void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, + const amd::Coord3D region, uint mapFlags, bool entire, + amd::Image* baseMip) { + // Map/Unmap must be serialized. + amd::ScopedLock lock(owner()->lockMemoryOps()); + + WriteMapInfo info = {}; + WriteMapInfo* pInfo = &info; + auto it = writeMapInfo_.find(mapAddress); + if (it != writeMapInfo_.end()) { + LogWarning("Double map of the same or overlapped region!"); + pInfo = &it->second; + } + + if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { + pInfo->origin_ = origin; + pInfo->region_ = region; + pInfo->entire_ = entire; + pInfo->unmapWrite_ = true; + } + if (mapFlags & CL_MAP_READ) { + pInfo->unmapRead_ = true; + } + pInfo->baseMip_ = baseMip; + + // Insert into the map if it's the first region + if (++pInfo->count_ == 1) { + writeMapInfo_.insert({ mapAddress, info }); + } +} + Program::Program(amd::Device& device) : device_(device), type_(TYPE_NONE), diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index 5a30609600..ec89e63f5b 100644 --- a/rocclr/runtime/device/device.hpp +++ b/rocclr/runtime/device/device.hpp @@ -517,6 +517,7 @@ class Settings : public amd::HeapObject { uint commandQueues_; //!< Field value for maximum number //!< concurrent Virtual GPUs for each backend + //! Default constructor Settings(); diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp index e1cfc0c42a..9c4b43f960 100644 --- a/rocclr/runtime/device/devkernel.cpp +++ b/rocclr/runtime/device/devkernel.cpp @@ -3,6 +3,7 @@ // #include "platform/runtime.hpp" #include "platform/program.hpp" +#include "platform/ndrange.hpp" #include "devkernel.hpp" #include "utils/macros.hpp" #include "utils/options.hpp" @@ -22,6 +23,7 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; namespace device { + // ================================================================================================ bool Kernel::createSignature( const parameters_t& params, uint32_t numParameters, uint32_t version) { @@ -63,45 +65,139 @@ bool Kernel::createSignature( return false; } +// ================================================================================================ Kernel::~Kernel() { delete signature_; } +// ================================================================================================ std::string Kernel::openclMangledName(const std::string& name) { const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); assert(bifSym && "symbol not found"); return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; } -void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, - const amd::Coord3D region, uint mapFlags, bool entire, - amd::Image* baseMip) { - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); +// ================================================================================================ +void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, + amd::NDRange& lclWorkSize) const { + // Initialize the default workgoup info + // Check if the kernel has the compiled sizes + if (workGroupInfo()->compileSize_[0] == 0) { + // Find the default local workgroup size, if it wasn't specified + if (lclWorkSize[0] == 0) { + bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); + bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); + bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - WriteMapInfo info = {}; - WriteMapInfo* pInfo = &info; - auto it = writeMapInfo_.find(mapAddress); - if (it != writeMapInfo_.end()) { - LogWarning("Double map of the same or overlapped region!"); - pInfo = &it->second; - } + bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || + ((workDim == 3) && b3DOverrideSet); + if (!overrideSet) { + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; - if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { - pInfo->origin_ = origin; - pInfo->region_ = region; - pInfo->entire_ = entire; - pInfo->unmapWrite_ = true; - } - if (mapFlags & CL_MAP_READ) { - pInfo->unmapRead_ = true; - } - pInfo->baseMip_ = baseMip; + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && ((dev().settings().partialDispatch_) || + (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } + else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } - // Insert into the map if it's the first region - if (++pInfo->count_ == 1) { - writeMapInfo_.insert({ mapAddress, info }); + // Assuming DWORD access + const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2; + + // Check if partial dispatch is enabled and + if (dev().settings().partialDispatch_ && + // we couldn't find optimal workload + (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch))) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + } + } + } + else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } + else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + else { + assert(0 && "Invalid workDim!"); + } + } + } + } + else { + for (uint d = 0; d < workDim; ++d) { + lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } } } - +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) using llvm::AMDGPU::HSAMD::AccessQualifier; using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; @@ -145,6 +241,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH } } #endif +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) { if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { @@ -189,6 +286,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is } #endif +// ================================================================================================ static const clk_value_type_t ClkValueMapType[6][6] = { { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, @@ -198,6 +296,7 @@ static const clk_value_type_t ClkValueMapType[6][6] = { { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, }; +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) { uint sizeType; @@ -274,6 +373,7 @@ static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t si return T_VOID; } #endif +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) { uint sizeType; @@ -351,9 +451,12 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { switch (argInfo->type) { @@ -392,6 +495,7 @@ static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) { if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { @@ -405,6 +509,8 @@ static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) { return 1; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { @@ -414,6 +520,7 @@ static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) { if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) { @@ -429,6 +536,8 @@ static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) { return false; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { @@ -441,9 +550,12 @@ static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) inline static int GetArgSizeOCL(const aclArgData* argInfo) { switch (argInfo->type) { @@ -481,6 +593,7 @@ inline static int GetArgSizeOCL(const aclArgData* argInfo) { } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) { if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { @@ -504,6 +617,8 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgM return CL_KERNEL_ARG_ADDRESS_PRIVATE; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { @@ -534,6 +649,7 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) { if (lcArg.mValueKind == ValueKind::Image) { @@ -550,6 +666,8 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArg return CL_KERNEL_ARG_ACCESS_NONE; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_IMAGE) { @@ -566,6 +684,7 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgDat } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) { cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; @@ -588,6 +707,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& return rv; } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) { cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; @@ -618,6 +739,7 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { // Iterate through the arguments and insert into parameterList @@ -689,6 +811,8 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); } #endif + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) { // Iterate through the arguments and insert into parameterList @@ -769,6 +893,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) { } #endif +// ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) void Kernel::InitPrintf(const std::vector& printfInfoStrings) { for (auto str : printfInfoStrings) { @@ -860,6 +985,8 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { } } #endif // defined(WITH_LIGHTNING_COMPILER) + +// ================================================================================================ #if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) { PrintfInfo info; diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp index 9140ae28d7..94200a92b0 100644 --- a/rocclr/runtime/device/devkernel.hpp +++ b/rocclr/runtime/device/devkernel.hpp @@ -35,6 +35,7 @@ namespace amd { class Device; class KernelSignature; +class NDRange; struct ProfilingCallback : public amd::HeapObject { virtual void callback(ulong duration, uint32_t waves) = 0; @@ -123,7 +124,10 @@ class Kernel : public amd::HeapObject { }; //! Default constructor - Kernel(const std::string& name) : name_(name), signature_(NULL) { + Kernel(const amd::Device& dev, const std::string& name) + : dev_(dev) + , name_(name) + , signature_(nullptr) { // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); // Due to std::string not being able to be memset to 0 workGroupInfo_.size_ = 0; @@ -193,13 +197,16 @@ class Kernel : public amd::HeapObject { //! Get profiling callback object virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) { - return NULL; + return nullptr; } virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const { return 0; } + //! Returns GPU device object, associated with this kernel + const amd::Device& dev() const { return dev_; } + void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } @@ -237,6 +244,13 @@ class Kernel : public amd::HeapObject { //! Return printf info array const std::vector& printfInfo() const { return printf_; } + //! Finds local workgroup size + void FindLocalWorkSize( + size_t workDim, //!< Work dimension + const amd::NDRange& gblWorkSize, //!< Global work size + amd::NDRange& lclWorkSize //!< Calculated local work size + ) const; + protected: //! Initializes the abstraction layer kernel parameters #if defined(WITH_LIGHTNING_COMPILER) @@ -252,6 +266,7 @@ class Kernel : public amd::HeapObject { //! Initializes HSAIL Printf metadata and info void InitPrintf(const aclPrintfFmt* aclPrintf); #endif + const amd::Device& dev_; //!< GPU device object std::string name_; //!< kernel name WorkGroupInfo workGroupInfo_; //!< device kernel info structure amd::KernelSignature* signature_; //!< kernel signature diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 26879b9639..fe88dbd499 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -515,7 +515,7 @@ clk_value_type_t KernelArg::type() const { NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev, const NullProgram& nullprog) - : device::Kernel(name), + : device::Kernel(gpuNullDev, name), buildError_(CL_BUILD_PROGRAM_FAILURE), gpuDev_(gpuNullDev), prog_(nullprog), @@ -3047,9 +3047,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum) - : device::Kernel(name), + : device::Kernel(prog->dev(), name), compileOptions_(compileOptions), - dev_(prog->dev()), prog_(*prog), index_(0), code_(NULL), @@ -3241,102 +3240,6 @@ const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } -void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize) const { - // Initialize the default workgoup info - // Check if the kernel has the compiled sizes - if (workGroupInfo()->compileSize_[0] == 0) { - // Find the default local workgroup size, if it wasn't specified - if (lclWorkSize[0] == 0) { - bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - - bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || - ((workDim == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && ((dev().settings().partialDispatch_) || - (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } - - // Check if partial dispatch is enabled and - if (dev().settings().partialDispatch_ && - // we couldn't find optimal workload - (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - } - } else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } else { - assert(0 && "Invalid workDim!"); - } - } - } - } else { - for (uint d = 0; d < workDim; ++d) { - lclWorkSize[d] = workGroupInfo()->compileSize_[d]; - } - } -} - inline static void WriteAqlArg( unsigned char** dst, //!< The write pointer to the buffer const void* src, //!< The source pointer @@ -3576,7 +3479,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( const amd::NDRange& global = sizes.global(); // Check if runtime has to find local workgroup size - findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + FindLocalWorkSize(sizes.dimensions(), sizes.global(), local); hsaDisp->header = kDispatchPacketHeader; hsaDisp->setup = sizes.dimensions(); diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index d1543ee496..544cc9e9e7 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel { //! Returns spill reg size per workitem int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; } - //! Finds local workgroup size - void findLocalWorkSize(size_t workDim, //!< Work dimension - const amd::NDRange& gblWorkSize, //!< Global work size - amd::NDRange& lclWorkSize //!< Local work size - ) const; - //! Returns AQL packet in CPU memory //! if the kerenl arguments were successfully loaded, otherwise NULL hsa_kernel_dispatch_packet_t* loadArguments( @@ -870,7 +864,6 @@ class HSAILKernel : public device::Kernel { std::vector arguments_; //!< Vector list of HSAIL Arguments std::string compileOptions_; //!< compile used for finalizing this kernel amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU - const NullDevice& dev_; //!< GPU device object const HSAILProgram& prog_; //!< Reference to the parent program uint index_; //!< Kernel index in the program diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index a646e0c3e5..7c330fbbd4 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -69,9 +69,8 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { } HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) - : device::Kernel(name), + : device::Kernel(prog->dev(), name), compileOptions_(compileOptions), - dev_(prog->dev()), prog_(*prog), index_(0), code_(0), @@ -253,121 +252,6 @@ const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } -void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize) const { - // Initialize the default workgoup info - // Check if the kernel has the compiled sizes - if (workGroupInfo()->compileSize_[0] == 0) { - // Find the default local workgroup size, if it wasn't specified - if (lclWorkSize[0] == 0) { - bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - - bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || - ((workDim == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && ((dev().settings().partialDispatch_) || - (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2; - - // Check if partial dispatch is enabled and - if (dev().settings().partialDispatch_ && - // we couldn't find optimal workload - (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch))) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - } - } - } else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } else { - assert(0 && "Invalid workDim!"); - } - } - } - } else { - for (uint d = 0; d < workDim; ++d) { - lclWorkSize[d] = workGroupInfo()->compileSize_[d]; - } - } -} - hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { @@ -450,7 +334,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( const amd::NDRange& global = sizes.global(); // Check if runtime has to find local workgroup size - findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + FindLocalWorkSize(sizes.dimensions(), sizes.global(), local); constexpr uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp index 525bc517e3..43fc6ff185 100644 --- a/rocclr/runtime/device/pal/palkernel.hpp +++ b/rocclr/runtime/device/pal/palkernel.hpp @@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel { //! Returns spill reg size per workitem int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); } - //! Finds local workgroup size - void findLocalWorkSize(size_t workDim, //!< Work dimension - const amd::NDRange& gblWorkSize, //!< Global work size - amd::NDRange& lclWorkSize //!< Local work size - ) const; - //! Returns AQL packet in CPU memory //! if the kernel arguments were successfully loaded, otherwise NULL hsa_kernel_dispatch_packet_t* loadArguments( @@ -127,7 +121,6 @@ class HSAILKernel : public device::Kernel { std::string compileOptions_; //!< compile used for finalizing this kernel amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU - const NullDevice& dev_; //!< GPU device object const HSAILProgram& prog_; //!< Reference to the parent program uint index_; //!< Kernel index in the program diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp index 227758da2c..a63c9f9767 100644 --- a/rocclr/runtime/device/rocm/rockernel.cpp +++ b/rocclr/runtime/device/rocm/rockernel.cpp @@ -15,7 +15,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle const uint32_t workgroupGroupSegmentByteSize, const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, const uint32_t kernargSegmentAlignment) - : device::Kernel(name), + : device::Kernel(prog->dev(), name), program_(prog), kernelCodeHandle_(kernelCodeHandle), workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize), diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp index 039016781d..31e235eeb1 100644 --- a/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/rocclr/runtime/device/rocm/rocsettings.cpp @@ -43,16 +43,7 @@ Settings::Settings() { nonCoherentMode = getenv("OPENCL_USE_NC_MEMORY_POLICY"); enableNCMode_ = (nonCoherentMode) ? true : false; - // Determine if user wishes to disable support for - // partial dispatch. By default support for partial - // dispatch is enabled. Users can turn it off for - // devices that do not support this feature. - // - // @note Update appropriate field of device::Settings - char* partialDispatch = nullptr; - partialDispatch = getenv("OPENCL_DISABLE_PARTIAL_DISPATCH"); - enablePartialDispatch_ = (partialDispatch) ? false : true; - partialDispatch_ = (partialDispatch) ? false : true; + partialDispatch_ = GPU_PARTIAL_DISPATCH; commandQueues_ = 100; //!< Field value set to maximum number //!< concurrent Virtual GPUs for ROCm backend diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp index d7d6dd11ef..4462907694 100644 --- a/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/rocclr/runtime/device/rocm/rocsettings.hpp @@ -24,13 +24,12 @@ class Settings : public device::Settings { uint enableLocalMemory_ : 1; //!< Enable GPUVM memory uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory - uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read uint stagedXferWrite_ : 1; //!< Uses a staged buffer write uint singleFpDenorm_ : 1; //!< Support Single FP Denorm uint apuSystem_ : 1; //!< APU system - uint reserved_ : 21; + uint reserved_ : 22; }; uint value_; }; diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index f9dae19ac1..4af1354d6e 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1764,132 +1764,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { profilingEnd(vcmd); } -// Over rides the workgroup size fields in the packet with runtime/compiler set sizes -void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, - amd::NDRangeContainer sizes, device::Kernel* devKernel, - const roc::Device& dev) { - - Kernel& gpuKernel = static_cast(*devKernel); - const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; - - // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid - // else this error check is incorrect - if (compile_size[0] || compile_size[1] || compile_size[2]) { - dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? compile_size[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? compile_size[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? compile_size[2] : 1; - } else { - size_t thrPerGrp; - bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - - bool overrideSet = ((sizes.dimensions() == 1) && b1DOverrideSet) || - ((sizes.dimensions() == 2) && b2DOverrideSet) || - ((sizes.dimensions() == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - thrPerGrp = devKernel->workGroupInfo()->size_; - - if (gpuKernel.imageEnable() && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (sizes.dimensions() > 1) && - ((dev.settings().partialDispatch_) || - (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes) - if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) { - sizes.local()[0] = 8; - sizes.local()[1] = 8; - } - else { - sizes.local()[0] = 16; - sizes.local()[1] = 16; - } - if (sizes.dimensions() == 3) { - sizes.local()[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < sizes.dimensions(); ++d) { - size_t div = tmp; - for (; (sizes.global()[d] % div) != 0; div--) - ; - sizes.local()[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2; - - // Check if partial dispatch is enabled and - if (dev.settings().partialDispatch_ && - // we couldn't find optimal workload - ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 || - // or size is too small for the cache line - (sizes.local()[0] < cacheLineMatch))) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < sizes.dimensions(); ++d) { - if (maxSize < sizes.global()[d]) { - maxSize = sizes.global()[d]; - maxDim = d; - } - } - - if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) { - sizes.local()[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - sizes.local()[maxDim] = thrPerGrp; - for (uint d = 1; d < sizes.dimensions(); ++d) { - if (d != maxDim) { - sizes.local()[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - sizes.local()[maxDim] = thrPerGrp; - for (uint d = 0; d < sizes.dimensions(); ++d) { - if (d != maxDim) { - sizes.local()[d] = 1; - } - } - } - } - } - dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1; - } else { - // Runtime must set the group size - dispatchPacket.workgroup_size_x = 1; - dispatchPacket.workgroup_size_y = 1; - dispatchPacket.workgroup_size_z = 1; - - if (sizes.dimensions() == 1) { - dispatchPacket.workgroup_size_x = dev.settings().preferredWorkGroupSize_; - } else if (sizes.dimensions() == 2) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_; - } else if (sizes.dimensions() == 3) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_; - dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_; - } - } - } -} - bool VirtualGPU::createSchedulerParam() { if (nullptr != schedulerParam_) { @@ -2235,15 +2109,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1; dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1; - if (sizes.local().product() != 0) { - dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1; - } else { - amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0], - &(const_cast(sizes).local()[0])); - setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, devKernel, dev()); - } + amd::NDRange local(sizes.local()); + devKernel->FindLocalWorkSize(sizes.dimensions(), sizes.global(), local); + dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? local[0] : 1; + dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? local[1] : 1; + dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1; + dispatchPacket.kernarg_address = argBuffer; dispatchPacket.group_segment_size = ldsUsage; dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;