From 5ee211e801f1e20eb1e972213fb2d98c8bb47b03 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 29 Aug 2018 12:35:08 -0400
Subject: [PATCH] P4 to Git Change 1599472 by gandryey@gera-w8 on 2018/08/29
12:25:34
SWDEV-79445 - OCL generic changes and code clean-up
- Move FindLocalWorkSize() logic to the abstraction layer
- Replace the ROCr path with the common FindLocalWorkSize() functionality
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#227 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#314 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#330 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#67 edit
---
rocclr/runtime/device/device.cpp | 31 ++++
rocclr/runtime/device/device.hpp | 1 +
rocclr/runtime/device/devkernel.cpp | 179 ++++++++++++++++++---
rocclr/runtime/device/devkernel.hpp | 19 ++-
rocclr/runtime/device/gpu/gpukernel.cpp | 103 +-----------
rocclr/runtime/device/gpu/gpukernel.hpp | 7 -
rocclr/runtime/device/pal/palkernel.cpp | 120 +-------------
rocclr/runtime/device/pal/palkernel.hpp | 7 -
rocclr/runtime/device/rocm/rockernel.cpp | 2 +-
rocclr/runtime/device/rocm/rocsettings.cpp | 11 +-
rocclr/runtime/device/rocm/rocsettings.hpp | 3 +-
rocclr/runtime/device/rocm/rocvirtual.cpp | 141 +---------------
12 files changed, 216 insertions(+), 408 deletions(-)
diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp
index acb52b0142..edacb8557c 100644
--- a/rocclr/runtime/device/device.cpp
+++ b/rocclr/runtime/device/device.cpp
@@ -585,6 +585,37 @@ Settings::Settings() {
//!< concurrent Virtual GPUs for default
}
+void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
+ const amd::Coord3D region, uint mapFlags, bool entire,
+ amd::Image* baseMip) {
+ // Map/Unmap must be serialized.
+ amd::ScopedLock lock(owner()->lockMemoryOps());
+
+ WriteMapInfo info = {};
+ WriteMapInfo* pInfo = &info;
+ auto it = writeMapInfo_.find(mapAddress);
+ if (it != writeMapInfo_.end()) {
+ LogWarning("Double map of the same or overlapped region!");
+ pInfo = &it->second;
+ }
+
+ if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
+ pInfo->origin_ = origin;
+ pInfo->region_ = region;
+ pInfo->entire_ = entire;
+ pInfo->unmapWrite_ = true;
+ }
+ if (mapFlags & CL_MAP_READ) {
+ pInfo->unmapRead_ = true;
+ }
+ pInfo->baseMip_ = baseMip;
+
+ // Insert into the map if it's the first region
+ if (++pInfo->count_ == 1) {
+ writeMapInfo_.insert({ mapAddress, info });
+ }
+}
+
Program::Program(amd::Device& device)
: device_(device),
type_(TYPE_NONE),
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index 5a30609600..ec89e63f5b 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -517,6 +517,7 @@ class Settings : public amd::HeapObject {
uint commandQueues_; //!< Field value for maximum number
//!< concurrent Virtual GPUs for each backend
+
//! Default constructor
Settings();
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index e1cfc0c42a..9c4b43f960 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -3,6 +3,7 @@
//
#include "platform/runtime.hpp"
#include "platform/program.hpp"
+#include "platform/ndrange.hpp"
#include "devkernel.hpp"
#include "utils/macros.hpp"
#include "utils/options.hpp"
@@ -22,6 +23,7 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
namespace device {
+ // ================================================================================================
bool Kernel::createSignature(
const parameters_t& params, uint32_t numParameters,
uint32_t version) {
@@ -63,45 +65,139 @@ bool Kernel::createSignature(
return false;
}
+// ================================================================================================
Kernel::~Kernel() { delete signature_; }
+// ================================================================================================
std::string Kernel::openclMangledName(const std::string& name) {
const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
assert(bifSym && "symbol not found");
return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
}
-void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
- const amd::Coord3D region, uint mapFlags, bool entire,
- amd::Image* baseMip) {
- // Map/Unmap must be serialized.
- amd::ScopedLock lock(owner()->lockMemoryOps());
+// ================================================================================================
+void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
+ amd::NDRange& lclWorkSize) const {
+ // Initialize the default workgoup info
+ // Check if the kernel has the compiled sizes
+ if (workGroupInfo()->compileSize_[0] == 0) {
+ // Find the default local workgroup size, if it wasn't specified
+ if (lclWorkSize[0] == 0) {
+ bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
+ bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
+ !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
+ bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
+ !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
+ !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
- WriteMapInfo info = {};
- WriteMapInfo* pInfo = &info;
- auto it = writeMapInfo_.find(mapAddress);
- if (it != writeMapInfo_.end()) {
- LogWarning("Double map of the same or overlapped region!");
- pInfo = &it->second;
- }
+ bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
+ ((workDim == 3) && b3DOverrideSet);
+ if (!overrideSet) {
+ // Find threads per group
+ size_t thrPerGrp = workGroupInfo()->size_;
- if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
- pInfo->origin_ = origin;
- pInfo->region_ = region;
- pInfo->entire_ = entire;
- pInfo->unmapWrite_ = true;
- }
- if (mapFlags & CL_MAP_READ) {
- pInfo->unmapRead_ = true;
- }
- pInfo->baseMip_ = baseMip;
+ // Check if kernel uses images
+ if (flags_.imageEna_ &&
+ // and thread group is a multiple value of wavefronts
+ ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+ // and it's 2 or 3-dimensional workload
+ (workDim > 1) && ((dev().settings().partialDispatch_) ||
+ (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
+ // Use 8x8 workgroup size if kernel has image writes
+ if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
+ lclWorkSize[0] = 8;
+ lclWorkSize[1] = 8;
+ }
+ else {
+ lclWorkSize[0] = 16;
+ lclWorkSize[1] = 16;
+ }
+ if (workDim == 3) {
+ lclWorkSize[2] = 1;
+ }
+ }
+ else {
+ size_t tmp = thrPerGrp;
+ // Split the local workgroup into the most efficient way
+ for (uint d = 0; d < workDim; ++d) {
+ size_t div = tmp;
+ for (; (gblWorkSize[d] % div) != 0; div--)
+ ;
+ lclWorkSize[d] = div;
+ tmp /= div;
+ }
- // Insert into the map if it's the first region
- if (++pInfo->count_ == 1) {
- writeMapInfo_.insert({ mapAddress, info });
+ // Assuming DWORD access
+ const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2;
+
+ // Check if partial dispatch is enabled and
+ if (dev().settings().partialDispatch_ &&
+ // we couldn't find optimal workload
+ (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+ // or size is too small for the cache line
+ (lclWorkSize[0] < cacheLineMatch))) {
+ size_t maxSize = 0;
+ size_t maxDim = 0;
+ for (uint d = 0; d < workDim; ++d) {
+ if (maxSize < gblWorkSize[d]) {
+ maxSize = gblWorkSize[d];
+ maxDim = d;
+ }
+ }
+ // Use X dimension as high priority. Runtime will assume that
+ // X dimension is more important for the address calculation
+ if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+ lclWorkSize[0] = cacheLineMatch;
+ thrPerGrp /= cacheLineMatch;
+ lclWorkSize[maxDim] = thrPerGrp;
+ for (uint d = 1; d < workDim; ++d) {
+ if (d != maxDim) {
+ lclWorkSize[d] = 1;
+ }
+ }
+ }
+ else {
+ // Check if a local workgroup has the most optimal size
+ if (thrPerGrp > maxSize) {
+ thrPerGrp = maxSize;
+ }
+ lclWorkSize[maxDim] = thrPerGrp;
+ for (uint d = 0; d < workDim; ++d) {
+ if (d != maxDim) {
+ lclWorkSize[d] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ else {
+ // Use overrides when app doesn't provide workgroup dimensions
+ if (workDim == 1) {
+ lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
+ }
+ else if (workDim == 2) {
+ lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
+ lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+ }
+ else if (workDim == 3) {
+ lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
+ lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+ lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+ }
+ else {
+ assert(0 && "Invalid workDim!");
+ }
+ }
+ }
+ }
+ else {
+ for (uint d = 0; d < workDim; ++d) {
+ lclWorkSize[d] = workGroupInfo()->compileSize_[d];
+ }
}
}
-
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
using llvm::AMDGPU::HSAMD::AccessQualifier;
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
@@ -145,6 +241,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
}
}
#endif
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
@@ -189,6 +286,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
}
#endif
+// ================================================================================================
static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
@@ -198,6 +296,7 @@ static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
};
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
uint sizeType;
@@ -274,6 +373,7 @@ static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t si
return T_VOID;
}
#endif
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
uint sizeType;
@@ -351,9 +451,12 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
switch (argInfo->type) {
@@ -392,6 +495,7 @@ static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -405,6 +509,8 @@ static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
return 1;
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -414,6 +520,7 @@ static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
@@ -429,6 +536,8 @@ static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
return false;
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -441,9 +550,12 @@ static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; }
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
inline static int GetArgSizeOCL(const aclArgData* argInfo) {
switch (argInfo->type) {
@@ -481,6 +593,7 @@ inline static int GetArgSizeOCL(const aclArgData* argInfo) {
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -504,6 +617,8 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgM
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -534,6 +649,7 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::Image) {
@@ -550,6 +666,8 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArg
return CL_KERNEL_ARG_ACCESS_NONE;
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_IMAGE) {
@@ -566,6 +684,7 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgDat
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -588,6 +707,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD&
return rv;
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -618,6 +739,7 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
// Iterate through the arguments and insert into parameterList
@@ -689,6 +811,8 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
// Iterate through the arguments and insert into parameterList
@@ -769,6 +893,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
}
#endif
+// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
for (auto str : printfInfoStrings) {
@@ -860,6 +985,8 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
}
}
#endif // defined(WITH_LIGHTNING_COMPILER)
+
+// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) {
PrintfInfo info;
diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp
index 9140ae28d7..94200a92b0 100644
--- a/rocclr/runtime/device/devkernel.hpp
+++ b/rocclr/runtime/device/devkernel.hpp
@@ -35,6 +35,7 @@ namespace amd {
class Device;
class KernelSignature;
+class NDRange;
struct ProfilingCallback : public amd::HeapObject {
virtual void callback(ulong duration, uint32_t waves) = 0;
@@ -123,7 +124,10 @@ class Kernel : public amd::HeapObject {
};
//! Default constructor
- Kernel(const std::string& name) : name_(name), signature_(NULL) {
+ Kernel(const amd::Device& dev, const std::string& name)
+ : dev_(dev)
+ , name_(name)
+ , signature_(nullptr) {
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
// Due to std::string not being able to be memset to 0
workGroupInfo_.size_ = 0;
@@ -193,13 +197,16 @@ class Kernel : public amd::HeapObject {
//! Get profiling callback object
virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
- return NULL;
+ return nullptr;
}
virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
return 0;
}
+ //! Returns GPU device object, associated with this kernel
+ const amd::Device& dev() const { return dev_; }
+
void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
@@ -237,6 +244,13 @@ class Kernel : public amd::HeapObject {
//! Return printf info array
const std::vector& printfInfo() const { return printf_; }
+ //! Finds local workgroup size
+ void FindLocalWorkSize(
+ size_t workDim, //!< Work dimension
+ const amd::NDRange& gblWorkSize, //!< Global work size
+ amd::NDRange& lclWorkSize //!< Calculated local work size
+ ) const;
+
protected:
//! Initializes the abstraction layer kernel parameters
#if defined(WITH_LIGHTNING_COMPILER)
@@ -252,6 +266,7 @@ class Kernel : public amd::HeapObject {
//! Initializes HSAIL Printf metadata and info
void InitPrintf(const aclPrintfFmt* aclPrintf);
#endif
+ const amd::Device& dev_; //!< GPU device object
std::string name_; //!< kernel name
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
amd::KernelSignature* signature_; //!< kernel signature
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 26879b9639..fe88dbd499 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -515,7 +515,7 @@ clk_value_type_t KernelArg::type() const {
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
const NullProgram& nullprog)
- : device::Kernel(name),
+ : device::Kernel(gpuNullDev, name),
buildError_(CL_BUILD_PROGRAM_FAILURE),
gpuDev_(gpuNullDev),
prog_(nullprog),
@@ -3047,9 +3047,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
uint extraArgsNum)
- : device::Kernel(name),
+ : device::Kernel(prog->dev(), name),
compileOptions_(compileOptions),
- dev_(prog->dev()),
prog_(*prog),
index_(0),
code_(NULL),
@@ -3241,102 +3240,6 @@ const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast(prog_);
}
-void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
- amd::NDRange& lclWorkSize) const {
- // Initialize the default workgoup info
- // Check if the kernel has the compiled sizes
- if (workGroupInfo()->compileSize_[0] == 0) {
- // Find the default local workgroup size, if it wasn't specified
- if (lclWorkSize[0] == 0) {
- bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
- bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
- bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
- bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
- ((workDim == 3) && b3DOverrideSet);
- if (!overrideSet) {
- // Find threads per group
- size_t thrPerGrp = workGroupInfo()->size_;
-
- // Check if kernel uses images
- if (flags_.imageEna_ &&
- // and thread group is a multiple value of wavefronts
- ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
- // and it's 2 or 3-dimensional workload
- (workDim > 1) && ((dev().settings().partialDispatch_) ||
- (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
- // Use 8x8 workgroup size if kernel has image writes
- if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
- lclWorkSize[0] = 8;
- lclWorkSize[1] = 8;
- } else {
- lclWorkSize[0] = 16;
- lclWorkSize[1] = 16;
- }
- if (workDim == 3) {
- lclWorkSize[2] = 1;
- }
- } else {
- size_t tmp = thrPerGrp;
- // Split the local workgroup into the most efficient way
- for (uint d = 0; d < workDim; ++d) {
- size_t div = tmp;
- for (; (gblWorkSize[d] % div) != 0; div--)
- ;
- lclWorkSize[d] = div;
- tmp /= div;
- }
-
- // Check if partial dispatch is enabled and
- if (dev().settings().partialDispatch_ &&
- // we couldn't find optimal workload
- (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
- size_t maxSize = 0;
- size_t maxDim = 0;
- for (uint d = 0; d < workDim; ++d) {
- if (maxSize < gblWorkSize[d]) {
- maxSize = gblWorkSize[d];
- maxDim = d;
- }
- }
- // Check if a local workgroup has the most optimal size
- if (thrPerGrp > maxSize) {
- thrPerGrp = maxSize;
- }
- lclWorkSize[maxDim] = thrPerGrp;
- for (uint d = 0; d < workDim; ++d) {
- if (d != maxDim) {
- lclWorkSize[d] = 1;
- }
- }
- }
- }
- } else {
- // Use overrides when app doesn't provide workgroup dimensions
- if (workDim == 1) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
- } else if (workDim == 2) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
- lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
- } else if (workDim == 3) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
- lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
- lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
- } else {
- assert(0 && "Invalid workDim!");
- }
- }
- }
- } else {
- for (uint d = 0; d < workDim; ++d) {
- lclWorkSize[d] = workGroupInfo()->compileSize_[d];
- }
- }
-}
-
inline static void WriteAqlArg(
unsigned char** dst, //!< The write pointer to the buffer
const void* src, //!< The source pointer
@@ -3576,7 +3479,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
- findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+ FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index d1543ee496..544cc9e9e7 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel {
//! Returns spill reg size per workitem
int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; }
- //! Finds local workgroup size
- void findLocalWorkSize(size_t workDim, //!< Work dimension
- const amd::NDRange& gblWorkSize, //!< Global work size
- amd::NDRange& lclWorkSize //!< Local work size
- ) const;
-
//! Returns AQL packet in CPU memory
//! if the kerenl arguments were successfully loaded, otherwise NULL
hsa_kernel_dispatch_packet_t* loadArguments(
@@ -870,7 +864,6 @@ class HSAILKernel : public device::Kernel {
std::vector arguments_; //!< Vector list of HSAIL Arguments
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
- const NullDevice& dev_; //!< GPU device object
const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index a646e0c3e5..7c330fbbd4 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -69,9 +69,8 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
}
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
- : device::Kernel(name),
+ : device::Kernel(prog->dev(), name),
compileOptions_(compileOptions),
- dev_(prog->dev()),
prog_(*prog),
index_(0),
code_(0),
@@ -253,121 +252,6 @@ const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast(prog_);
}
-void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
- amd::NDRange& lclWorkSize) const {
- // Initialize the default workgoup info
- // Check if the kernel has the compiled sizes
- if (workGroupInfo()->compileSize_[0] == 0) {
- // Find the default local workgroup size, if it wasn't specified
- if (lclWorkSize[0] == 0) {
- bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
- bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
- bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
- bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
- ((workDim == 3) && b3DOverrideSet);
- if (!overrideSet) {
- // Find threads per group
- size_t thrPerGrp = workGroupInfo()->size_;
-
- // Check if kernel uses images
- if (flags_.imageEna_ &&
- // and thread group is a multiple value of wavefronts
- ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
- // and it's 2 or 3-dimensional workload
- (workDim > 1) && ((dev().settings().partialDispatch_) ||
- (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
- // Use 8x8 workgroup size if kernel has image writes
- if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
- lclWorkSize[0] = 8;
- lclWorkSize[1] = 8;
- } else {
- lclWorkSize[0] = 16;
- lclWorkSize[1] = 16;
- }
- if (workDim == 3) {
- lclWorkSize[2] = 1;
- }
- } else {
- size_t tmp = thrPerGrp;
- // Split the local workgroup into the most efficient way
- for (uint d = 0; d < workDim; ++d) {
- size_t div = tmp;
- for (; (gblWorkSize[d] % div) != 0; div--)
- ;
- lclWorkSize[d] = div;
- tmp /= div;
- }
-
- // Assuming DWORD access
- const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
-
- // Check if partial dispatch is enabled and
- if (dev().settings().partialDispatch_ &&
- // we couldn't find optimal workload
- (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
- // or size is too small for the cache line
- (lclWorkSize[0] < cacheLineMatch))) {
- size_t maxSize = 0;
- size_t maxDim = 0;
- for (uint d = 0; d < workDim; ++d) {
- if (maxSize < gblWorkSize[d]) {
- maxSize = gblWorkSize[d];
- maxDim = d;
- }
- }
- // Use X dimension as high priority. Runtime will assume that
- // X dimension is more important for the address calculation
- if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
- lclWorkSize[0] = cacheLineMatch;
- thrPerGrp /= cacheLineMatch;
- lclWorkSize[maxDim] = thrPerGrp;
- for (uint d = 1; d < workDim; ++d) {
- if (d != maxDim) {
- lclWorkSize[d] = 1;
- }
- }
- }
- else {
- // Check if a local workgroup has the most optimal size
- if (thrPerGrp > maxSize) {
- thrPerGrp = maxSize;
- }
- lclWorkSize[maxDim] = thrPerGrp;
- for (uint d = 0; d < workDim; ++d) {
- if (d != maxDim) {
- lclWorkSize[d] = 1;
- }
- }
- }
- }
- }
- } else {
- // Use overrides when app doesn't provide workgroup dimensions
- if (workDim == 1) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
- } else if (workDim == 2) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
- lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
- } else if (workDim == 3) {
- lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
- lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
- lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
- } else {
- assert(0 && "Invalid workDim!");
- }
- }
- }
- } else {
- for (uint d = 0; d < workDim; ++d) {
- lclWorkSize[d] = workGroupInfo()->compileSize_[d];
- }
- }
-}
-
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
@@ -450,7 +334,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
- findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+ FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
constexpr uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp
index 525bc517e3..43fc6ff185 100644
--- a/rocclr/runtime/device/pal/palkernel.hpp
+++ b/rocclr/runtime/device/pal/palkernel.hpp
@@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel {
//! Returns spill reg size per workitem
int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); }
- //! Finds local workgroup size
- void findLocalWorkSize(size_t workDim, //!< Work dimension
- const amd::NDRange& gblWorkSize, //!< Global work size
- amd::NDRange& lclWorkSize //!< Local work size
- ) const;
-
//! Returns AQL packet in CPU memory
//! if the kernel arguments were successfully loaded, otherwise NULL
hsa_kernel_dispatch_packet_t* loadArguments(
@@ -127,7 +121,6 @@ class HSAILKernel : public device::Kernel {
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
- const NullDevice& dev_; //!< GPU device object
const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp
index 227758da2c..a63c9f9767 100644
--- a/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/rocclr/runtime/device/rocm/rockernel.cpp
@@ -15,7 +15,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
- : device::Kernel(name),
+ : device::Kernel(prog->dev(), name),
program_(prog),
kernelCodeHandle_(kernelCodeHandle),
workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp
index 039016781d..31e235eeb1 100644
--- a/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -43,16 +43,7 @@ Settings::Settings() {
nonCoherentMode = getenv("OPENCL_USE_NC_MEMORY_POLICY");
enableNCMode_ = (nonCoherentMode) ? true : false;
- // Determine if user wishes to disable support for
- // partial dispatch. By default support for partial
- // dispatch is enabled. Users can turn it off for
- // devices that do not support this feature.
- //
- // @note Update appropriate field of device::Settings
- char* partialDispatch = nullptr;
- partialDispatch = getenv("OPENCL_DISABLE_PARTIAL_DISPATCH");
- enablePartialDispatch_ = (partialDispatch) ? false : true;
- partialDispatch_ = (partialDispatch) ? false : true;
+ partialDispatch_ = GPU_PARTIAL_DISPATCH;
commandQueues_ = 100; //!< Field value set to maximum number
//!< concurrent Virtual GPUs for ROCm backend
diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp
index d7d6dd11ef..4462907694 100644
--- a/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -24,13 +24,12 @@ class Settings : public device::Settings {
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
- uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint singleFpDenorm_ : 1; //!< Support Single FP Denorm
uint apuSystem_ : 1; //!< APU system
- uint reserved_ : 21;
+ uint reserved_ : 22;
};
uint value_;
};
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index f9dae19ac1..4af1354d6e 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1764,132 +1764,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
profilingEnd(vcmd);
}
-// Over rides the workgroup size fields in the packet with runtime/compiler set sizes
-void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
- amd::NDRangeContainer sizes, device::Kernel* devKernel,
- const roc::Device& dev) {
-
- Kernel& gpuKernel = static_cast(*devKernel);
- const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
-
- // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
- // else this error check is incorrect
- if (compile_size[0] || compile_size[1] || compile_size[2]) {
- dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? compile_size[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? compile_size[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? compile_size[2] : 1;
- } else {
- size_t thrPerGrp;
- bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
- bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
- bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
- !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
-
- bool overrideSet = ((sizes.dimensions() == 1) && b1DOverrideSet) ||
- ((sizes.dimensions() == 2) && b2DOverrideSet) ||
- ((sizes.dimensions() == 3) && b3DOverrideSet);
- if (!overrideSet) {
- // Find threads per group
- thrPerGrp = devKernel->workGroupInfo()->size_;
-
- if (gpuKernel.imageEnable() &&
- // and thread group is a multiple value of wavefronts
- ((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
- // and it's 2 or 3-dimensional workload
- (sizes.dimensions() > 1) &&
- ((dev.settings().partialDispatch_) ||
- (((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
- // Use 8x8 workgroup size if kernel has image writes)
- if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
- sizes.local()[0] = 8;
- sizes.local()[1] = 8;
- }
- else {
- sizes.local()[0] = 16;
- sizes.local()[1] = 16;
- }
- if (sizes.dimensions() == 3) {
- sizes.local()[2] = 1;
- }
- }
- else {
- size_t tmp = thrPerGrp;
- // Split the local workgroup into the most efficient way
- for (uint d = 0; d < sizes.dimensions(); ++d) {
- size_t div = tmp;
- for (; (sizes.global()[d] % div) != 0; div--)
- ;
- sizes.local()[d] = div;
- tmp /= div;
- }
-
- // Assuming DWORD access
- const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
-
- // Check if partial dispatch is enabled and
- if (dev.settings().partialDispatch_ &&
- // we couldn't find optimal workload
- ((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
- // or size is too small for the cache line
- (sizes.local()[0] < cacheLineMatch))) {
- size_t maxSize = 0;
- size_t maxDim = 0;
- for (uint d = 0; d < sizes.dimensions(); ++d) {
- if (maxSize < sizes.global()[d]) {
- maxSize = sizes.global()[d];
- maxDim = d;
- }
- }
-
- if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
- sizes.local()[0] = cacheLineMatch;
- thrPerGrp /= cacheLineMatch;
- sizes.local()[maxDim] = thrPerGrp;
- for (uint d = 1; d < sizes.dimensions(); ++d) {
- if (d != maxDim) {
- sizes.local()[d] = 1;
- }
- }
- }
- else {
- // Check if a local workgroup has the most optimal size
- if (thrPerGrp > maxSize) {
- thrPerGrp = maxSize;
- }
- sizes.local()[maxDim] = thrPerGrp;
- for (uint d = 0; d < sizes.dimensions(); ++d) {
- if (d != maxDim) {
- sizes.local()[d] = 1;
- }
- }
- }
- }
- }
- dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1;
- } else {
- // Runtime must set the group size
- dispatchPacket.workgroup_size_x = 1;
- dispatchPacket.workgroup_size_y = 1;
- dispatchPacket.workgroup_size_z = 1;
-
- if (sizes.dimensions() == 1) {
- dispatchPacket.workgroup_size_x = dev.settings().preferredWorkGroupSize_;
- } else if (sizes.dimensions() == 2) {
- dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_;
- dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_;
- } else if (sizes.dimensions() == 3) {
- dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_;
- dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_;
- dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_;
- }
- }
- }
-}
-
bool VirtualGPU::createSchedulerParam()
{
if (nullptr != schedulerParam_) {
@@ -2235,15 +2109,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1;
- if (sizes.local().product() != 0) {
- dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? sizes.local()[2] : 1;
- } else {
- amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
- &(const_cast(sizes).local()[0]));
- setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, devKernel, dev());
- }
+ amd::NDRange local(sizes.local());
+ devKernel->FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+ dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? local[0] : 1;
+ dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? local[1] : 1;
+ dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
+
dispatchPacket.kernarg_address = argBuffer;
dispatchPacket.group_segment_size = ldsUsage;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;