P4 to Git Change 1505728 by wchau@wchau_OCL_boltzmann on 2018/01/19 13:13:53
SWDEV-137270 - Add findLocalWorkSize to OpenCL runtime ROC device
- Add support for kernel using image argument
- fix the issue of not using the max workgroup size specified by the environment variables "maxWorkGroupSize*"
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#33 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#50 edit
[ROCm/clr commit: 87ad3f9692]
This commit is contained in:
@@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
||||
desc.typeName_ = arg->typeName_.c_str();
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
flags_.imageEnable_ = true;
|
||||
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
|
||||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
|
||||
flags_.imageWrite_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Make a check if it is local or global
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
desc.size_ = 0;
|
||||
@@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
flags_.imageEnable_ = true;
|
||||
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
|
||||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
|
||||
flags_.imageWrite_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Make a check if it is local or global
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
desc.size_ = 0;
|
||||
|
||||
@@ -127,10 +127,18 @@ class Kernel : public device::Kernel {
|
||||
//! set internal kernel flag
|
||||
void setInternalKernelFlag(bool flag) {
  // Store the boolean as an explicit 0/1 value in the one-bit field.
  flags_.internalKernel_ = flag ? 1u : 0u;
}
|
||||
|
||||
//! Return TRUE if kernel uses images
|
||||
bool imageEnable() const {
  // The bit is set during argument initialization when an image argument is seen.
  return flags_.imageEnable_ != 0;
}
|
||||
|
||||
//! Return TRUE if kernel writes images
|
||||
bool imageWrite() const {
  // The bit is set when an image argument is write-only or read-write.
  return flags_.imageWrite_ != 0;
}
|
||||
|
||||
protected:
|
||||
union Flags {
|
||||
struct {
|
||||
uint internalKernel_ : 1; //!< Is a blit kernel?
|
||||
uint imageEnable_ : 1; //!< Kernel uses images
|
||||
uint imageWrite_ : 1; //!< Kernel writes images
|
||||
};
|
||||
uint value_;
|
||||
Flags() : value_(0) {}
|
||||
|
||||
@@ -142,6 +142,23 @@ void Settings::override() {
|
||||
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
|
||||
}
|
||||
|
||||
if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
|
||||
maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
|
||||
maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
||||
}
|
||||
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
|
||||
maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
|
||||
maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
||||
}
|
||||
if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
|
||||
maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
|
||||
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
|
||||
}
|
||||
|
||||
@@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
|
||||
amd::NDRangeContainer sizes, device::Kernel* devKernel,
|
||||
const roc::Device& dev) {
|
||||
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
|
||||
|
||||
// Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
|
||||
@@ -1425,53 +1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
|
||||
// Find threads per group
|
||||
thrPerGrp = devKernel->workGroupInfo()->size_;
|
||||
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < sizes.dimensions(); ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (sizes.global()[d] % div) != 0; div--)
|
||||
;
|
||||
sizes.local()[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
|
||||
|
||||
// Check if partial dispatch is enabled and
|
||||
if (dev.settings().partialDispatch_ &&
|
||||
// we couldn't find optimal workload
|
||||
((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
|
||||
// or size is too small for the cache line
|
||||
(sizes.local()[0] < cacheLineMatch))) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
for (uint d = 0; d < sizes.dimensions(); ++d) {
|
||||
if (maxSize < sizes.global()[d]) {
|
||||
maxSize = sizes.global()[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
|
||||
if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
|
||||
sizes.local()[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
sizes.local()[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < sizes.dimensions(); ++d) {
|
||||
if (d != maxDim) {
|
||||
sizes.local()[d] = 1;
|
||||
}
|
||||
}
|
||||
if (gpuKernel.imageEnable() &&
|
||||
// and thread group is a multiple value of wavefronts
|
||||
((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
|
||||
// and it's 2 or 3-dimensional workload
|
||||
(sizes.dimensions() > 1) &&
|
||||
((dev.settings().partialDispatch_) ||
|
||||
(((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
|
||||
// Use 8x8 workgroup size if kernel has image writes
|
||||
if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
|
||||
sizes.local()[0] = 8;
|
||||
sizes.local()[1] = 8;
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
sizes.local()[maxDim] = thrPerGrp;
|
||||
sizes.local()[0] = 16;
|
||||
sizes.local()[1] = 16;
|
||||
}
|
||||
if (sizes.dimensions() == 3) {
|
||||
sizes.local()[2] = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
size_t tmp = thrPerGrp;
|
||||
// Split the local workgroup into the most efficient way
|
||||
for (uint d = 0; d < sizes.dimensions(); ++d) {
|
||||
size_t div = tmp;
|
||||
for (; (sizes.global()[d] % div) != 0; div--)
|
||||
;
|
||||
sizes.local()[d] = div;
|
||||
tmp /= div;
|
||||
}
|
||||
|
||||
// Assuming DWORD access
|
||||
const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
|
||||
|
||||
// Check if partial dispatch is enabled and
|
||||
if (dev.settings().partialDispatch_ &&
|
||||
// we couldn't find optimal workload
|
||||
((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
|
||||
// or size is too small for the cache line
|
||||
(sizes.local()[0] < cacheLineMatch))) {
|
||||
size_t maxSize = 0;
|
||||
size_t maxDim = 0;
|
||||
for (uint d = 0; d < sizes.dimensions(); ++d) {
|
||||
if (d != maxDim) {
|
||||
sizes.local()[d] = 1;
|
||||
if (maxSize < sizes.global()[d]) {
|
||||
maxSize = sizes.global()[d];
|
||||
maxDim = d;
|
||||
}
|
||||
}
|
||||
|
||||
if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
|
||||
sizes.local()[0] = cacheLineMatch;
|
||||
thrPerGrp /= cacheLineMatch;
|
||||
sizes.local()[maxDim] = thrPerGrp;
|
||||
for (uint d = 1; d < sizes.dimensions(); ++d) {
|
||||
if (d != maxDim) {
|
||||
sizes.local()[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Check if a local workgroup has the most optimal size
|
||||
if (thrPerGrp > maxSize) {
|
||||
thrPerGrp = maxSize;
|
||||
}
|
||||
sizes.local()[maxDim] = thrPerGrp;
|
||||
for (uint d = 0; d < sizes.dimensions(); ++d) {
|
||||
if (d != maxDim) {
|
||||
sizes.local()[d] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
مرجع در شماره جدید
Block a user