P4 to Git Change 1505728 by wchau@wchau_OCL_boltzmann on 2018/01/19 13:13:53

SWDEV-137270 - Add findLocalWorkSize to OpenCL runtime ROC device
	- Add support for kernel using image argument
	- Fix the issue of not using the max workgroup size specified by the environment variables "maxWorkGroupSize*"

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#33 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#50 edit


[ROCm/clr commit: 87ad3f9692]
This commit is contained in:
foreman
2018-01-19 13:33:55 -05:00
والد 09a507e8d0
کامیت 2fd27167df
4فایلهای تغییر یافته به همراه110 افزوده شده و 44 حذف شده
@@ -545,6 +545,15 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arg->typeName_.c_str();
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
flags_.imageEnable_ = true;
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
flags_.imageWrite_ = true;
}
}
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
@@ -615,6 +624,15 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
flags_.imageEnable_ = true;
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
flags_.imageWrite_ = true;
}
}
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
@@ -127,10 +127,18 @@ class Kernel : public device::Kernel {
//! Stores \a flag in the internalKernel_ bit of the kernel flags
void setInternalKernelFlag(bool flag) {
  flags_.internalKernel_ = flag ? 1 : 0;
}
//! Returns true when the imageEnable_ flag bit is set, i.e. the kernel uses images
bool imageEnable() const {
  return flags_.imageEnable_ != 0;
}
//! Returns true when the imageWrite_ flag bit is set, i.e. the kernel writes images
bool imageWrite() const {
  return flags_.imageWrite_ != 0;
}
protected:
union Flags {
struct {
uint internalKernel_ : 1; //!< Is a blit kernel?
uint imageEnable_ : 1; //!< Kernel uses images
uint imageWrite_ : 1; //!< Kernel writes images
};
uint value_;
Flags() : value_(0) {}
@@ -142,6 +142,23 @@ void Settings::override() {
preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
}
if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
}
if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
}
if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
}
@@ -1401,6 +1401,7 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
amd::NDRangeContainer sizes, device::Kernel* devKernel,
const roc::Device& dev) {
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
// Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
@@ -1425,53 +1426,75 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
// Find threads per group
thrPerGrp = devKernel->workGroupInfo()->size_;
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < sizes.dimensions(); ++d) {
size_t div = tmp;
for (; (sizes.global()[d] % div) != 0; div--)
;
sizes.local()[d] = div;
tmp /= div;
}
// Assuming DWORD access
const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
// Check if partial dispatch is enabled and
if (dev.settings().partialDispatch_ &&
// we couldn't find optimal workload
((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
// or size is too small for the cache line
(sizes.local()[0] < cacheLineMatch))) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < sizes.dimensions(); ++d) {
if (maxSize < sizes.global()[d]) {
maxSize = sizes.global()[d];
maxDim = d;
}
}
if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
sizes.local()[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
sizes.local()[maxDim] = thrPerGrp;
for (uint d = 1; d < sizes.dimensions(); ++d) {
if (d != maxDim) {
sizes.local()[d] = 1;
}
}
if (gpuKernel.imageEnable() &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % devKernel->workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(sizes.dimensions() > 1) &&
((dev.settings().partialDispatch_) ||
(((sizes.global()[0] % 16) == 0) && ((sizes.global()[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes or the group size is not the preferred one
if (gpuKernel.imageWrite() || (thrPerGrp != dev.settings().preferredWorkGroupSize_)) {
sizes.local()[0] = 8;
sizes.local()[1] = 8;
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
sizes.local()[maxDim] = thrPerGrp;
sizes.local()[0] = 16;
sizes.local()[1] = 16;
}
if (sizes.dimensions() == 3) {
sizes.local()[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < sizes.dimensions(); ++d) {
size_t div = tmp;
for (; (sizes.global()[d] % div) != 0; div--)
;
sizes.local()[d] = div;
tmp /= div;
}
// Assuming DWORD access
const uint cacheLineMatch = dev.info().globalMemCacheLineSize_ >> 2;
// Check if partial dispatch is enabled and
if (dev.settings().partialDispatch_ &&
// we couldn't find optimal workload
((sizes.local().product() % devKernel->workGroupInfo()->wavefrontSize_) != 0 ||
// or size is too small for the cache line
(sizes.local()[0] < cacheLineMatch))) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < sizes.dimensions(); ++d) {
if (d != maxDim) {
sizes.local()[d] = 1;
if (maxSize < sizes.global()[d]) {
maxSize = sizes.global()[d];
maxDim = d;
}
}
if ((maxDim != 0) && (sizes.global()[0] >= (cacheLineMatch / 2))) {
sizes.local()[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
sizes.local()[maxDim] = thrPerGrp;
for (uint d = 1; d < sizes.dimensions(); ++d) {
if (d != maxDim) {
sizes.local()[d] = 1;
}
}
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
sizes.local()[maxDim] = thrPerGrp;
for (uint d = 0; d < sizes.dimensions(); ++d) {
if (d != maxDim) {
sizes.local()[d] = 1;
}
}
}
}