P4 to Git Change 1599472 by gandryey@gera-w8 on 2018/08/29 12:25:34

SWDEV-79445 - OCL generic changes and code clean-up
	- Move FindLocalWorkSize() logic to the abstraction layer
	- Replace the ROCr path with the common FindLocalWorkSize() functionality

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#227 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#314 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#330 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#67 edit
This commit is contained in:
foreman
2018-08-29 12:35:08 -04:00
parent 3f02c35aea
commit 5ee211e801
12 changed files with 216 additions and 408 deletions
+153 -26
View File
@@ -3,6 +3,7 @@
//
#include "platform/runtime.hpp"
#include "platform/program.hpp"
#include "platform/ndrange.hpp"
#include "devkernel.hpp"
#include "utils/macros.hpp"
#include "utils/options.hpp"
@@ -22,6 +23,7 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
namespace device {
// ================================================================================================
bool Kernel::createSignature(
const parameters_t& params, uint32_t numParameters,
uint32_t version) {
@@ -63,45 +65,139 @@ bool Kernel::createSignature(
return false;
}
// ================================================================================================
Kernel::~Kernel() { delete signature_; }
// ================================================================================================
std::string Kernel::openclMangledName(const std::string& name) {
const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
assert(bifSym && "symbol not found");
return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
}
void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
const amd::Coord3D region, uint mapFlags, bool entire,
amd::Image* baseMip) {
// Map/Unmap must be serialized.
amd::ScopedLock lock(owner()->lockMemoryOps());
// ================================================================================================
void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const {
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
WriteMapInfo info = {};
WriteMapInfo* pInfo = &info;
auto it = writeMapInfo_.find(mapAddress);
if (it != writeMapInfo_.end()) {
LogWarning("Double map of the same or overlapped region!");
pInfo = &it->second;
}
bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
size_t thrPerGrp = workGroupInfo()->size_;
if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
pInfo->origin_ = origin;
pInfo->region_ = region;
pInfo->entire_ = entire;
pInfo->unmapWrite_ = true;
}
if (mapFlags & CL_MAP_READ) {
pInfo->unmapRead_ = true;
}
pInfo->baseMip_ = baseMip;
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) && ((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--)
;
lclWorkSize[d] = div;
tmp /= div;
}
// Insert into the map if it's the first region
if (++pInfo->count_ == 1) {
writeMapInfo_.insert({ mapAddress, info });
// Assuming DWORD access
const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2;
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
// or size is too small for the cache line
(lclWorkSize[0] < cacheLineMatch))) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Use X dimension as high priority. Runtime will assume that
// X dimension is more important for the address calculation
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
lclWorkSize[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 1; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else {
assert(0 && "Invalid workDim!");
}
}
}
}
else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
using llvm::AMDGPU::HSAMD::AccessQualifier;
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
@@ -145,6 +241,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
}
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
@@ -189,6 +286,7 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
}
#endif
// ================================================================================================
static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
@@ -198,6 +296,7 @@ static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
};
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
uint sizeType;
@@ -274,6 +373,7 @@ static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t si
return T_VOID;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
uint sizeType;
@@ -351,9 +451,12 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
switch (argInfo->type) {
@@ -392,6 +495,7 @@ static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -405,6 +509,8 @@ static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
return 1;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -414,6 +520,7 @@ static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
@@ -429,6 +536,8 @@ static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
return false;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -441,9 +550,12 @@ static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; }
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
inline static int GetArgSizeOCL(const aclArgData* argInfo) {
switch (argInfo->type) {
@@ -481,6 +593,7 @@ inline static int GetArgSizeOCL(const aclArgData* argInfo) {
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
@@ -504,6 +617,8 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgM
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
@@ -534,6 +649,7 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::Image) {
@@ -550,6 +666,8 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArg
return CL_KERNEL_ARG_ACCESS_NONE;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_IMAGE) {
@@ -566,6 +684,7 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgDat
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -588,6 +707,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD&
return rv;
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
@@ -618,6 +739,7 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
// Iterate through the arguments and insert into parameterList
@@ -689,6 +811,8 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
// Iterate through the arguments and insert into parameterList
@@ -769,6 +893,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
}
#endif
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
for (auto str : printfInfoStrings) {
@@ -860,6 +985,8 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
}
}
#endif // defined(WITH_LIGHTNING_COMPILER)
// ================================================================================================
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) {
PrintfInfo info;