From 46fa3c4e53881b4bb9b7db5aa65c1571eb02bedd Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 21 Jan 2020 12:36:01 -0600 Subject: [PATCH] P4 to Git Change 2060936 by gandryey@gera-win10 on 2020/01/21 13:28:16 SWDEV-197836 - Drop the use of llvm header files in opencl runtime - Remove usage of llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#77 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#102 edit [ROCm/clr commit: 69884318aca367042b2c87d7161ec04542be6328] --- .../clr/rocclr/runtime/device/devkernel.cpp | 490 ++++++------------ .../clr/rocclr/runtime/device/devkernel.hpp | 290 +++++------ .../clr/rocclr/runtime/device/devprogram.cpp | 6 - .../rocclr/runtime/device/pal/palprogram.cpp | 1 - 4 files changed, 302 insertions(+), 485 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp index aa40224fa3..55429f87f8 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.cpp +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -17,20 +17,20 @@ #include "acl.h" -#if defined(USE_COMGR_LIBRARY) -#include "llvm/Support/AMDGPUMetadata.h" - -typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; - -using llvm::AMDGPU::HSAMD::AccessQualifier; -using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; -using llvm::AMDGPU::HSAMD::ValueKind; -using llvm::AMDGPU::HSAMD::ValueType; -#endif // defined(USE_COMGR_LIBRARY) - namespace device { +// ================================================================================================ +static const clk_value_type_t ClkValueMapType[6][6] = { + {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, + {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16}, + {T_INT, T_INT2, T_INT3, 
T_INT4, T_INT8, T_INT16}, + {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16}, + {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16}, + {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16}, +}; + #if defined(USE_COMGR_LIBRARY) +// ================================================================================================ amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, std::string* str) { size_t size = 0; @@ -44,6 +44,27 @@ amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, return status; } +// ================================================================================================ +inline static clk_value_type_t UpdateArgType(uint sizeType, uint numElements) { + switch (numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } +} + +// ================================================================================================ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, const amd_comgr_metadata_node_t value, void *data) { @@ -70,20 +91,20 @@ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, // get the value of the argument field status = getMetaBuf(value, &buf); - KernelArgMD* lcArg = static_cast(data); + amd::KernelParameterDescriptor* lcArg = static_cast(data); switch (itArgField->second) { case ArgField::Name: - lcArg->mName = buf; + lcArg->name_ = buf; break; case ArgField::TypeName: - lcArg->mTypeName = buf; + lcArg->typeName_ = buf; break; case ArgField::Size: - lcArg->mSize = atoi(buf.c_str()); + lcArg->size_= atoi(buf.c_str()); break; case ArgField::Align: - lcArg->mAlign = atoi(buf.c_str()); + lcArg->alignment_ = atoi(buf.c_str()); break; 
case ArgField::ValueKind: { @@ -91,7 +112,25 @@ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, if (itValueKind == ArgValueKind.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mValueKind = itValueKind->second; + lcArg->info_.oclObject_ = itValueKind->second; + switch (lcArg->info_.oclObject_) { + case amd::KernelParameterDescriptor::MemoryObject: + if (itValueKind->first.compare("DynamicSharedPointer") == 0) { + lcArg->info_.shared_ = true; + } + break; + case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: + case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: + case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: + case amd::KernelParameterDescriptor::HiddenPrintfBuffer: + case amd::KernelParameterDescriptor::HiddenHostcallBuffer: + case amd::KernelParameterDescriptor::HiddenDefaultQueue: + case amd::KernelParameterDescriptor::HiddenCompletionAction: + case amd::KernelParameterDescriptor::HiddenMultiGridSync: + case amd::KernelParameterDescriptor::HiddenNone: + lcArg->info_.hidden_ = true; + break; + } } break; case ArgField::ValueType: @@ -99,12 +138,12 @@ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, auto itValueType = ArgValueType.find(buf); if (itValueType == ArgValueType.end()) { return AMD_COMGR_STATUS_ERROR; - } - lcArg->mValueType = itValueType->second; + } + lcArg->type_ = UpdateArgType(itValueType->second.first, itValueType->second.second); } break; case ArgField::PointeeAlign: - lcArg->mPointeeAlign = atoi(buf.c_str()); + lcArg->info_.arrayIndex_ = atoi(buf.c_str()); break; case ArgField::AddrSpaceQual: { @@ -112,7 +151,7 @@ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, if (itAddrSpaceQual == ArgAddrSpaceQual.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mAddrSpaceQual = itAddrSpaceQual->second; + lcArg->addressQualifier_ = itAddrSpaceQual->second; } break; case ArgField::AccQual: @@ -121,7 +160,9 @@ static amd_comgr_status_t 
populateArgs(const amd_comgr_metadata_node_t key, if (itAccQual == ArgAccQual.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mAccQual = itAccQual->second; + lcArg->accessQualifier_ = itAccQual->second; + lcArg->info_.readOnly_ = + (lcArg->accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; } break; case ArgField::ActualAccQual: @@ -130,20 +171,20 @@ static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, if (itAccQual == ArgAccQual.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mActualAccQual = itAccQual->second; + // lcArg->mActualAccQual = itAccQual->second; } break; case ArgField::IsConst: - lcArg->mIsConst = (buf.compare("true") == 0); + lcArg->typeQualifier_ |= (buf.compare("true") == 0) ? CL_KERNEL_ARG_TYPE_CONST : 0; break; case ArgField::IsRestrict: - lcArg->mIsRestrict = (buf.compare("true") == 0); + lcArg->typeQualifier_ |= (buf.compare("true") == 0) ? CL_KERNEL_ARG_TYPE_RESTRICT : 0; break; case ArgField::IsVolatile: - lcArg->mIsVolatile = (buf.compare("true") == 0); + lcArg->typeQualifier_ |= (buf.compare("true") == 0) ? CL_KERNEL_ARG_TYPE_VOLATILE : 0; break; case ArgField::IsPipe: - lcArg->mIsPipe = (buf.compare("true") == 0); + lcArg->typeQualifier_ |= (buf.compare("true") == 0) ? 
CL_KERNEL_ARG_TYPE_PIPE : 0; break; default: return AMD_COMGR_STATUS_ERROR; @@ -328,20 +369,20 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, // get the value of the argument field status = getMetaBuf(value, &buf); - KernelArgMD* lcArg = static_cast(data); + amd::KernelParameterDescriptor* lcArg = static_cast(data); switch (itArgField->second) { case ArgField::Name: - lcArg->mName = buf; + lcArg->name_ = buf; break; case ArgField::TypeName: - lcArg->mTypeName = buf; + lcArg->typeName_ = buf; break; case ArgField::Size: - lcArg->mSize = atoi(buf.c_str()); + lcArg->size_ = atoi(buf.c_str()); break; case ArgField::Offset: - lcArg->mOffset = atoi(buf.c_str()); + lcArg->offset_ = atoi(buf.c_str()); break; case ArgField::ValueKind: { @@ -349,7 +390,25 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, if (itValueKind == ArgValueKindV3.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mValueKind = itValueKind->second; + lcArg->info_.oclObject_ = itValueKind->second; + switch (lcArg->info_.oclObject_) { + case amd::KernelParameterDescriptor::MemoryObject: + if (itValueKind->first.compare("dynamic_shared_pointer") == 0) { + lcArg->info_.shared_ = true; + } + break; + case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: + case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: + case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: + case amd::KernelParameterDescriptor::HiddenPrintfBuffer: + case amd::KernelParameterDescriptor::HiddenHostcallBuffer: + case amd::KernelParameterDescriptor::HiddenDefaultQueue: + case amd::KernelParameterDescriptor::HiddenCompletionAction: + case amd::KernelParameterDescriptor::HiddenMultiGridSync: + case amd::KernelParameterDescriptor::HiddenNone: + lcArg->info_.hidden_ = true; + break; + } } break; case ArgField::ValueType: @@ -357,12 +416,12 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, auto itValueType = ArgValueTypeV3.find(buf); if 
(itValueType == ArgValueTypeV3.end()) { return AMD_COMGR_STATUS_ERROR; - } - lcArg->mValueType = itValueType->second; + } + lcArg->type_ = UpdateArgType(itValueType->second.first, itValueType->second.second); } break; case ArgField::PointeeAlign: - lcArg->mPointeeAlign = atoi(buf.c_str()); + lcArg->info_.arrayIndex_ = atoi(buf.c_str()); break; case ArgField::AddrSpaceQual: { @@ -370,7 +429,7 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mAddrSpaceQual = itAddrSpaceQual->second; + lcArg->addressQualifier_ = itAddrSpaceQual->second; } break; case ArgField::AccQual: @@ -379,7 +438,9 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, if (itAccQual == ArgAccQualV3.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mAccQual = itAccQual->second; + lcArg->accessQualifier_ = itAccQual->second; + lcArg->info_.readOnly_ = + (lcArg->accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; } break; case ArgField::ActualAccQual: @@ -388,20 +449,20 @@ static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, if (itAccQual == ArgAccQualV3.end()) { return AMD_COMGR_STATUS_ERROR; } - lcArg->mActualAccQual = itAccQual->second; + //lcArg->mActualAccQual = itAccQual->second; } break; case ArgField::IsConst: - lcArg->mIsConst = (buf.compare("1") == 0); + lcArg->typeQualifier_ |= (buf.compare("1") == 0) ? CL_KERNEL_ARG_TYPE_CONST : 0; break; case ArgField::IsRestrict: - lcArg->mIsRestrict = (buf.compare("1") == 0); + lcArg->typeQualifier_ |= (buf.compare("1") == 0) ? CL_KERNEL_ARG_TYPE_RESTRICT : 0; break; case ArgField::IsVolatile: - lcArg->mIsVolatile = (buf.compare("1") == 0); + lcArg->typeQualifier_ |= (buf.compare("1") == 0) ? CL_KERNEL_ARG_TYPE_VOLATILE : 0; break; case ArgField::IsPipe: - lcArg->mIsPipe = (buf.compare("1") == 0); + lcArg->typeQualifier_ |= (buf.compare("1") == 0) ? 
CL_KERNEL_ARG_TYPE_PIPE : 0; break; default: return AMD_COMGR_STATUS_ERROR; @@ -718,53 +779,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, } } } -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) { - switch (lcArg.mValueKind) { - case ValueKind::GlobalBuffer: - case ValueKind::DynamicSharedPointer: - case ValueKind::Pipe: - return amd::KernelParameterDescriptor::MemoryObject; - case ValueKind::ByValue: - return amd::KernelParameterDescriptor::ValueObject; - case ValueKind::Image: - return amd::KernelParameterDescriptor::ImageObject; - case ValueKind::Sampler: - return amd::KernelParameterDescriptor::SamplerObject; - case ValueKind::Queue: - return amd::KernelParameterDescriptor::QueueObject; - case ValueKind::HiddenGlobalOffsetX: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; - case ValueKind::HiddenGlobalOffsetY: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; - case ValueKind::HiddenGlobalOffsetZ: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; - case ValueKind::HiddenPrintfBuffer: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenPrintfBuffer; - case ValueKind::HiddenHostcallBuffer: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenHostcallBuffer; - case ValueKind::HiddenDefaultQueue: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenDefaultQueue; - case ValueKind::HiddenCompletionAction: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenCompletionAction; - case ValueKind::HiddenMultiGridSyncArg: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenMultiGridSync; - case ValueKind::HiddenNone: - default: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenNone; - 
} -} -#endif + // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) { @@ -813,95 +828,6 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is } #endif -// ================================================================================================ -static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, -}; - -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) { - uint sizeType; - uint numElements; - - if (lcArg.mValueKind != ValueKind::ByValue) { - switch (lcArg.mValueKind) { - case ValueKind::GlobalBuffer: - case ValueKind::DynamicSharedPointer: - case ValueKind::Pipe: - case ValueKind::Image: - return T_POINTER; - case ValueKind::Sampler: - return T_SAMPLER; - case ValueKind::Queue: - return T_QUEUE; - default: - return T_VOID; - } - } - else { - switch (lcArg.mValueType) { - case ValueType::I8: - case ValueType::U8: - sizeType = 0; - numElements = size; - break; - case ValueType::I16: - case ValueType::U16: - sizeType = 1; - numElements = size / 2; - break; - case ValueType::I32: - case ValueType::U32: - sizeType = 2; - numElements = size / 4; - break; - case ValueType::I64: - case ValueType::U64: - sizeType = 3; - numElements = size / 8; - break; - case ValueType::F16: - sizeType = 4; - numElements = size / 2; - break; - case 
ValueType::F32: - sizeType = 4; - numElements = size / 4; - break; - case ValueType::F64: - sizeType = 5; - numElements = size / 8; - break; - case ValueType::Struct: - default: - return T_VOID; - } - switch (numElements) { - case 1: - return ClkValueMapType[sizeType][0]; - case 2: - return ClkValueMapType[sizeType][1]; - case 3: - return ClkValueMapType[sizeType][2]; - case 4: - return ClkValueMapType[sizeType][3]; - case 8: - return ClkValueMapType[sizeType][4]; - case 16: - return ClkValueMapType[sizeType][5]; - default: - return T_VOID; - } - } - return T_VOID; -} -#endif // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) { @@ -980,13 +906,6 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; } - -static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; } -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { @@ -1026,21 +945,6 @@ static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - uint32_t align = lcArg.mPointeeAlign; - if (align == 0) { - LogWarning("Missing DynamicSharedPointer alignment"); - align = 128; /* worst case alignment */ - } - return align; - } - 
return 1; -} -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { @@ -1051,23 +955,6 @@ static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) { - if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return true; - case AccessQualifier::WriteOnly: - case AccessQualifier::ReadWrite: - default: - return false; - } - } - return false; -} -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { @@ -1081,11 +968,6 @@ static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; } -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) inline static int GetArgSizeOCL(const aclArgData* argInfo) { @@ -1124,31 +1006,6 @@ inline static int GetArgSizeOCL(const aclArgData* argInfo) { } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - return CL_KERNEL_ARG_ADDRESS_LOCAL; - } - else if (lcArg.mValueKind == 
ValueKind::GlobalBuffer) { - if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global || - lcArg.mAddrSpaceQual == AddressSpaceQualifier::Generic) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - } - LogError("Unsupported address type"); - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Pipe) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - // default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) { @@ -1180,24 +1037,6 @@ static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData } #endif -// ================================================================================================ -#if defined(USE_COMGR_LIBRARY) -static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::Image) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case AccessQualifier::WriteOnly: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case AccessQualifier::ReadWrite: - default: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) { @@ -1215,30 +1054,6 @@ static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgDat } #endif -// ================================================================================================ -#if 
defined(USE_COMGR_LIBRARY) -static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) { - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - if (lcArg.mIsVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (lcArg.mIsRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (lcArg.mIsConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - } - else if (lcArg.mIsPipe) { - assert(lcArg.mValueKind == ValueKind::Pipe); - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - return rv; -} -#endif - // ================================================================================================ #if defined(WITH_COMPILER_LIB) static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) { @@ -1444,7 +1259,6 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) { // Iterate through the arguments and insert into parameterList device::Kernel::parameters_t params; device::Kernel::parameters_t hiddenParams; - amd::KernelParameterDescriptor desc; size_t offset = 0; amd_comgr_metadata_node_t argsMeta; @@ -1462,7 +1276,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) { } for (size_t i = 0; i < argsSize; ++i) { - KernelArgMD lcArg; + amd::KernelParameterDescriptor desc = {}; amd_comgr_metadata_node_t argsNode; amd_comgr_metadata_kind_t kind; @@ -1478,7 +1292,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) { status = AMD_COMGR_STATUS_ERROR; } if (status == AMD_COMGR_STATUS_SUCCESS) { - void *data = static_cast(&lcArg); + void *data = static_cast(&desc); if (codeObjectVer() == 2) { status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data); } @@ -1498,50 +1312,72 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) { return; } - size_t size = GetArgSizeOCL(lcArg); - size_t alignment = (codeObjectVer() == 2) ? 
GetArgAlignmentOCL(lcArg) : 0; - bool isHidden = false; - desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden); + // COMGR has unclear/undefined order of the fields filling. + // Correct the types for the abstraction layer after all fields are available + if (desc.info_.oclObject_ != amd::KernelParameterDescriptor::ValueObject) { + switch (desc.info_.oclObject_) { + case amd::KernelParameterDescriptor::MemoryObject: + case amd::KernelParameterDescriptor::ImageObject: + desc.type_ = T_POINTER; + if (desc.info_.shared_) { + if (desc.info_.arrayIndex_ == 0) { + LogWarning("Missing DynamicSharedPointer alignment"); + desc.info_.arrayIndex_ = 128; /* worst case alignment */ + } + } else { + desc.info_.arrayIndex_ = 1; + } + break; + case amd::KernelParameterDescriptor::SamplerObject: + desc.type_ = T_SAMPLER; + desc.addressQualifier_ = CL_KERNEL_ARG_ADDRESS_PRIVATE; + break; + case amd::KernelParameterDescriptor::QueueObject: + desc.type_ = T_QUEUE; + break; + default: + desc.type_ = T_VOID; + break; + } + } + + // LC doesn't report correct address qualifier for images and pipes, + // hence overwrite it + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.typeQualifier_ & CL_KERNEL_ARG_TYPE_PIPE)) { + desc.addressQualifier_ = CL_KERNEL_ARG_ADDRESS_GLOBAL; + + } + size_t size = desc.size_; // Allocate the hidden arguments, but abstraction layer will skip them - if (isHidden) { + if (desc.info_.hidden_) { if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) { setDynamicParallelFlag(true); } - offset = (codeObjectVer() == 2) ?
amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg); - desc.offset_ = offset; - desc.size_ = size; - offset += size; + if (codeObjectVer() == 2) { + desc.offset_ = amd::alignUp(offset, desc.alignment_); + offset += size; + } hiddenParams.push_back(desc); continue; } - - desc.name_ = lcArg.mName.c_str(); - desc.type_ = GetOclTypeOCL(lcArg, size); - desc.typeName_ = lcArg.mTypeName.c_str(); - - desc.addressQualifier_ = GetOclAddrQualOCL(lcArg); - desc.accessQualifier_ = GetOclAccessQualOCL(lcArg); - desc.typeQualifier_ = GetOclTypeQualOCL(lcArg); - desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg); - desc.size_ = size; - + // These objects have forced data size to uint64_t - if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { - offset = amd::alignUp(offset, sizeof(uint64_t)); - desc.offset_ = offset; - offset += sizeof(uint64_t); + if (codeObjectVer() == 2) { + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } + else { + offset = amd::alignUp(offset, desc.alignment_); + desc.offset_ = offset; + offset += size; + } } - else { - offset = (codeObjectVer() == 2) ? 
amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg); - desc.offset_ = offset; - offset += size; - } - - // Update read only flag - desc.info_.readOnly_ = GetReadOnlyOCL(lcArg); params.push_back(desc); diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp index c9139762d7..bb8ed716aa 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.hpp +++ b/projects/clr/rocclr/runtime/device/devkernel.hpp @@ -9,6 +9,60 @@ #include "platform/memory.hpp" #include "devwavelimiter.hpp" +namespace amd { +class Device; +class KernelSignature; +class NDRange; + +struct KernelParameterDescriptor { + enum { + Value = 0, + HiddenNone = 1, + HiddenGlobalOffsetX = 2, + HiddenGlobalOffsetY = 3, + HiddenGlobalOffsetZ = 4, + HiddenPrintfBuffer = 5, + HiddenDefaultQueue = 6, + HiddenCompletionAction = 7, + MemoryObject = 8, + ReferenceObject = 9, + ValueObject = 10, + ImageObject = 11, + SamplerObject = 12, + QueueObject = 13, + HiddenMultiGridSync = 14, + HiddenHostcallBuffer = 15, + }; + clk_value_type_t type_; //!< The parameter's type + size_t offset_; //!< Its offset in the parameter's stack + size_t size_; //!< Its size in bytes + union InfoData { + struct { + uint32_t oclObject_ : 4; //!< OCL object type + uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only + uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA + uint32_t defined_ : 1; //!< The argument was defined by the app + uint32_t hidden_ : 1; //!< It's a hidden argument + uint32_t shared_ : 1; //!< Dynamic shared memory + uint32_t reserved_ : 3; //!< Reserved + uint32_t arrayIndex_ : 20; //!< Index in the objects array or LDS alignment + }; + uint32_t allValues_; + InfoData() : allValues_(0) {} + } info_; + + cl_kernel_arg_address_qualifier addressQualifier_ = + CL_KERNEL_ARG_ADDRESS_PRIVATE; //!< Argument's address qualifier + cl_kernel_arg_access_qualifier accessQualifier_ = + CL_KERNEL_ARG_ACCESS_NONE; //!< Argument's access 
qualifier + cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier + + std::string name_; //!< The parameter's name in the source + std::string typeName_; //!< Argument's type name + uint32_t alignment_; //!< Argument's alignment +}; +} + #if defined(USE_COMGR_LIBRARY) namespace llvm { namespace AMDGPU { @@ -27,12 +81,6 @@ struct RuntimeHandle { #include "amd_comgr.h" #include "llvm/Support/AMDGPUMetadata.h" -typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; - -using llvm::AMDGPU::HSAMD::AccessQualifier; -using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; -using llvm::AMDGPU::HSAMD::ValueKind; -using llvm::AMDGPU::HSAMD::ValueType; // for Code Object V3 enum class ArgField : uint8_t { @@ -76,7 +124,7 @@ enum class CodePropField : uint8_t { }; -static const std::map ArgFieldMap = +static const std::map ArgFieldMap = { {"Name", ArgField::Name}, {"TypeName", ArgField::TypeName}, @@ -94,58 +142,54 @@ static const std::map ArgFieldMap = {"IsPipe", ArgField::IsPipe} }; -static const std::map ArgValueKind = -{ - {"ByValue", ValueKind::ByValue}, - {"GlobalBuffer", ValueKind::GlobalBuffer}, - {"DynamicSharedPointer", ValueKind::DynamicSharedPointer}, - {"Sampler", ValueKind::Sampler}, - {"Image", ValueKind::Image}, - {"Pipe", ValueKind::Pipe}, - {"Queue", ValueKind::Queue}, - {"HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX}, - {"HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY}, - {"HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ}, - {"HiddenNone", ValueKind::HiddenNone}, - {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, - {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, - {"HiddenCompletionAction", ValueKind::HiddenCompletionAction}, - {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}, - {"HiddenHostcallBuffer", ValueKind::HiddenHostcallBuffer}, +static const std::map ArgValueKind = { + {"ByValue", amd::KernelParameterDescriptor::ValueObject}, + {"GlobalBuffer", 
amd::KernelParameterDescriptor::MemoryObject}, + {"DynamicSharedPointer", amd::KernelParameterDescriptor::MemoryObject}, + {"Sampler", amd::KernelParameterDescriptor::SamplerObject}, + {"Image", amd::KernelParameterDescriptor::ImageObject }, + {"Pipe", amd::KernelParameterDescriptor::MemoryObject}, + {"Queue", amd::KernelParameterDescriptor::QueueObject}, + {"HiddenGlobalOffsetX", amd::KernelParameterDescriptor::HiddenGlobalOffsetX}, + {"HiddenGlobalOffsetY", amd::KernelParameterDescriptor::HiddenGlobalOffsetY}, + {"HiddenGlobalOffsetZ", amd::KernelParameterDescriptor::HiddenGlobalOffsetZ}, + {"HiddenNone", amd::KernelParameterDescriptor::HiddenNone}, + {"HiddenPrintfBuffer", amd::KernelParameterDescriptor::HiddenPrintfBuffer}, + {"HiddenDefaultQueue", amd::KernelParameterDescriptor::HiddenDefaultQueue}, + {"HiddenCompletionAction", amd::KernelParameterDescriptor::HiddenCompletionAction}, + {"HiddenMultigridSyncArg", amd::KernelParameterDescriptor::HiddenMultiGridSync}, + {"HiddenHostcallBuffer", amd::KernelParameterDescriptor::HiddenHostcallBuffer} }; -static const std::map ArgValueType = -{ - {"Struct", ValueType::Struct}, - {"I8", ValueType::I8}, - {"U8", ValueType::U8}, - {"I16", ValueType::I16}, - {"U16", ValueType::U16}, - {"F16", ValueType::F16}, - {"I32", ValueType::I32}, - {"U32", ValueType::U32}, - {"F32", ValueType::F32}, - {"I64", ValueType::I64}, - {"U64", ValueType::U64}, - {"F64", ValueType::F64} +static const std::map> ArgValueType = { + {"Struct", {0, 0}}, + {"I8", {0, 1}}, + {"U8", {0, 1}}, + {"I16", {1, 2}}, + {"U16", {1, 2}}, + {"F16", {4, 2}}, + {"I32", {2, 4}}, + {"U32", {2, 4}}, + {"F32", {4, 4}}, + {"I64", {3, 8}}, + {"U64", {3, 8}}, + {"F64", {5, 8}} }; -static const std::map ArgAccQual = -{ - {"Default", AccessQualifier::Default}, - {"ReadOnly", AccessQualifier::ReadOnly}, - {"WriteOnly", AccessQualifier::WriteOnly}, - {"ReadWrite", AccessQualifier::ReadWrite} +static const std::map ArgAccQual = { + {"Default", CL_KERNEL_ARG_ACCESS_NONE}, 
+ {"ReadOnly", CL_KERNEL_ARG_ACCESS_READ_ONLY}, + {"WriteOnly", CL_KERNEL_ARG_ACCESS_WRITE_ONLY}, + {"ReadWrite", CL_KERNEL_ARG_ACCESS_READ_WRITE} }; -static const std::map ArgAddrSpaceQual = -{ - {"Private", AddressSpaceQualifier::Private}, - {"Global", AddressSpaceQualifier::Global}, - {"Constant", AddressSpaceQualifier::Constant}, - {"Local", AddressSpaceQualifier::Local}, - {"Generic", AddressSpaceQualifier::Generic}, - {"Region", AddressSpaceQualifier::Region} +static const std::map ArgAddrSpaceQual = { + {"Private", CL_KERNEL_ARG_ADDRESS_PRIVATE}, + {"Global", CL_KERNEL_ARG_ADDRESS_GLOBAL}, + {"Constant", CL_KERNEL_ARG_ADDRESS_CONSTANT}, + {"Local", CL_KERNEL_ARG_ADDRESS_LOCAL}, + {"Generic", CL_KERNEL_ARG_ADDRESS_GLOBAL}, + {"Region", CL_KERNEL_ARG_ADDRESS_PRIVATE} }; static const std::map AttrFieldMap = @@ -209,58 +253,54 @@ static const std::map ArgFieldMapV3 = {".is_pipe", ArgField::IsPipe} }; -static const std::map ArgValueKindV3 = -{ - {"by_value", ValueKind::ByValue}, - {"global_buffer", ValueKind::GlobalBuffer}, - {"dynamic_shared_pointer", ValueKind::DynamicSharedPointer}, - {"sampler", ValueKind::Sampler}, - {"image", ValueKind::Image}, - {"pipe", ValueKind::Pipe}, - {"queue", ValueKind::Queue}, - {"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX}, - {"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY}, - {"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ}, - {"hidden_none", ValueKind::HiddenNone}, - {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer}, - {"hidden_default_queue", ValueKind::HiddenDefaultQueue}, - {"hidden_completion_action", ValueKind::HiddenCompletionAction}, - {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}, - {"hidden_hostcall_buffer", ValueKind::HiddenHostcallBuffer}, +static const std::map ArgValueKindV3 = { + {"by_value", amd::KernelParameterDescriptor::ValueObject}, + {"global_buffer", amd::KernelParameterDescriptor::MemoryObject}, + {"dynamic_shared_pointer", 
amd::KernelParameterDescriptor::MemoryObject}, + {"sampler", amd::KernelParameterDescriptor::SamplerObject}, + {"image", amd::KernelParameterDescriptor::ImageObject }, + {"pipe", amd::KernelParameterDescriptor::MemoryObject}, + {"queue", amd::KernelParameterDescriptor::QueueObject}, + {"hidden_global_offset_x", amd::KernelParameterDescriptor::HiddenGlobalOffsetX}, + {"hidden_global_offset_y", amd::KernelParameterDescriptor::HiddenGlobalOffsetY}, + {"hidden_global_offset_z", amd::KernelParameterDescriptor::HiddenGlobalOffsetZ}, + {"hidden_none", amd::KernelParameterDescriptor::HiddenNone}, + {"hidden_printf_buffer", amd::KernelParameterDescriptor::HiddenPrintfBuffer}, + {"hidden_default_queue", amd::KernelParameterDescriptor::HiddenDefaultQueue}, + {"hidden_completion_action", amd::KernelParameterDescriptor::HiddenCompletionAction}, + {"hidden_multigrid_sync_arg", amd::KernelParameterDescriptor::HiddenMultiGridSync}, + {"hidden_hostcall_buffer", amd::KernelParameterDescriptor::HiddenHostcallBuffer} }; -static const std::map ArgValueTypeV3 = -{ - {"struct", ValueType::Struct}, - {"i8", ValueType::I8}, - {"u8", ValueType::U8}, - {"i16", ValueType::I16}, - {"u16", ValueType::U16}, - {"f16", ValueType::F16}, - {"i32", ValueType::I32}, - {"u32", ValueType::U32}, - {"f32", ValueType::F32}, - {"i64", ValueType::I64}, - {"u64", ValueType::U64}, - {"f64", ValueType::F64} +static const std::map> ArgValueTypeV3 = { + {"struct", {0, 0}}, + {"i8", {0, 1}}, + {"u8", {0, 1}}, + {"i16", {1, 2}}, + {"u16", {1, 2}}, + {"f16", {4, 2}}, + {"i32", {2, 4}}, + {"u32", {2, 4}}, + {"f32", {4, 4}}, + {"i64", {3, 8}}, + {"u64", {3, 8}}, + {"f64", {5, 8}} }; -static const std::map ArgAccQualV3 = -{ - {"default", AccessQualifier::Default}, - {"read_only", AccessQualifier::ReadOnly}, - {"write_only", AccessQualifier::WriteOnly}, - {"read_write", AccessQualifier::ReadWrite} +static const std::map ArgAccQualV3 = { + {"default", CL_KERNEL_ARG_ACCESS_NONE}, + {"read_only", 
CL_KERNEL_ARG_ACCESS_READ_ONLY}, + {"write_only", CL_KERNEL_ARG_ACCESS_WRITE_ONLY}, + {"read_write", CL_KERNEL_ARG_ACCESS_READ_WRITE} }; -static const std::map ArgAddrSpaceQualV3 = -{ - {"private", AddressSpaceQualifier::Private}, - {"global", AddressSpaceQualifier::Global}, - {"constant", AddressSpaceQualifier::Constant}, - {"local", AddressSpaceQualifier::Local}, - {"generic", AddressSpaceQualifier::Generic}, - {"region", AddressSpaceQualifier::Region} +static const std::map ArgAddrSpaceQualV3 = { + {"private", CL_KERNEL_ARG_ADDRESS_PRIVATE}, + {"global", CL_KERNEL_ARG_ADDRESS_GLOBAL}, + {"constant", CL_KERNEL_ARG_ADDRESS_CONSTANT}, + {"local", CL_KERNEL_ARG_ADDRESS_LOCAL}, + {"generic", CL_KERNEL_ARG_ADDRESS_GLOBAL}, + {"region", CL_KERNEL_ARG_ADDRESS_PRIVATE} }; static const std::map KernelFieldMapV3 = @@ -282,7 +322,6 @@ static const std::map KernelFieldMapV3 = {".vgpr_spill_count", KernelField::NumSpilledVGPRs} }; - #endif // defined(USE_COMGR_LIBRARY) namespace amd { @@ -298,57 +337,6 @@ namespace amd { } // hsa } // amd -namespace amd { - -class Device; -class KernelSignature; -class NDRange; - -struct KernelParameterDescriptor { - enum { - Value = 0, - HiddenNone = 1, - HiddenGlobalOffsetX = 2, - HiddenGlobalOffsetY = 3, - HiddenGlobalOffsetZ = 4, - HiddenPrintfBuffer = 5, - HiddenDefaultQueue = 6, - HiddenCompletionAction = 7, - MemoryObject = 8, - ReferenceObject = 9, - ValueObject = 10, - ImageObject = 11, - SamplerObject = 12, - QueueObject = 13, - HiddenMultiGridSync = 14, - HiddenHostcallBuffer = 15, - }; - clk_value_type_t type_; //!< The parameter's type - size_t offset_; //!< Its offset in the parameter's stack - size_t size_; //!< Its size in bytes - union InfoData { - struct { - uint32_t oclObject_ : 4; //!< OCL object type - uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only - uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA - uint32_t defined_ : 1; //!< The argument was defined by the app - uint32_t 
reserved_ : 1; //!< reserved - uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment - }; - uint32_t allValues_; - InfoData() : allValues_(0) {} - } info_; - - cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier - cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier - cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier - - std::string name_; //!< The parameter's name in the source - std::string typeName_; //!< Argument's type name -}; - -} - namespace device { class Program; diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp index 7a04966cff..645c84b31c 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.cpp +++ b/projects/clr/rocclr/runtime/device/devprogram.cpp @@ -28,12 +28,6 @@ #include "spirv/spirvUtils.h" #include "acl.h" -#if defined(USE_COMGR_LIBRARY) -#include "llvm/Support/AMDGPUMetadata.h" - -typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; -#endif // defined(USE_COMGR_LIBRARY) - #ifdef EARLY_INLINE #define AMDGPU_EARLY_INLINE_ALL_OPTION " -mllvm -amdgpu-early-inline-all" #else diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index c32a692bd3..4a2436dad8 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -18,7 +18,6 @@ #include "hsa_ext_image.h" #include "amd_hsa_loader.hpp" #if defined(USE_COMGR_LIBRARY) -#include "llvm/Support/AMDGPUMetadata.h" #include "gelf.h" #endif // defined(USE_COMGR_LIBRARY)