From d1f62b92c100942a7d7222c08ca5abfa724f49e9 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 9 Apr 2019 23:24:10 -0400 Subject: [PATCH] P4 to Git Change 1767752 by wchau@wchau_OCL_Linux on 2019/04/09 22:58:03 SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata - Add support for the V3 code objects Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#336 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#134 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/inc/core/palCmdBuffer.h#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx6/gfx6ComputeCmdBuffer.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx9/gfx9ComputeCmdBuffer.cpp#69 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#77 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#130 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#52 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#103 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#47 edit [ROCm/clr commit: 36a5f2a85f390c8e9ea34e5516584fc1afb3cbe8] --- .../clr/rocclr/runtime/device/devkernel.cpp | 568 +++++++++++++++++- .../clr/rocclr/runtime/device/devkernel.hpp | 394 ++++-------- .../clr/rocclr/runtime/device/devprogram.cpp | 30 +- .../clr/rocclr/runtime/device/devprogram.hpp | 3 + .../rocclr/runtime/device/gpu/gpukernel.cpp | 6 +- .../rocclr/runtime/device/gpu/gpukernel.hpp | 4 +- .../rocclr/runtime/device/pal/palkernel.cpp | 144 +++-- .../rocclr/runtime/device/pal/palkernel.hpp | 25 +- .../rocclr/runtime/device/pal/palprogram.cpp | 21 +- .../rocclr/runtime/device/pal/palsettings.cpp | 30 +- .../rocclr/runtime/device/pal/palsettings.hpp | 3 +- .../rocclr/runtime/device/pal/palvirtual.cpp | 7 + .../rocclr/runtime/device/rocm/rockernel.cpp | 101 ++-- .../rocclr/runtime/device/rocm/rockernel.hpp | 18 +- .../rocclr/runtime/device/rocm/rocprogram.cpp | 14 + 15 files changed, 951 insertions(+), 417 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp index 661c8d1127..761610a12e 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.cpp +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -29,10 +29,487 @@ using llvm::AMDGPU::HSAMD::ValueType; namespace device { +#if defined(USE_COMGR_LIBRARY) +static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + std::string buf; + + // get the key of the argument field + size_t size = 0; + status = amd::Comgr::get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itArgField = ArgFieldMap.find(buf); + if (itArgField == ArgFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + // get the value of the argument field + status = getMetaBuf(value, &buf); + + KernelArgMD* lcArg = static_cast(data); + + switch (itArgField->second) { + case ArgField::Name: + lcArg->mName = buf; + break; + case ArgField::TypeName: + lcArg->mTypeName = buf; + break; + case ArgField::Size: + lcArg->mSize = atoi(buf.c_str()); + break; + case ArgField::Align: + lcArg->mAlign = atoi(buf.c_str()); + break; + case ArgField::ValueKind: + { + auto itValueKind = ArgValueKind.find(buf); + if (itValueKind == ArgValueKind.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueKind = itValueKind->second; + } + break; + case ArgField::ValueType: + { + auto itValueType = ArgValueType.find(buf); + if (itValueType == ArgValueType.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueType = itValueType->second; + } + break; + case ArgField::PointeeAlign: + lcArg->mPointeeAlign = atoi(buf.c_str()); + break; + case ArgField::AddrSpaceQual: + { + auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf); + if (itAddrSpaceQual == ArgAddrSpaceQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAddrSpaceQual = itAddrSpaceQual->second; + } + break; + case ArgField::AccQual: + { + auto itAccQual = ArgAccQual.find(buf); + if (itAccQual == ArgAccQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAccQual = itAccQual->second; + } + break; + case ArgField::ActualAccQual: + { + auto itAccQual = ArgAccQual.find(buf); + if (itAccQual == ArgAccQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mActualAccQual = itAccQual->second; + } + break; + case ArgField::IsConst: + lcArg->mIsConst = (buf.compare("true") == 0); + break; + case ArgField::IsRestrict: + lcArg->mIsRestrict = (buf.compare("true") == 0); + break; + case ArgField::IsVolatile: + lcArg->mIsVolatile = (buf.compare("true") == 0); + break; + case ArgField::IsPipe: + lcArg->mIsPipe = (buf.compare("true") == 0); + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + return AMD_COMGR_STATUS_SUCCESS; +} + +static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + size_t size = 0; + std::string buf; + + // get the key of the argument field + status = amd::Comgr::get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itAttrField = AttrFieldMap.find(buf); + if (itAttrField == AttrFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + KernelMD* kernelMD = static_cast(data); + switch (itAttrField->second) { + case AttrField::ReqdWorkGroupSize: + { + status = amd::Comgr::get_metadata_list_size(value, &size); + if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSize; + status = amd::Comgr::index_list_metadata(value, i, &workgroupSize); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str())); + } + amd::Comgr::destroy_metadata(workgroupSize); + } + } + } + break; + case AttrField::WorkGroupSizeHint: + { + status = amd::Comgr::get_metadata_list_size(value, &size); + if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSizeHint; + status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str())); + } + amd::Comgr::destroy_metadata(workgroupSizeHint); + } + } + } + break; + case AttrField::VecTypeHint: + { + if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mVecTypeHint = buf; + } + } + break; + case AttrField::RuntimeHandle: + { + if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mRuntimeHandle = buf; + } + } + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + + return status; +} + +static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + std::string buf; + + // get the key of the argument field + status = amd::Comgr::get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itCodePropField = CodePropFieldMap.find(buf); + if (itCodePropField == CodePropFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + // get the value of the argument field + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(value, &buf); + } + + KernelMD* kernelMD = static_cast(data); + switch (itCodePropField->second) { + case CodePropField::KernargSegmentSize: + kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); + break; + case CodePropField::GroupSegmentFixedSize: + kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str()); + break; + case CodePropField::PrivateSegmentFixedSize: + kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str()); + break; + case CodePropField::KernargSegmentAlign: + kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str()); + break; + case CodePropField::WavefrontSize: + kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str()); + break; + case CodePropField::NumSGPRs: + kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str()); + break; + case CodePropField::NumVGPRs: + kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str()); + break; + case CodePropField::MaxFlatWorkGroupSize: + kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str()); + break; + case CodePropField::IsDynamicCallStack: + kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0); + break; + case CodePropField::IsXNACKEnabled: + kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0); + break; + case CodePropField::NumSpilledSGPRs: + kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str()); + break; + case CodePropField::NumSpilledVGPRs: + kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str()); + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + return AMD_COMGR_STATUS_SUCCESS; +} + +static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + std::string buf; + + // get the key of the argument field + size_t size = 0; + status = amd::Comgr::get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itArgField = ArgFieldMapV3.find(buf); + if (itArgField == ArgFieldMapV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + // get the value of the argument field + status = getMetaBuf(value, &buf); + + KernelArgMD* lcArg = static_cast(data); + + switch (itArgField->second) { + case ArgField::Name: + lcArg->mName = buf; + break; + case ArgField::TypeName: + lcArg->mTypeName = buf; + break; + case ArgField::Size: + lcArg->mSize = atoi(buf.c_str()); + break; + case ArgField::Offset: + lcArg->mOffset = atoi(buf.c_str()); + break; + case ArgField::ValueKind: + { + auto itValueKind = ArgValueKindV3.find(buf); + if (itValueKind == ArgValueKindV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueKind = itValueKind->second; + } + break; + case ArgField::ValueType: + { + auto itValueType = ArgValueTypeV3.find(buf); + if (itValueType == ArgValueTypeV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueType = itValueType->second; + } + break; + case ArgField::PointeeAlign: + lcArg->mPointeeAlign = atoi(buf.c_str()); + break; + case ArgField::AddrSpaceQual: + { + auto itAddrSpaceQual = ArgAddrSpaceQualV3.find(buf); + if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAddrSpaceQual = itAddrSpaceQual->second; + } + break; + case ArgField::AccQual: + { + auto itAccQual = ArgAccQualV3.find(buf); + if (itAccQual == ArgAccQualV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAccQual = itAccQual->second; + } + break; + case ArgField::ActualAccQual: + { + auto itAccQual = ArgAccQualV3.find(buf); + if (itAccQual == ArgAccQualV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mActualAccQual = itAccQual->second; + } + break; + case ArgField::IsConst: + lcArg->mIsConst = (buf.compare("true") == 0); + break; + case ArgField::IsRestrict: + lcArg->mIsRestrict = (buf.compare("true") == 0); + break; + case ArgField::IsVolatile: + lcArg->mIsVolatile = (buf.compare("true") == 0); + break; + case ArgField::IsPipe: + lcArg->mIsPipe = (buf.compare("true") == 0); + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + return AMD_COMGR_STATUS_SUCCESS; +} + +static amd_comgr_status_t populateKernelMetaV3(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + size_t size = 0; + std::string buf; + + // get the key of the argument field + status = amd::Comgr::get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itKernelField = KernelFieldMapV3.find(buf); + if (itKernelField == KernelFieldMapV3.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + if (itKernelField->second != KernelField::ReqdWorkGroupSize && + itKernelField->second != KernelField::WorkGroupSizeHint) { + status = getMetaBuf(value,&buf); + } + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + KernelMD* kernelMD = static_cast(data); + switch (itKernelField->second) { + case KernelField::ReqdWorkGroupSize: + status = amd::Comgr::get_metadata_list_size(value, &size); + if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSize; + status = amd::Comgr::index_list_metadata(value, i, &workgroupSize); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str())); + } + amd::Comgr::destroy_metadata(workgroupSize); + } + } + break; + case KernelField::WorkGroupSizeHint: + status = amd::Comgr::get_metadata_list_size(value, &size); + if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSizeHint; + status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str())); + } + amd::Comgr::destroy_metadata(workgroupSizeHint); + } + } + break; + case KernelField::VecTypeHint: + kernelMD->mAttrs.mVecTypeHint = buf; + break; + case KernelField::DeviceEnqueueSymbol: + kernelMD->mAttrs.mRuntimeHandle = buf; + break; + case KernelField::KernargSegmentSize: + kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); + break; + case KernelField::GroupSegmentFixedSize: + kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str()); + break; + case KernelField::PrivateSegmentFixedSize: + kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str()); + break; + case KernelField::KernargSegmentAlign: + kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str()); + break; + case KernelField::WavefrontSize: + kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str()); + break; + case KernelField::NumSGPRs: + kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str()); + break; + case KernelField::NumVGPRs: + kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str()); + break; + case KernelField::MaxFlatWorkGroupSize: + kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str()); + break; + case KernelField::NumSpilledSGPRs: + kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str()); + break; + case KernelField::NumSpilledVGPRs: + kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str()); + break; + case KernelField::SymbolName: + kernelMD->mSymbolName = buf; + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + + return status; +} +#endif + // ================================================================================================ -Kernel::Kernel(const amd::Device& dev, const std::string& name) +Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& prog) : dev_(dev) , name_(name) + , prog_(prog) , signature_(nullptr) , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) { // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); @@ -482,6 +959,8 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s // ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; } + static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; } #endif @@ -771,11 +1250,10 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a // ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) -bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode, - const uint32_t kernargSegmentByteSize, - KernelMD* kernelMD) { +bool Kernel::GetAttrCodePropMetadata( const amd_comgr_metadata_node_t kernelMetaNode, + KernelMD* kernelMD) { - InitParameters(kernelMetaNode, kernargSegmentByteSize); + InitParameters(kernelMetaNode); // Set the workgroup information for the kernel workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_; @@ -784,25 +1262,49 @@ bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaN workGroupInfo_.availableVGPRs_ = 256; // extract the attribute metadata if there is any - amd_comgr_metadata_node_t attrMeta; amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS; - if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::iterate_map_metadata(attrMeta, device::populateAttrs, - static_cast(kernelMD)); - amd::Comgr::destroy_metadata(attrMeta); + + switch (codeObjectVer()) { + case 2: { + amd_comgr_metadata_node_t symbolName; + status = amd::Comgr::metadata_lookup(kernelMetaNode, "SymbolName", &symbolName); + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(symbolName, &(kernelMD->mSymbolName)); + amd::Comgr::destroy_metadata(symbolName); + } + + amd_comgr_metadata_node_t attrMeta; + if (status == AMD_COMGR_STATUS_SUCCESS) { + if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == + AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::iterate_map_metadata(attrMeta, populateAttrs, + static_cast(kernelMD)); + amd::Comgr::destroy_metadata(attrMeta); + } + } + + // extract the code properties metadata + amd_comgr_metadata_node_t codePropsMeta; + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::iterate_map_metadata(codePropsMeta, populateCodeProps, + static_cast(kernelMD)); + amd::Comgr::destroy_metadata(codePropsMeta); + } + } + break; + case 3: { + status = amd::Comgr::iterate_map_metadata(kernelMetaNode, populateKernelMetaV3, + static_cast(kernelMD)); + } + break; + default: + return false; } - // extract the code properties metadata - amd_comgr_metadata_node_t codePropsMeta; - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta); - } - - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::iterate_map_metadata(codePropsMeta, device::populateCodeProps, - static_cast(kernelMD)); - amd::Comgr::destroy_metadata(codePropsMeta); - } if (status != AMD_COMGR_STATUS_SUCCESS) { return false; @@ -917,19 +1419,21 @@ bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD, return (status == AMD_COMGR_STATUS_SUCCESS); } -void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) { +void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) { // Iterate through the arguments and insert into parameterList device::Kernel::parameters_t params; device::Kernel::parameters_t hiddenParams; amd::KernelParameterDescriptor desc; size_t offset = 0; - size_t offsetStruct = argBufferSize; amd_comgr_metadata_node_t argsMeta; bool hsaArgsMeta = false; size_t argsSize; - amd_comgr_status_t status = amd::Comgr::metadata_lookup(kernelMD, "Args", &argsMeta); + amd_comgr_status_t status = amd::Comgr::metadata_lookup( + kernelMD, + (codeObjectVer() == 2) ? "Args" : ".args", + &argsMeta); if (status == AMD_COMGR_STATUS_SUCCESS) { hsaArgsMeta = true; status = amd::Comgr::get_metadata_list_size(argsMeta, &argsSize); @@ -940,7 +1444,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a } for (size_t i = 0; i < argsSize; ++i) { - KernelArgMD lcArg; + KernelArgMD lcArg; amd_comgr_metadata_node_t argsNode; amd_comgr_metadata_kind_t kind; @@ -956,7 +1460,13 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a status = AMD_COMGR_STATUS_ERROR; } if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, static_cast(&lcArg)); + void *data = static_cast(&lcArg); + if (codeObjectVer() == 2) { + status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data); + } + else if (codeObjectVer() == 3) { + status = amd::Comgr::iterate_map_metadata(argsNode, populateArgsV3, data); + } } if (hsaArgsNode) { @@ -971,7 +1481,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a } size_t size = GetArgSizeOCL(lcArg); - size_t alignment = GetArgAlignmentOCL(lcArg); + size_t alignment = (codeObjectVer() == 2) ? GetArgAlignmentOCL(lcArg) : 0; bool isHidden = false; desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden); @@ -980,7 +1490,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) { setDynamicParallelFlag(true); } - offset = amd::alignUp(offset, alignment); + offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg); desc.offset_ = offset; desc.size_ = size; offset += size; @@ -1007,7 +1517,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a offset += sizeof(uint64_t); } else { - offset = amd::alignUp(offset, alignment); + offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg); desc.offset_ = offset; offset += size; } diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp index 6d1f289460..2ca7af5395 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.hpp +++ b/projects/clr/rocclr/runtime/device/devkernel.hpp @@ -34,6 +34,7 @@ using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; using llvm::AMDGPU::HSAMD::ValueKind; using llvm::AMDGPU::HSAMD::ValueType; +// for Code Object V3 enum class ArgField : uint8_t { Name = 0, TypeName = 1, @@ -48,7 +49,8 @@ enum class ArgField : uint8_t { IsConst = 10, IsRestrict = 11, IsVolatile = 12, - IsPipe = 13 + IsPipe = 13, + Offset = 14 }; enum class AttrField : uint8_t { @@ -167,6 +169,116 @@ static const std::map CodePropFieldMap = {"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs}, {"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs} }; + +// for Code Object V3 +enum class KernelField : uint8_t { + SymbolName = 0, + ReqdWorkGroupSize = 1, + WorkGroupSizeHint = 2, + VecTypeHint = 3, + DeviceEnqueueSymbol = 4, + KernargSegmentSize = 5, + GroupSegmentFixedSize = 6, + PrivateSegmentFixedSize = 7, + KernargSegmentAlign = 8, + WavefrontSize = 9, + NumSGPRs = 10, + NumVGPRs = 11, + MaxFlatWorkGroupSize = 12, + NumSpilledSGPRs = 13, + NumSpilledVGPRs = 14 +}; + +static const std::map ArgFieldMapV3 = +{ + {".name", ArgField::Name}, + {".type_name", ArgField::TypeName}, + {".size", ArgField::Size}, + {".offset", ArgField::Offset}, + {".value_kind", ArgField::ValueKind}, + {".vaule_type", ArgField::ValueType}, + {".pointee_align", ArgField::PointeeAlign}, + {".address_space", ArgField::AddrSpaceQual}, + {".access", ArgField::AccQual}, + {".actual_access", ArgField::ActualAccQual}, + {".is_const", ArgField::IsConst}, + {".is_restrict", ArgField::IsRestrict}, + {".is_volatile", ArgField::IsVolatile}, + {".is_pipe", ArgField::IsPipe} +}; + +static const std::map ArgValueKindV3 = +{ + {"by_value", ValueKind::ByValue}, + {"global_buffer", ValueKind::GlobalBuffer}, + {"dynamic_shared_pointer", ValueKind::DynamicSharedPointer}, + {"sampler", ValueKind::Sampler}, + {"image", ValueKind::Image}, + {"pipe", ValueKind::Pipe}, + {"queue", ValueKind::Queue}, + {"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX}, + {"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY}, + {"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ}, + {"hidden_none", ValueKind::HiddenNone}, + {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer}, + {"hidden_default_queue", ValueKind::HiddenDefaultQueue}, + {"hidden_completion_action", ValueKind::HiddenCompletionAction} +}; + +static const std::map ArgValueTypeV3 = +{ + {"struct", ValueType::Struct}, + {"i8", ValueType::I8}, + {"u8", ValueType::U8}, + {"i16", ValueType::I16}, + {"u16", ValueType::U16}, + {"f16", ValueType::F16}, + {"i32", ValueType::I32}, + {"u32", ValueType::U32}, + {"f32", ValueType::F32}, + {"i64", ValueType::I64}, + {"u64", ValueType::U64}, + {"f64", ValueType::F64} +}; + +static const std::map ArgAccQualV3 = +{ + {"default", AccessQualifier::Default}, + {"read_only", AccessQualifier::ReadOnly}, + {"write_only", AccessQualifier::WriteOnly}, + {"read_write", AccessQualifier::ReadWrite} +}; + +static const std::map ArgAddrSpaceQualV3 = +{ + {"private", AddressSpaceQualifier::Private}, + {"global", AddressSpaceQualifier::Global}, + {"constant", AddressSpaceQualifier::Constant}, + {"local", AddressSpaceQualifier::Local}, + {"generic", AddressSpaceQualifier::Generic}, + {"region", AddressSpaceQualifier::Region} +}; + +static const std::map KernelFieldMapV3 = +{ + {".symbol", KernelField::SymbolName}, + {".reqd_workgroup_size", KernelField::ReqdWorkGroupSize}, + {".workgorup_size_hint", KernelField::WorkGroupSizeHint}, + {".vec_type_hint", KernelField::VecTypeHint}, + {".device_enqueue_symbol", KernelField::DeviceEnqueueSymbol}, + {".kernarg_segment_size", KernelField::KernargSegmentSize}, + {".group_segment_fixed_size", KernelField::GroupSegmentFixedSize}, + {".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize}, + {".kernarg_segment_align", KernelField::KernargSegmentAlign}, + {".wavefront_size", KernelField::WavefrontSize}, + {".sgpr_count", KernelField::NumSGPRs}, + {".vgpr_count", KernelField::NumVGPRs}, + {".max_flat_workgroup_size", KernelField::MaxFlatWorkGroupSize}, + {".sgpr_spill_count", KernelField::NumSpilledSGPRs}, + {".vgpr_spill_count", KernelField::NumSpilledVGPRs} +}; + + #endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) @@ -234,6 +346,8 @@ struct KernelParameterDescriptor { namespace device { +class Program; + //! Printf info structure struct PrintfInfo { std::string fmtString_; //!< formated string for printf @@ -272,7 +386,7 @@ class Kernel : public amd::HeapObject { }; //! Default constructor - Kernel(const amd::Device& dev, const std::string& name); + Kernel(const amd::Device& dev, const std::string& name, const Program& prog); //! Default destructor virtual ~Kernel(); @@ -372,7 +486,7 @@ class Kernel : public amd::HeapObject { //! Initializes the abstraction layer kernel parameters #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) - void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize); + void InitParameters(const amd_comgr_metadata_node_t kernelMD); //! Get ther kernel metadata bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD, @@ -381,7 +495,6 @@ class Kernel : public amd::HeapObject { //! Retrieve kernel attribute and code properties metadata bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode, - const uint32_t kernargSegmentByteSize, KernelMD* kernelMD); //! Retrieve the available SGPRs and VGPRs @@ -390,6 +503,12 @@ class Kernel : public amd::HeapObject { //! Retrieve the printf string metadata bool GetPrintfStr(const amd_comgr_metadata_node_t programMD, std::vector* printfStr); + + //! Returns the kernel symbol name + const std::string& symbolName() const { return symbolName_; } + + //! Returns the kernel code object version + const uint32_t codeObjectVer() const { return prog().codeObjectVer(); } #else void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize); #endif @@ -404,8 +523,13 @@ class Kernel : public amd::HeapObject { //! Initializes HSAIL Printf metadata and info void InitPrintf(const aclPrintfFmt* aclPrintf); #endif + //! Returns program associated with this kernel + const Program& prog() const { return prog_; } + const amd::Device& dev_; //!< GPU device object std::string name_; //!< kernel name + const Program& prog_; //!< Reference to the parent program + std::string symbolName_; //!< kernel symbol name WorkGroupInfo workGroupInfo_; //!< device kernel info structure amd::KernelSignature* signature_; //!< kernel signature std::string buildLog_; //!< build log @@ -424,6 +548,7 @@ class Kernel : public amd::HeapObject { Flags() : value_(0) {} } flags_; + private: //! Disable default copy constructor Kernel(const Kernel&); @@ -447,264 +572,5 @@ static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, return status; } - -static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, - const amd_comgr_metadata_node_t value, - void *data) { - amd_comgr_status_t status; - amd_comgr_metadata_kind_t kind; - std::string buf; - - // get the key of the argument field - size_t size = 0; - status = amd::Comgr::get_metadata_kind(key, &kind); - if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { - status = getMetaBuf(key, &buf); - } - - if (status != AMD_COMGR_STATUS_SUCCESS) { - return AMD_COMGR_STATUS_ERROR; - } - - auto itArgField = ArgFieldMap.find(buf); - if (itArgField == ArgFieldMap.end()) { - return AMD_COMGR_STATUS_ERROR; - } - - // get the value of the argument field - status = getMetaBuf(value, &buf); - - KernelArgMD* lcArg = static_cast(data); - - switch (itArgField->second) { - case ArgField::Name: - lcArg->mName = buf; - break; - case ArgField::TypeName: - lcArg->mTypeName = buf; - break; - case ArgField::Size: - lcArg->mSize = atoi(buf.c_str()); - break; - case ArgField::Align: - lcArg->mAlign = atoi(buf.c_str()); - break; - case ArgField::ValueKind: - { - auto itValueKind = ArgValueKind.find(buf); - if (itValueKind == ArgValueKind.end()) { - return AMD_COMGR_STATUS_ERROR; - } - lcArg->mValueKind = itValueKind->second; - } - break; - case ArgField::ValueType: - { - auto itValueType = ArgValueType.find(buf); - if (itValueType == ArgValueType.end()) { - return AMD_COMGR_STATUS_ERROR; - } - lcArg->mValueType = itValueType->second; - } - break; - case ArgField::PointeeAlign: - lcArg->mPointeeAlign = atoi(buf.c_str()); - break; - case ArgField::AddrSpaceQual: - { - auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf); - if (itAddrSpaceQual == ArgAddrSpaceQual.end()) { - return AMD_COMGR_STATUS_ERROR; - } - lcArg->mAddrSpaceQual = itAddrSpaceQual->second; - } - break; - case ArgField::AccQual: - { - auto itAccQual = ArgAccQual.find(buf); - if (itAccQual == ArgAccQual.end()) { - return AMD_COMGR_STATUS_ERROR; - } - lcArg->mAccQual = itAccQual->second; - } - break; - case ArgField::ActualAccQual: - { - auto itAccQual = ArgAccQual.find(buf); - if (itAccQual == ArgAccQual.end()) { - return AMD_COMGR_STATUS_ERROR; - } - lcArg->mActualAccQual = itAccQual->second; - } - break; - case ArgField::IsConst: - lcArg->mIsConst = (buf.compare("true") == 0); - break; - case ArgField::IsRestrict: - lcArg->mIsRestrict = (buf.compare("true") == 0); - break; - case ArgField::IsVolatile: - lcArg->mIsVolatile = (buf.compare("true") == 0); - break; - case ArgField::IsPipe: - lcArg->mIsPipe = (buf.compare("true") == 0); - break; - default: - return AMD_COMGR_STATUS_ERROR; - } - return AMD_COMGR_STATUS_SUCCESS; -} - -static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key, - const amd_comgr_metadata_node_t value, - void *data) { - amd_comgr_status_t status; - amd_comgr_metadata_kind_t kind; - size_t size = 0; - std::string buf; - - // get the key of the argument field - status = amd::Comgr::get_metadata_kind(key, &kind); - if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { - status = getMetaBuf(key, &buf); - } - - if (status != AMD_COMGR_STATUS_SUCCESS) { - return AMD_COMGR_STATUS_ERROR; - } - - auto itAttrField = AttrFieldMap.find(buf); - if (itAttrField == AttrFieldMap.end()) { - return AMD_COMGR_STATUS_ERROR; - } - - KernelMD* kernelMD = static_cast(data); - switch (itAttrField->second) { - case AttrField::ReqdWorkGroupSize: - { - status = amd::Comgr::get_metadata_list_size(value, &size); - if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) { - for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { - amd_comgr_metadata_node_t workgroupSize; - status = amd::Comgr::index_list_metadata(value, i, &workgroupSize); - - if (status == AMD_COMGR_STATUS_SUCCESS && - getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) { - kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str())); - } - amd::Comgr::destroy_metadata(workgroupSize); - } - } - } - break; - case AttrField::WorkGroupSizeHint: - { - status = amd::Comgr::get_metadata_list_size(value, &size); - if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) { - for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { - amd_comgr_metadata_node_t workgroupSizeHint; - status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint); - - if (status == AMD_COMGR_STATUS_SUCCESS && - getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) { - kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str())); - } - amd::Comgr::destroy_metadata(workgroupSizeHint); - } - } - } - break; - case AttrField::VecTypeHint: - { - if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { - kernelMD->mAttrs.mVecTypeHint = buf; - } - } - break; - case AttrField::RuntimeHandle: - { - if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { - kernelMD->mAttrs.mRuntimeHandle = buf; - } - } - break; - default: - return AMD_COMGR_STATUS_ERROR; - } - - return status; -} - -static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key, - const amd_comgr_metadata_node_t value, - void *data) { - amd_comgr_status_t status; - amd_comgr_metadata_kind_t kind; - std::string buf; - - // get the key of the argument field - status = amd::Comgr::get_metadata_kind(key, &kind); - if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { - status = getMetaBuf(key, &buf); - } - - if (status != AMD_COMGR_STATUS_SUCCESS) { - return AMD_COMGR_STATUS_ERROR; - } - - auto itCodePropField = CodePropFieldMap.find(buf); - if (itCodePropField == CodePropFieldMap.end()) { - return AMD_COMGR_STATUS_ERROR; - } - - // get the value of the argument field - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = getMetaBuf(value, &buf); - } - - KernelMD* kernelMD = static_cast(data); - switch (itCodePropField->second) { - case CodePropField::KernargSegmentSize: - kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); - break; - case CodePropField::GroupSegmentFixedSize: - kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); - break; - case CodePropField::PrivateSegmentFixedSize: - kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str()); - break; - case CodePropField::KernargSegmentAlign: - kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str()); - break; - case CodePropField::WavefrontSize: - kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str()); - break; - case CodePropField::NumSGPRs: - kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str()); - break; - case CodePropField::NumVGPRs: - kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str()); - break; - case CodePropField::MaxFlatWorkGroupSize: - kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str()); - break; - case CodePropField::IsDynamicCallStack: - kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0); - break; - case CodePropField::IsXNACKEnabled: - kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0); - break; - case CodePropField::NumSpilledSGPRs: - kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str()); - break; - case CodePropField::NumSpilledVGPRs: - kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str()); - break; - default: - return AMD_COMGR_STATUS_ERROR; - } - return AMD_COMGR_STATUS_SUCCESS; -} -#endif - +#endif // defined(USE_COMGR_LIBRARY) } // namespace device diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp index d43d8daedc..f40ae71adb 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.cpp +++ b/projects/clr/rocclr/runtime/device/devprogram.cpp @@ -706,8 +706,6 @@ bool Program::compileImplLC(const std::string& sourceCode, driverOptions.append(options->llvmOptions); driverOptions.append(ProcessOptions(options)); - // Force object code v2. - driverOptions.append(" -mno-code-object-v3"); // Set whole program mode driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink"); @@ -1532,8 +1530,6 @@ bool Program::linkImplLC(amd::option::Options* options) { codegenOptions.append(" -mno-sram-ecc"); } - // Force object code v2. - codegenOptions.append(" -mno-code-object-v3"); // Set whole program mode codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); @@ -1764,8 +1760,6 @@ bool Program::linkImplLC(amd::option::Options* options) { std::ostream_iterator(ostrstr, " ")); codegenOptions.append(" ").append(ostrstr.str()); - // Force object code v2. - codegenOptions.append(" -mno-code-object-v3"); // Set whole program mode codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); @@ -2880,8 +2874,19 @@ bool Program::createKernelMetadataMap() { status = amd::Comgr::metadata_lookup(*metadata_, "Kernels", &kernelsMD); if (status == AMD_COMGR_STATUS_SUCCESS) { + LogInfo("Using Code Object V2."); hasKernelMD = true; - status = amd::Comgr::get_metadata_list_size(kernelsMD, &size); + codeObjectVer_ = 2; + } + else { + status = amd::Comgr::metadata_lookup(*metadata_, "amdhsa.kernels", &kernelsMD); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + LogInfo("Using Code Object V3."); + hasKernelMD = true; + codeObjectVer_ = 3; + status = amd::Comgr::get_metadata_list_size(kernelsMD, &size); + } } for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { @@ -2896,7 +2901,9 @@ bool Program::createKernelMetadataMap() { if (status == AMD_COMGR_STATUS_SUCCESS) { hasKernelNode = true; - status = amd::Comgr::metadata_lookup(kernelNode, "Name", &nameMeta); + status = amd::Comgr::metadata_lookup(kernelNode, + (codeObjectVer() == 2) ? "Name" : ".name", + &nameMeta); } if (status == AMD_COMGR_STATUS_SUCCESS) { @@ -2970,9 +2977,10 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) { buildLog_ += "Error: object code with old metadata is not supported\n"; return false; } - else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ && - note->n_namesz == sizeof "AMD" && - !memcmp(name, "AMD", note->n_namesz)) { + else if ((note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ && + note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) || + (note->n_type == 32 /* NT_AMD_AMDGPU_HSA_METADATA V3 */ && + note->n_namesz == sizeof "AMDGPU" && !memcmp(name, "AMDGPU", note->n_namesz))) { #if defined(USE_COMGR_LIBRARY) amd_comgr_status_t status; amd_comgr_data_t binaryData; diff --git a/projects/clr/rocclr/runtime/device/devprogram.hpp b/projects/clr/rocclr/runtime/device/devprogram.hpp index 8de0653a64..b35cb0771e 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.hpp +++ b/projects/clr/rocclr/runtime/device/devprogram.hpp @@ -118,6 +118,7 @@ class Program : public amd::HeapObject { #if defined(USE_COMGR_LIBRARY) amd_comgr_metadata_node_t* metadata_; //!< COMgr metadata + uint32_t codeObjectVer_; //!< version of code object std::map kernelMetadataMap_; //!< Map of kernel metadata #else CodeObjectMD* metadata_; //!< Runtime metadata @@ -211,6 +212,8 @@ class Program : public amd::HeapObject { auto it = kernelMetadataMap_.find(name); return (it == kernelMetadataMap_.end()) ? nullptr : &(it->second); } + + const uint32_t codeObjectVer() const { return codeObjectVer_; } #else const CodeObjectMD* metadata() const { return metadata_; } #endif diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index 8eb574eed6..5f861f2a6a 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -515,10 +515,9 @@ clk_value_type_t KernelArg::type() const { NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev, const NullProgram& nullprog) - : device::Kernel(gpuNullDev, name), + : device::Kernel(gpuNullDev, name, nullprog), buildError_(CL_BUILD_PROGRAM_FAILURE), gpuDev_(gpuNullDev), - prog_(nullprog), calRef_(NULL), internal_(false), flags_(0), @@ -3027,9 +3026,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum) - : device::Kernel(prog->dev(), name), + : device::Kernel(prog->dev(), name, *prog), compileOptions_(compileOptions), - prog_(*prog), index_(0), code_(NULL), codeSize_(0), diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index a60ada3dad..98d660c470 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -406,7 +406,7 @@ class NullKernel : public device::Kernel { const NullDevice& nullDev() const { return gpuDev_; } //! Returns GPU device object, associated with this kernel - const NullProgram& nullProg() const { return prog_; } + const NullProgram& nullProg() const { return reinterpret_cast(prog_); } //! Returns the kernel's build error const cl_int buildError() const { return buildError_; } @@ -455,7 +455,6 @@ class NullKernel : public device::Kernel { std::string ilSource_; //!< IL source code of this kernel const NullDevice& gpuDev_; //!< GPU device object - const NullProgram& prog_; //!< Reference to the parent program CalImageReference* calRef_; //!< CAL image reference for this kernel bool internal_; //!< Runtime internal ker @@ -847,7 +846,6 @@ class HSAILKernel : public device::Kernel { std::vector arguments_; //!< Vector list of HSAIL Arguments std::string compileOptions_; //!< compile used for finalizing this kernel amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU - const HSAILProgram& prog_; //!< Reference to the parent program uint index_; //!< Kernel index in the program gpu::Memory* code_; //!< Memory object with ISA code diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index 7bbdbfbb5b..e23389876b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -26,37 +26,17 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; namespace pal { -bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { - if (!sym) { - return false; - } - if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { - return false; - } - - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, - reinterpret_cast(&codeSize_))) { - return false; - } - - amd_kernel_code_t* akc = &akc_; - // Copy codeobject of this kernel from the program CPU segment - memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t)); - - size_t akc_align = 0; - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, - reinterpret_cast(&akc_align))) { - return false; - } - +void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize, + const uint32_t groupSegmentSize, + const uint16_t numSGPRs, + const uint16_t numVGPRs) { workGroupInfo_.scratchRegs_ = - amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); - workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; - workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = - akc->workgroup_group_segment_byte_size; - workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; + amd::alignUp(privateSegmentSize, 16) / sizeof(uint); + workGroupInfo_.privateMemSize_ = privateSegmentSize; + workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize; + workGroupInfo_.usedSGPRs_ = numSGPRs; workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; + workGroupInfo_.usedVGPRs_ = numVGPRs; if (!prog().isNull()) { workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu; @@ -72,16 +52,57 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { workGroupInfo_.availableVGPRs_ = 256; workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64; } +} + +bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) { + if (!sym) { + return false; + } + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { + return false; + } + + // Copy code object of this kernel from the program CPU segment + memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t)); + + return true; +} + +bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { + + amd_kernel_code_t* akc = &akc_; + + if (!setKernelCode(sym, akc)) { + return false; + } + + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, + reinterpret_cast(&codeSize_))) { + return false; + } + + // Setup the the workgroup info + setWorkGroupInfo(akc->workitem_private_segment_byte_size, + akc->workgroup_group_segment_byte_size, + akc->wavefront_sgpr_count, + akc->workitem_vgpr_count); + + workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_; + kernargSegmentByteSize_ = akc->kernarg_segment_byte_size; + spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t)); + return true; } HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) - : device::Kernel(prog->dev(), name), + : device::Kernel(prog->dev(), name, *prog), compileOptions_(compileOptions), - prog_(*prog), index_(0), code_(0), - codeSize_(0) + codeSize_(0), + workgroupGroupSegmentByteSize_(0), + kernargSegmentByteSize_(0), + spillSegmentByteSize_(0) { flags_.hsa_ = true; } @@ -388,37 +409,54 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s } #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) -bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { -#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#if defined(USE_COMGR_LIBRARY) +bool LightningKernel::init() { flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; - aqlCreateHWInfo(symbol); - -#if defined(USE_COMGR_LIBRARY) const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name()); if (kernelMetaNode == nullptr) { return false; } KernelMD kernelMD; - if (!GetAttrCodePropMetadata(*kernelMetaNode, argsBufferSize(), &kernelMD)) { + if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) { return false; } + symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName; + + workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize; + spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, + sizeof(uint32_t)); + kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize; + + // Copy codeobject of this kernel from the program CPU segment + hsa_agent_t agent; + agent.handle = 1; + + auto sym = prog().GetSymbol(symbolName().c_str(), const_cast(&agent)); + + if (!setKernelCode(sym, &akc_)) { + return false; + } + + codeSize_ = prog().codeSegGpu().owner()->getSize(); + + // handle device enqueue if (!kernelMD.mAttrs.mRuntimeHandle.empty()) { hsa_agent_t agent; agent.handle = 1; amd::hsa::loader::Symbol* rth_symbol; // Get the runtime handle symbol GPU address - rth_symbol = prog_.GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()), + rth_symbol = prog().GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()), const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); // Copy the kernel_object pointer to the runtime handle symbol GPU address - const Memory& codeSegGpu = prog_.codeSegGpu(); + const Memory& codeSegGpu = prog().codeSegGpu(); uint64_t offset = symbol_address - codeSegGpu.vmAddress(); uint64_t kernel_object = gpuAqlCode(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); @@ -432,6 +470,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true); } + // Setup the the workgroup info + setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize, + kernelMD.mCodeProps.mGroupSegmentFixedSize, + kernelMD.mCodeProps.mNumSGPRs, + kernelMD.mCodeProps.mNumVGPRs); + // Copy wavefront size workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; @@ -452,7 +496,18 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { if (!printfStr.empty()) { InitPrintf(printfStr); } -#else + + return true; +} +#endif // defined(USE_COMGR_LIBRARY) + +bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { +#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) + flags_.internalKernel_ = + (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; + + aqlCreateHWInfo(symbol); + const CodeObjectMD* programMD = prog().metadata(); assert(programMD != nullptr); @@ -489,13 +544,13 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { amd::hsa::loader::Symbol* rth_symbol; // Get the runtime handle symbol GPU address - rth_symbol = prog_.GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()), + rth_symbol = prog().GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()), const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); // Copy the kernel_object pointer to the runtime handle symbol GPU address - const Memory& codeSegGpu = prog_.codeSegGpu(); + const Memory& codeSegGpu = prog().codeSegGpu(); uint64_t offset = symbol_address - codeSegGpu.vmAddress(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); @@ -529,8 +584,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { waveLimiter_.enable(); */ -#endif // defined(USE_COMGR_LIBRARY) -#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) return true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index e5041f3f92..5a1abe07d4 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -59,7 +59,7 @@ class HSAILKernel : public device::Kernel { const HSAILProgram& prog() const; //! Returns LDS size used in this kernel - uint32_t ldsSize() const { return akc_.workgroup_group_segment_byte_size; } + uint32_t ldsSize() const { return workgroupGroupSegmentByteSize_; } //! Returns pointer on CPU to AQL code info const amd_kernel_code_t* cpuAqlCode() const { return &akc_; } @@ -71,10 +71,10 @@ class HSAILKernel : public device::Kernel { size_t aqlCodeSize() const { return codeSize_; } //! Returns the size of argument buffer - size_t argsBufferSize() const { return akc_.kernarg_segment_byte_size; } + size_t argsBufferSize() const { return kernargSegmentByteSize_; } //! Returns spill reg size per workitem - uint32_t spillSegSize() const { return amd::alignUp(akc_.workitem_private_segment_byte_size, sizeof(uint32_t)); } + uint32_t spillSegSize() const { return spillSegmentByteSize_; } //! Returns AQL packet in CPU memory //! if the kernel arguments were successfully loaded, otherwise NULL @@ -102,13 +102,25 @@ class HSAILKernel : public device::Kernel { //! Creates AQL kernel HW info bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym); + //! Get the kernel code and copy the code object from the program CPU segment + bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc); + + //! Set up the workgroup info based on the kernel metadata + void setWorkGroupInfo(const uint32_t privateSegmentSize, + const uint32_t groupSegmentSize, + const uint16_t numSGPRs, + const uint16_t numVGPRs); + std::string compileOptions_; //!< compile used for finalizing this kernel amd_kernel_code_t akc_; //!< AQL kernel code on CPU - const HSAILProgram& prog_; //!< Reference to the parent program uint index_; //!< Kernel index in the program uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code + + uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel + uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer + uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem }; class LightningKernel : public HSAILKernel { @@ -121,6 +133,11 @@ class LightningKernel : public HSAILKernel { //! Initializes the metadata required for this kernel, bool init(amd::hsa::loader::Symbol* symbol); + +#if defined(USE_COMGR_LIBRARY) + //! Initializes the metadata required for this kernel, + bool init(); +#endif }; /*@}*/} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index 1098f8864d..ea2762c0ff 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -747,6 +747,25 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s return false; } +#if defined(USE_COMGR_LIBRARY) + for (const auto &kernelMeta : kernelMetadataMap_) { + auto kernelName = kernelMeta.first; + auto kernel = new LightningKernel(kernelName, this, + options->origOptionStr + ProcessOptions(options)); + kernels()[kernelName] = kernel; + + if (!kernel->init()) { + return false; + } + + kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + + // Find max scratch regs used in the program. It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = + std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + } +#else // Get the list of kernels std::vector kernelNameList; status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList); @@ -781,7 +800,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s maxScratchRegs_ = std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); } - +#endif // defined(USE_COMGR_LIBRARY) DestroySegmentCpuAccess(); #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) return true; diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp index 65d14d2411..7118a6f3c7 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp @@ -166,7 +166,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp, } // Update GPU specific settings and info structure if we have any +#if defined(_WIN32) + ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL}; +#else ModifyMaxWorkload modifyMaxWorkload = {0}; +#endif // APU systems if (palProp.gpuType == Pal::GpuType::Integrated) { @@ -326,18 +330,22 @@ bool Settings::create(const Pal::DeviceProperties& palProp, return false; } -#if defined(_WIN32) - if (modifyMaxWorkload.time > 0) { - OSVERSIONINFOEX versionInfo = {0}; - versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - versionInfo.dwMajorVersion = 6; - versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; + splitSizeForWin7_ = false; - DWORDLONG conditionMask = 0; - VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); - VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); - if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { - maxWorkloadTime_ = modifyMaxWorkload.time; +#if defined(_WIN32) + OSVERSIONINFOEX versionInfo = {0}; + versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + versionInfo.dwMajorVersion = 6; + versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; + + DWORDLONG conditionMask = 0; + VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); + VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); + + if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { + splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7 + if (modifyMaxWorkload.time > 0) { + maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time } } #endif // defined(_WIN32) diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp index 63f61ae99a..b6e1d95441 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp @@ -60,7 +60,8 @@ class Settings : public device::Settings { uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace uint rgpSqttForceDisable_: 1; //!< Disables SQTT - uint reserved_ : 12; + uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7 + uint reserved_ : 11; }; uint value_; }; diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 359331cc85..13d107baa2 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -553,6 +553,11 @@ void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) { void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads, uint instructions) { + if (!dev.settings().splitSizeForWin7_) { + dispatchSplitSize_ = 0; + return; + } + uint64_t workload = threads * instructions; if (maxDispatchWorkload_ < workload) { dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); @@ -2258,6 +2263,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; + dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize(); + dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); // Run AQL dispatch in HW eventBegin(MainEngine); iCmd()->CmdDispatchAql(dispatchParam); diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index c05197fe5d..8a28accfe4 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -25,41 +25,58 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle const uint32_t workgroupGroupSegmentByteSize, const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, const uint32_t kernargSegmentAlignment) - : device::Kernel(prog->dev(), name), - program_(prog), + : device::Kernel(prog->dev(), name, *prog), kernelCodeHandle_(kernelCodeHandle), workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize), workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize), kernargSegmentByteSize_(kernargSegmentByteSize), kernargSegmentAlignment_(kernargSegmentAlignment) {} +Kernel::Kernel(std::string name, Program* prog) + : device::Kernel(prog->dev(), name, *prog), + kernelCodeHandle_(0), + workgroupGroupSegmentByteSize_(0), + workitemPrivateSegmentByteSize_(0), + kernargSegmentByteSize_(0), + kernargSegmentAlignment_(0) {} + #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { - hsa_agent_t hsaDevice = program_->hsaDevice(); + hsa_agent_t hsaDevice = program()->hsaDevice(); const amd_comgr_metadata_node_t* kernelMetaNode = - static_cast(program_)->getKernelMetadata(name()); + static_cast(program())->getKernelMetadata(name()); if (kernelMetaNode == nullptr) { return false; } KernelMD kernelMD; - if (!GetAttrCodePropMetadata(*kernelMetaNode, KernargSegmentByteSize(), &kernelMD)) { + if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) { return false; } + // Set the kernel symbol name and size/alignment based on the kernel metadata + // NOTE: kernel name is used to get the kernel code handle in V2, + // but kernel symbol name is used in V3 + symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName; + workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize; + workitemPrivateSegmentByteSize_ = kernelMD.mCodeProps.mPrivateSegmentFixedSize; + kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize; + kernargSegmentAlignment_ = amd::alignUp(std::max(kernelMD.mCodeProps.mKernargSegmentAlign, 128u), + dev().info().globalMemCacheLineSize_); + // Set the workgroup information for the kernel workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_; assert(workGroupInfo_.availableLDSSize_ > 0); // Get the available SGPRs and VGPRs - std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget(); - if (program_->xnackEnable()) { + std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program()->machineTarget(); + if (program()->xnackEnable()) { targetIdent.append("+xnack"); } - if (program_->sramEccEnable()) { + if (program()->sramEccEnable()) { targetIdent.append("+sram-ecc"); } @@ -67,10 +84,23 @@ bool LightningKernel::init() { return false; } + // Get the kernel code handle + hsa_status_t hsaStatus; + hsa_executable_symbol_t symbol; + hsa_agent_t agent = program()->hsaDevice(); + hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), + symbolName().c_str(), + &agent, &symbol); + if (hsaStatus == HSA_STATUS_SUCCESS) { + hsaStatus = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &kernelCodeHandle_); + } + if (hsaStatus != HSA_STATUS_SUCCESS) { + return false; + } + if (!kernelMD.mAttrs.mRuntimeHandle.empty()) { - hsa_agent_t agent = program_->hsaDevice(); hsa_executable_symbol_t kernelSymbol; - hsa_status_t hsaStatus; int variable_size; uint64_t variable_address; @@ -79,7 +109,7 @@ bool LightningKernel::init() { // object handle of such a kernel. The address of the variable and the kernel code object handle are known // only after the hsa executable is loaded. The below code copies the kernel code object handle to the // address of the variable. - hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), + hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD.mAttrs.mRuntimeHandle.c_str(), &agent, &kernelSymbol); if (hsaStatus == HSA_STATUS_SUCCESS) { @@ -87,11 +117,6 @@ bool LightningKernel::init() { HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &variable_size); } - if (hsaStatus == HSA_STATUS_SUCCESS) { - hsaStatus = hsa_executable_symbol_get_info(kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, - &variable_size); - } if (hsaStatus == HSA_STATUS_SUCCESS) { hsaStatus = hsa_executable_symbol_get_info(kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, @@ -114,7 +139,7 @@ bool LightningKernel::init() { } uint32_t wavefront_size = 0; - if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) != + if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) != HSA_STATUS_SUCCESS) { return false; } @@ -127,7 +152,7 @@ bool LightningKernel::init() { workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs; workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs; workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size; workGroupInfo_.wavefrontSize_ = wavefront_size; workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize; if (workGroupInfo_.size_ == 0) { @@ -135,7 +160,7 @@ bool LightningKernel::init() { } // handle the printf metadata if any - const amd_comgr_metadata_node_t* programMD = static_cast(program_)->metadata(); + const amd_comgr_metadata_node_t* programMD = static_cast(program())->metadata(); assert(programMD != nullptr); std::vector printfStr; @@ -159,10 +184,10 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s } bool LightningKernel::init() { - hsa_agent_t hsaDevice = program_->hsaDevice(); + hsa_agent_t hsaDevice = program()->hsaDevice(); // Pull out metadata from the ELF - const CodeObjectMD* programMD = static_cast(program_)->metadata(); + const CodeObjectMD* programMD = static_cast(program())->metadata(); assert(programMD != nullptr); const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); @@ -172,7 +197,7 @@ bool LightningKernel::init() { InitParameters(*kernelMD, KernargSegmentByteSize()); // Set the workgroup information for the kernel - workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; + workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_; assert(workGroupInfo_.availableLDSSize_ > 0); workGroupInfo_.availableSGPRs_ = 104; workGroupInfo_.availableVGPRs_ = 256; @@ -196,7 +221,7 @@ bool LightningKernel::init() { } if (!kernelMD->mAttrs.mRuntimeHandle.empty()) { - hsa_agent_t agent = program_->hsaDevice(); + hsa_agent_t agent = program()->hsaDevice(); hsa_executable_symbol_t kernelSymbol; hsa_status_t status; int variable_size; @@ -208,7 +233,7 @@ bool LightningKernel::init() { // only after the hsa executable is loaded. The below code copies the kernel code object handle to the // address of the variable. - status = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(), + status = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(), &agent, &kernelSymbol); if (status != HSA_STATUS_SUCCESS) { return false; @@ -239,7 +264,7 @@ bool LightningKernel::init() { } uint32_t wavefront_size = 0; - if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) != + if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) != HSA_STATUS_SUCCESS) { return false; } @@ -258,7 +283,7 @@ bool LightningKernel::init() { workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size; workGroupInfo_.wavefrontSize_ = wavefront_size; @@ -278,18 +303,18 @@ bool LightningKernel::init() { bool HSAILKernel::init() { acl_error errorCode; // compile kernel down to ISA - hsa_agent_t hsaDevice = program_->hsaDevice(); + hsa_agent_t hsaDevice = program()->hsaDevice(); // Pull out metadata from the ELF size_t sizeOfArgList; - aclCompiler* compileHandle = program_->dev().compiler(); + aclCompiler* compileHandle = program()->dev().compiler(); std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY, + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); if (errorCode != ACL_SUCCESS) { return false; } std::unique_ptr argList(new char[sizeOfArgList]); - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY, + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), argList.get(), &sizeOfArgList); if (errorCode != ACL_SUCCESS) { return false; @@ -300,17 +325,17 @@ bool HSAILKernel::init() { // Set the workgroup information for the kernel memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; + workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_; assert(workGroupInfo_.availableLDSSize_ > 0); workGroupInfo_.availableSGPRs_ = 104; workGroupInfo_.availableVGPRs_ = 256; size_t sizeOfWorkGroupSize; - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE, + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); if (errorCode != ACL_SUCCESS) { return false; } - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE, + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); if (errorCode != ACL_SUCCESS) { @@ -319,7 +344,7 @@ bool HSAILKernel::init() { uint32_t wavefront_size = 0; if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) { + hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) { return false; } assert(wavefront_size > 0); @@ -344,18 +369,18 @@ bool HSAILKernel::init() { } workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size; workGroupInfo_.wavefrontSize_ = wavefront_size; if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * workGroupInfo_.compileSize_[2]; } else { - workGroupInfo_.size_ = program_->dev().info().preferredWorkGroupSize_; + workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_; } // Pull out printf metadata from the ELF size_t sizeOfPrintfList; - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList); if (errorCode != ACL_SUCCESS) { return false; @@ -367,7 +392,7 @@ bool HSAILKernel::init() { if (!aclPrintfList) { return false; } - errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), + errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList); if (errorCode != ACL_SUCCESS) { diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp index 8bf6ed674c..46fc1c2339 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp @@ -22,6 +22,8 @@ class Kernel : public device::Kernel { const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, const uint32_t kernargSegmentAlignment); + Kernel(std::string name, Program* prog); + const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; } const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; } @@ -37,15 +39,15 @@ class Kernel : public device::Kernel { //! Initializes the metadata required for this kernel virtual bool init() = 0; - const Program* program() const { return static_cast(program_); } + const Program* program() const { return static_cast(&prog_); } protected: - Program* program_; //!< The roc::Program context +// Program* program_; //!< The roc::Program context uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) - const uint32_t workgroupGroupSegmentByteSize_; - const uint32_t workitemPrivateSegmentByteSize_; - const uint32_t kernargSegmentByteSize_; - const uint32_t kernargSegmentAlignment_; + uint32_t workgroupGroupSegmentByteSize_; + uint32_t workitemPrivateSegmentByteSize_; + uint32_t kernargSegmentByteSize_; + uint32_t kernargSegmentAlignment_; size_t kernelDirectiveOffset_; }; @@ -74,6 +76,10 @@ class LightningKernel : public roc::Kernel { : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize, workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) { } + + LightningKernel(std::string name, Program* prog) + : roc::Kernel(name, prog) {} + //! Initializes the metadata required for this kernel virtual bool init() final; }; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp index 444377aeb2..191e58c0e7 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp @@ -488,6 +488,19 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s return false; } +#if defined(USE_COMGR_LIBRARY) + for (const auto &kernelMeta : kernelMetadataMap_) { + const std::string kernelName = kernelMeta.first; + Kernel* aKernel = new roc::LightningKernel(kernelName, this); + if (!aKernel->init()) { + return false; + } + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != + std::string::npos); + kernels()[kernelName] = aKernel; + } +#else // Get the list of kernels std::vector kernelNameList; status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback, @@ -582,6 +595,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s std::string::npos); kernels()[kernelName] = aKernel; } +#endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) return true; }