From d1f62b92c100942a7d7222c08ca5abfa724f49e9 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 9 Apr 2019 23:24:10 -0400
Subject: [PATCH] P4 to Git Change 1767752 by wchau@wchau_OCL_Linux on
2019/04/09 22:58:03
SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata
- Add support for the V3 code objects
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#336 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#134 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/inc/core/palCmdBuffer.h#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx6/gfx6ComputeCmdBuffer.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx9/gfx9ComputeCmdBuffer.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#77 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#130 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#103 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#47 edit
[ROCm/clr commit: 36a5f2a85f390c8e9ea34e5516584fc1afb3cbe8]
---
.../clr/rocclr/runtime/device/devkernel.cpp | 568 +++++++++++++++++-
.../clr/rocclr/runtime/device/devkernel.hpp | 394 ++++--------
.../clr/rocclr/runtime/device/devprogram.cpp | 30 +-
.../clr/rocclr/runtime/device/devprogram.hpp | 3 +
.../rocclr/runtime/device/gpu/gpukernel.cpp | 6 +-
.../rocclr/runtime/device/gpu/gpukernel.hpp | 4 +-
.../rocclr/runtime/device/pal/palkernel.cpp | 144 +++--
.../rocclr/runtime/device/pal/palkernel.hpp | 25 +-
.../rocclr/runtime/device/pal/palprogram.cpp | 21 +-
.../rocclr/runtime/device/pal/palsettings.cpp | 30 +-
.../rocclr/runtime/device/pal/palsettings.hpp | 3 +-
.../rocclr/runtime/device/pal/palvirtual.cpp | 7 +
.../rocclr/runtime/device/rocm/rockernel.cpp | 101 ++--
.../rocclr/runtime/device/rocm/rockernel.hpp | 18 +-
.../rocclr/runtime/device/rocm/rocprogram.cpp | 14 +
15 files changed, 951 insertions(+), 417 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
index 661c8d1127..761610a12e 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -29,10 +29,487 @@ using llvm::AMDGPU::HSAMD::ValueType;
namespace device {
+#if defined(USE_COMGR_LIBRARY)
+static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
+ const amd_comgr_metadata_node_t value,
+ void *data) {
+ amd_comgr_status_t status;
+ amd_comgr_metadata_kind_t kind;
+ std::string buf;
+
+ // get the key of the argument field
+ size_t size = 0;
+ status = amd::Comgr::get_metadata_kind(key, &kind);
+ if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(key, &buf);
+ }
+
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ auto itArgField = ArgFieldMap.find(buf);
+ if (itArgField == ArgFieldMap.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ // get the value of the argument field
+ status = getMetaBuf(value, &buf);
+
+ KernelArgMD* lcArg = static_cast(data);
+
+ switch (itArgField->second) {
+ case ArgField::Name:
+ lcArg->mName = buf;
+ break;
+ case ArgField::TypeName:
+ lcArg->mTypeName = buf;
+ break;
+ case ArgField::Size:
+ lcArg->mSize = atoi(buf.c_str());
+ break;
+ case ArgField::Align:
+ lcArg->mAlign = atoi(buf.c_str());
+ break;
+ case ArgField::ValueKind:
+ {
+ auto itValueKind = ArgValueKind.find(buf);
+ if (itValueKind == ArgValueKind.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mValueKind = itValueKind->second;
+ }
+ break;
+ case ArgField::ValueType:
+ {
+ auto itValueType = ArgValueType.find(buf);
+ if (itValueType == ArgValueType.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mValueType = itValueType->second;
+ }
+ break;
+ case ArgField::PointeeAlign:
+ lcArg->mPointeeAlign = atoi(buf.c_str());
+ break;
+ case ArgField::AddrSpaceQual:
+ {
+ auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
+ if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
+ }
+ break;
+ case ArgField::AccQual:
+ {
+ auto itAccQual = ArgAccQual.find(buf);
+ if (itAccQual == ArgAccQual.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mAccQual = itAccQual->second;
+ }
+ break;
+ case ArgField::ActualAccQual:
+ {
+ auto itAccQual = ArgAccQual.find(buf);
+ if (itAccQual == ArgAccQual.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mActualAccQual = itAccQual->second;
+ }
+ break;
+ case ArgField::IsConst:
+ lcArg->mIsConst = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsRestrict:
+ lcArg->mIsRestrict = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsVolatile:
+ lcArg->mIsVolatile = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsPipe:
+ lcArg->mIsPipe = (buf.compare("true") == 0);
+ break;
+ default:
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
+ const amd_comgr_metadata_node_t value,
+ void *data) {
+ amd_comgr_status_t status;
+ amd_comgr_metadata_kind_t kind;
+ size_t size = 0;
+ std::string buf;
+
+ // get the key of the argument field
+ status = amd::Comgr::get_metadata_kind(key, &kind);
+ if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(key, &buf);
+ }
+
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ auto itAttrField = AttrFieldMap.find(buf);
+ if (itAttrField == AttrFieldMap.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ KernelMD* kernelMD = static_cast(data);
+ switch (itAttrField->second) {
+ case AttrField::ReqdWorkGroupSize:
+ {
+ status = amd::Comgr::get_metadata_list_size(value, &size);
+ if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
+ for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+ amd_comgr_metadata_node_t workgroupSize;
+ status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
+
+ if (status == AMD_COMGR_STATUS_SUCCESS &&
+ getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
+ }
+ amd::Comgr::destroy_metadata(workgroupSize);
+ }
+ }
+ }
+ break;
+ case AttrField::WorkGroupSizeHint:
+ {
+ status = amd::Comgr::get_metadata_list_size(value, &size);
+ if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
+ for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+ amd_comgr_metadata_node_t workgroupSizeHint;
+ status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
+
+ if (status == AMD_COMGR_STATUS_SUCCESS &&
+ getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
+ }
+ amd::Comgr::destroy_metadata(workgroupSizeHint);
+ }
+ }
+ }
+ break;
+ case AttrField::VecTypeHint:
+ {
+ if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mVecTypeHint = buf;
+ }
+ }
+ break;
+ case AttrField::RuntimeHandle:
+ {
+ if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mRuntimeHandle = buf;
+ }
+ }
+ break;
+ default:
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ return status;
+}
+
+static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
+ const amd_comgr_metadata_node_t value,
+ void *data) {
+ amd_comgr_status_t status;
+ amd_comgr_metadata_kind_t kind;
+ std::string buf;
+
+ // get the key of the argument field
+ status = amd::Comgr::get_metadata_kind(key, &kind);
+ if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(key, &buf);
+ }
+
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ auto itCodePropField = CodePropFieldMap.find(buf);
+ if (itCodePropField == CodePropFieldMap.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ // get the value of the argument field
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(value, &buf);
+ }
+
+ KernelMD* kernelMD = static_cast(data);
+ switch (itCodePropField->second) {
+ case CodePropField::KernargSegmentSize:
+ kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
+ break;
+ case CodePropField::GroupSegmentFixedSize:
+ kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
+ break;
+ case CodePropField::PrivateSegmentFixedSize:
+ kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
+ break;
+ case CodePropField::KernargSegmentAlign:
+ kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
+ break;
+ case CodePropField::WavefrontSize:
+ kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
+ break;
+ case CodePropField::NumSGPRs:
+ kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
+ break;
+ case CodePropField::NumVGPRs:
+ kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
+ break;
+ case CodePropField::MaxFlatWorkGroupSize:
+ kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
+ break;
+ case CodePropField::IsDynamicCallStack:
+ kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
+ break;
+ case CodePropField::IsXNACKEnabled:
+ kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
+ break;
+ case CodePropField::NumSpilledSGPRs:
+ kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
+ break;
+ case CodePropField::NumSpilledVGPRs:
+ kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
+ break;
+ default:
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key,
+ const amd_comgr_metadata_node_t value,
+ void *data) {
+ amd_comgr_status_t status;
+ amd_comgr_metadata_kind_t kind;
+ std::string buf;
+
+ // get the key of the argument field
+ size_t size = 0;
+ status = amd::Comgr::get_metadata_kind(key, &kind);
+ if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(key, &buf);
+ }
+
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ auto itArgField = ArgFieldMapV3.find(buf);
+ if (itArgField == ArgFieldMapV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ // get the value of the argument field
+ status = getMetaBuf(value, &buf);
+
+ KernelArgMD* lcArg = static_cast(data);
+
+ switch (itArgField->second) {
+ case ArgField::Name:
+ lcArg->mName = buf;
+ break;
+ case ArgField::TypeName:
+ lcArg->mTypeName = buf;
+ break;
+ case ArgField::Size:
+ lcArg->mSize = atoi(buf.c_str());
+ break;
+ case ArgField::Offset:
+ lcArg->mOffset = atoi(buf.c_str());
+ break;
+ case ArgField::ValueKind:
+ {
+ auto itValueKind = ArgValueKindV3.find(buf);
+ if (itValueKind == ArgValueKindV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mValueKind = itValueKind->second;
+ }
+ break;
+ case ArgField::ValueType:
+ {
+ auto itValueType = ArgValueTypeV3.find(buf);
+ if (itValueType == ArgValueTypeV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mValueType = itValueType->second;
+ }
+ break;
+ case ArgField::PointeeAlign:
+ lcArg->mPointeeAlign = atoi(buf.c_str());
+ break;
+ case ArgField::AddrSpaceQual:
+ {
+ auto itAddrSpaceQual = ArgAddrSpaceQualV3.find(buf);
+ if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
+ }
+ break;
+ case ArgField::AccQual:
+ {
+ auto itAccQual = ArgAccQualV3.find(buf);
+ if (itAccQual == ArgAccQualV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mAccQual = itAccQual->second;
+ }
+ break;
+ case ArgField::ActualAccQual:
+ {
+ auto itAccQual = ArgAccQualV3.find(buf);
+ if (itAccQual == ArgAccQualV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ lcArg->mActualAccQual = itAccQual->second;
+ }
+ break;
+ case ArgField::IsConst:
+ lcArg->mIsConst = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsRestrict:
+ lcArg->mIsRestrict = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsVolatile:
+ lcArg->mIsVolatile = (buf.compare("true") == 0);
+ break;
+ case ArgField::IsPipe:
+ lcArg->mIsPipe = (buf.compare("true") == 0);
+ break;
+ default:
+ return AMD_COMGR_STATUS_ERROR;
+ }
+ return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateKernelMetaV3(const amd_comgr_metadata_node_t key,
+ const amd_comgr_metadata_node_t value,
+ void *data) {
+ amd_comgr_status_t status;
+ amd_comgr_metadata_kind_t kind;
+ size_t size = 0;
+ std::string buf;
+
+ // get the key of the argument field
+ status = amd::Comgr::get_metadata_kind(key, &kind);
+ if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(key, &buf);
+ }
+
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ auto itKernelField = KernelFieldMapV3.find(buf);
+ if (itKernelField == KernelFieldMapV3.end()) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ if (itKernelField->second != KernelField::ReqdWorkGroupSize &&
+ itKernelField->second != KernelField::WorkGroupSizeHint) {
+ status = getMetaBuf(value,&buf);
+ }
+ if (status != AMD_COMGR_STATUS_SUCCESS) {
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ KernelMD* kernelMD = static_cast(data);
+ switch (itKernelField->second) {
+ case KernelField::ReqdWorkGroupSize:
+ status = amd::Comgr::get_metadata_list_size(value, &size);
+ if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
+ for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+ amd_comgr_metadata_node_t workgroupSize;
+ status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
+
+ if (status == AMD_COMGR_STATUS_SUCCESS &&
+ getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
+ }
+ amd::Comgr::destroy_metadata(workgroupSize);
+ }
+ }
+ break;
+ case KernelField::WorkGroupSizeHint:
+ status = amd::Comgr::get_metadata_list_size(value, &size);
+ if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
+ for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+ amd_comgr_metadata_node_t workgroupSizeHint;
+ status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
+
+ if (status == AMD_COMGR_STATUS_SUCCESS &&
+ getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+ kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
+ }
+ amd::Comgr::destroy_metadata(workgroupSizeHint);
+ }
+ }
+ break;
+ case KernelField::VecTypeHint:
+ kernelMD->mAttrs.mVecTypeHint = buf;
+ break;
+ case KernelField::DeviceEnqueueSymbol:
+ kernelMD->mAttrs.mRuntimeHandle = buf;
+ break;
+ case KernelField::KernargSegmentSize:
+ kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
+ break;
+ case KernelField::GroupSegmentFixedSize:
+ kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
+ break;
+ case KernelField::PrivateSegmentFixedSize:
+ kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
+ break;
+ case KernelField::KernargSegmentAlign:
+ kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
+ break;
+ case KernelField::WavefrontSize:
+ kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
+ break;
+ case KernelField::NumSGPRs:
+ kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
+ break;
+ case KernelField::NumVGPRs:
+ kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
+ break;
+ case KernelField::MaxFlatWorkGroupSize:
+ kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
+ break;
+ case KernelField::NumSpilledSGPRs:
+ kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
+ break;
+ case KernelField::NumSpilledVGPRs:
+ kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
+ break;
+ case KernelField::SymbolName:
+ kernelMD->mSymbolName = buf;
+ break;
+ default:
+ return AMD_COMGR_STATUS_ERROR;
+ }
+
+ return status;
+}
+#endif
+
// ================================================================================================
-Kernel::Kernel(const amd::Device& dev, const std::string& name)
+Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& prog)
: dev_(dev)
, name_(name)
+ , prog_(prog)
, signature_(nullptr)
, waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
@@ -482,6 +959,8 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; }
+
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
#endif
@@ -771,11 +1250,10 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
-bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
- const uint32_t kernargSegmentByteSize,
- KernelMD* kernelMD) {
+bool Kernel::GetAttrCodePropMetadata( const amd_comgr_metadata_node_t kernelMetaNode,
+ KernelMD* kernelMD) {
- InitParameters(kernelMetaNode, kernargSegmentByteSize);
+ InitParameters(kernelMetaNode);
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
@@ -784,25 +1262,49 @@ bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaN
workGroupInfo_.availableVGPRs_ = 256;
// extract the attribute metadata if there is any
- amd_comgr_metadata_node_t attrMeta;
amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS;
- if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) {
- status = amd::Comgr::iterate_map_metadata(attrMeta, device::populateAttrs,
- static_cast(kernelMD));
- amd::Comgr::destroy_metadata(attrMeta);
+
+ switch (codeObjectVer()) {
+ case 2: {
+ amd_comgr_metadata_node_t symbolName;
+ status = amd::Comgr::metadata_lookup(kernelMetaNode, "SymbolName", &symbolName);
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ status = getMetaBuf(symbolName, &(kernelMD->mSymbolName));
+ amd::Comgr::destroy_metadata(symbolName);
+ }
+
+ amd_comgr_metadata_node_t attrMeta;
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) ==
+ AMD_COMGR_STATUS_SUCCESS) {
+ status = amd::Comgr::iterate_map_metadata(attrMeta, populateAttrs,
+ static_cast(kernelMD));
+ amd::Comgr::destroy_metadata(attrMeta);
+ }
+ }
+
+ // extract the code properties metadata
+ amd_comgr_metadata_node_t codePropsMeta;
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
+ }
+
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ status = amd::Comgr::iterate_map_metadata(codePropsMeta, populateCodeProps,
+ static_cast(kernelMD));
+ amd::Comgr::destroy_metadata(codePropsMeta);
+ }
+ }
+ break;
+ case 3: {
+ status = amd::Comgr::iterate_map_metadata(kernelMetaNode, populateKernelMetaV3,
+ static_cast(kernelMD));
+ }
+ break;
+ default:
+ return false;
}
- // extract the code properties metadata
- amd_comgr_metadata_node_t codePropsMeta;
- if (status == AMD_COMGR_STATUS_SUCCESS) {
- status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
- }
-
- if (status == AMD_COMGR_STATUS_SUCCESS) {
- status = amd::Comgr::iterate_map_metadata(codePropsMeta, device::populateCodeProps,
- static_cast(kernelMD));
- amd::Comgr::destroy_metadata(codePropsMeta);
- }
if (status != AMD_COMGR_STATUS_SUCCESS) {
return false;
@@ -917,19 +1419,21 @@ bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD,
return (status == AMD_COMGR_STATUS_SUCCESS);
}
-void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) {
+void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) {
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
- size_t offsetStruct = argBufferSize;
amd_comgr_metadata_node_t argsMeta;
bool hsaArgsMeta = false;
size_t argsSize;
- amd_comgr_status_t status = amd::Comgr::metadata_lookup(kernelMD, "Args", &argsMeta);
+ amd_comgr_status_t status = amd::Comgr::metadata_lookup(
+ kernelMD,
+ (codeObjectVer() == 2) ? "Args" : ".args",
+ &argsMeta);
if (status == AMD_COMGR_STATUS_SUCCESS) {
hsaArgsMeta = true;
status = amd::Comgr::get_metadata_list_size(argsMeta, &argsSize);
@@ -940,7 +1444,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
}
for (size_t i = 0; i < argsSize; ++i) {
- KernelArgMD lcArg;
+ KernelArgMD lcArg;
amd_comgr_metadata_node_t argsNode;
amd_comgr_metadata_kind_t kind;
@@ -956,7 +1460,13 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
status = AMD_COMGR_STATUS_ERROR;
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
- status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, static_cast(&lcArg));
+ void *data = static_cast(&lcArg);
+ if (codeObjectVer() == 2) {
+ status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data);
+ }
+ else if (codeObjectVer() == 3) {
+ status = amd::Comgr::iterate_map_metadata(argsNode, populateArgsV3, data);
+ }
}
if (hsaArgsNode) {
@@ -971,7 +1481,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
}
size_t size = GetArgSizeOCL(lcArg);
- size_t alignment = GetArgAlignmentOCL(lcArg);
+ size_t alignment = (codeObjectVer() == 2) ? GetArgAlignmentOCL(lcArg) : 0;
bool isHidden = false;
desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
@@ -980,7 +1490,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
setDynamicParallelFlag(true);
}
- offset = amd::alignUp(offset, alignment);
+ offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
desc.offset_ = offset;
desc.size_ = size;
offset += size;
@@ -1007,7 +1517,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
offset += sizeof(uint64_t);
}
else {
- offset = amd::alignUp(offset, alignment);
+ offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
desc.offset_ = offset;
offset += size;
}
diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp
index 6d1f289460..2ca7af5395 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.hpp
@@ -34,6 +34,7 @@ using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
using llvm::AMDGPU::HSAMD::ValueKind;
using llvm::AMDGPU::HSAMD::ValueType;
+// for Code Object V3
enum class ArgField : uint8_t {
Name = 0,
TypeName = 1,
@@ -48,7 +49,8 @@ enum class ArgField : uint8_t {
IsConst = 10,
IsRestrict = 11,
IsVolatile = 12,
- IsPipe = 13
+ IsPipe = 13,
+ Offset = 14
};
enum class AttrField : uint8_t {
@@ -167,6 +169,116 @@ static const std::map CodePropFieldMap =
{"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs},
{"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs}
};
+
+// for Code Object V3
+enum class KernelField : uint8_t {
+ SymbolName = 0,
+ ReqdWorkGroupSize = 1,
+ WorkGroupSizeHint = 2,
+ VecTypeHint = 3,
+ DeviceEnqueueSymbol = 4,
+ KernargSegmentSize = 5,
+ GroupSegmentFixedSize = 6,
+ PrivateSegmentFixedSize = 7,
+ KernargSegmentAlign = 8,
+ WavefrontSize = 9,
+ NumSGPRs = 10,
+ NumVGPRs = 11,
+ MaxFlatWorkGroupSize = 12,
+ NumSpilledSGPRs = 13,
+ NumSpilledVGPRs = 14
+};
+
+static const std::map ArgFieldMapV3 =
+{
+ {".name", ArgField::Name},
+ {".type_name", ArgField::TypeName},
+ {".size", ArgField::Size},
+ {".offset", ArgField::Offset},
+ {".value_kind", ArgField::ValueKind},
+ {".vaule_type", ArgField::ValueType},
+ {".pointee_align", ArgField::PointeeAlign},
+ {".address_space", ArgField::AddrSpaceQual},
+ {".access", ArgField::AccQual},
+ {".actual_access", ArgField::ActualAccQual},
+ {".is_const", ArgField::IsConst},
+ {".is_restrict", ArgField::IsRestrict},
+ {".is_volatile", ArgField::IsVolatile},
+ {".is_pipe", ArgField::IsPipe}
+};
+
+static const std::map ArgValueKindV3 =
+{
+ {"by_value", ValueKind::ByValue},
+ {"global_buffer", ValueKind::GlobalBuffer},
+ {"dynamic_shared_pointer", ValueKind::DynamicSharedPointer},
+ {"sampler", ValueKind::Sampler},
+ {"image", ValueKind::Image},
+ {"pipe", ValueKind::Pipe},
+ {"queue", ValueKind::Queue},
+ {"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX},
+ {"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY},
+ {"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ},
+ {"hidden_none", ValueKind::HiddenNone},
+ {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
+ {"hidden_default_queue", ValueKind::HiddenDefaultQueue},
+ {"hidden_completion_action", ValueKind::HiddenCompletionAction}
+};
+
+static const std::map ArgValueTypeV3 =
+{
+ {"struct", ValueType::Struct},
+ {"i8", ValueType::I8},
+ {"u8", ValueType::U8},
+ {"i16", ValueType::I16},
+ {"u16", ValueType::U16},
+ {"f16", ValueType::F16},
+ {"i32", ValueType::I32},
+ {"u32", ValueType::U32},
+ {"f32", ValueType::F32},
+ {"i64", ValueType::I64},
+ {"u64", ValueType::U64},
+ {"f64", ValueType::F64}
+};
+
+static const std::map ArgAccQualV3 =
+{
+ {"default", AccessQualifier::Default},
+ {"read_only", AccessQualifier::ReadOnly},
+ {"write_only", AccessQualifier::WriteOnly},
+ {"read_write", AccessQualifier::ReadWrite}
+};
+
+static const std::map ArgAddrSpaceQualV3 =
+{
+ {"private", AddressSpaceQualifier::Private},
+ {"global", AddressSpaceQualifier::Global},
+ {"constant", AddressSpaceQualifier::Constant},
+ {"local", AddressSpaceQualifier::Local},
+ {"generic", AddressSpaceQualifier::Generic},
+ {"region", AddressSpaceQualifier::Region}
+};
+
+static const std::map KernelFieldMapV3 =
+{
+ {".symbol", KernelField::SymbolName},
+ {".reqd_workgroup_size", KernelField::ReqdWorkGroupSize},
+ {".workgorup_size_hint", KernelField::WorkGroupSizeHint},
+ {".vec_type_hint", KernelField::VecTypeHint},
+ {".device_enqueue_symbol", KernelField::DeviceEnqueueSymbol},
+ {".kernarg_segment_size", KernelField::KernargSegmentSize},
+ {".group_segment_fixed_size", KernelField::GroupSegmentFixedSize},
+ {".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize},
+ {".kernarg_segment_align", KernelField::KernargSegmentAlign},
+ {".wavefront_size", KernelField::WavefrontSize},
+ {".sgpr_count", KernelField::NumSGPRs},
+ {".vgpr_count", KernelField::NumVGPRs},
+ {".max_flat_workgroup_size", KernelField::MaxFlatWorkGroupSize},
+ {".sgpr_spill_count", KernelField::NumSpilledSGPRs},
+ {".vgpr_spill_count", KernelField::NumSpilledVGPRs}
+};
+
+
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -234,6 +346,8 @@ struct KernelParameterDescriptor {
namespace device {
+class Program;
+
//! Printf info structure
struct PrintfInfo {
std::string fmtString_; //!< formated string for printf
@@ -272,7 +386,7 @@ class Kernel : public amd::HeapObject {
};
//! Default constructor
- Kernel(const amd::Device& dev, const std::string& name);
+ Kernel(const amd::Device& dev, const std::string& name, const Program& prog);
//! Default destructor
virtual ~Kernel();
@@ -372,7 +486,7 @@ class Kernel : public amd::HeapObject {
//! Initializes the abstraction layer kernel parameters
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
- void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize);
+ void InitParameters(const amd_comgr_metadata_node_t kernelMD);
//! Get ther kernel metadata
bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD,
@@ -381,7 +495,6 @@ class Kernel : public amd::HeapObject {
//! Retrieve kernel attribute and code properties metadata
bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
- const uint32_t kernargSegmentByteSize,
KernelMD* kernelMD);
//! Retrieve the available SGPRs and VGPRs
@@ -390,6 +503,12 @@ class Kernel : public amd::HeapObject {
//! Retrieve the printf string metadata
bool GetPrintfStr(const amd_comgr_metadata_node_t programMD,
std::vector* printfStr);
+
+ //! Returns the kernel symbol name
+ const std::string& symbolName() const { return symbolName_; }
+
+ //! Returns the kernel code object version
+ const uint32_t codeObjectVer() const { return prog().codeObjectVer(); }
#else
void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
#endif
@@ -404,8 +523,13 @@ class Kernel : public amd::HeapObject {
//! Initializes HSAIL Printf metadata and info
void InitPrintf(const aclPrintfFmt* aclPrintf);
#endif
+ //! Returns program associated with this kernel
+ const Program& prog() const { return prog_; }
+
const amd::Device& dev_; //!< GPU device object
std::string name_; //!< kernel name
+ const Program& prog_; //!< Reference to the parent program
+ std::string symbolName_; //!< kernel symbol name
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
amd::KernelSignature* signature_; //!< kernel signature
std::string buildLog_; //!< build log
@@ -424,6 +548,7 @@ class Kernel : public amd::HeapObject {
Flags() : value_(0) {}
} flags_;
+
private:
//! Disable default copy constructor
Kernel(const Kernel&);
@@ -447,264 +572,5 @@ static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta,
return status;
}
-
-static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
- const amd_comgr_metadata_node_t value,
- void *data) {
- amd_comgr_status_t status;
- amd_comgr_metadata_kind_t kind;
- std::string buf;
-
- // get the key of the argument field
- size_t size = 0;
- status = amd::Comgr::get_metadata_kind(key, &kind);
- if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
- status = getMetaBuf(key, &buf);
- }
-
- if (status != AMD_COMGR_STATUS_SUCCESS) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- auto itArgField = ArgFieldMap.find(buf);
- if (itArgField == ArgFieldMap.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- // get the value of the argument field
- status = getMetaBuf(value, &buf);
-
- KernelArgMD* lcArg = static_cast(data);
-
- switch (itArgField->second) {
- case ArgField::Name:
- lcArg->mName = buf;
- break;
- case ArgField::TypeName:
- lcArg->mTypeName = buf;
- break;
- case ArgField::Size:
- lcArg->mSize = atoi(buf.c_str());
- break;
- case ArgField::Align:
- lcArg->mAlign = atoi(buf.c_str());
- break;
- case ArgField::ValueKind:
- {
- auto itValueKind = ArgValueKind.find(buf);
- if (itValueKind == ArgValueKind.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
- lcArg->mValueKind = itValueKind->second;
- }
- break;
- case ArgField::ValueType:
- {
- auto itValueType = ArgValueType.find(buf);
- if (itValueType == ArgValueType.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
- lcArg->mValueType = itValueType->second;
- }
- break;
- case ArgField::PointeeAlign:
- lcArg->mPointeeAlign = atoi(buf.c_str());
- break;
- case ArgField::AddrSpaceQual:
- {
- auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
- if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
- lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
- }
- break;
- case ArgField::AccQual:
- {
- auto itAccQual = ArgAccQual.find(buf);
- if (itAccQual == ArgAccQual.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
- lcArg->mAccQual = itAccQual->second;
- }
- break;
- case ArgField::ActualAccQual:
- {
- auto itAccQual = ArgAccQual.find(buf);
- if (itAccQual == ArgAccQual.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
- lcArg->mActualAccQual = itAccQual->second;
- }
- break;
- case ArgField::IsConst:
- lcArg->mIsConst = (buf.compare("true") == 0);
- break;
- case ArgField::IsRestrict:
- lcArg->mIsRestrict = (buf.compare("true") == 0);
- break;
- case ArgField::IsVolatile:
- lcArg->mIsVolatile = (buf.compare("true") == 0);
- break;
- case ArgField::IsPipe:
- lcArg->mIsPipe = (buf.compare("true") == 0);
- break;
- default:
- return AMD_COMGR_STATUS_ERROR;
- }
- return AMD_COMGR_STATUS_SUCCESS;
-}
-
-static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
- const amd_comgr_metadata_node_t value,
- void *data) {
- amd_comgr_status_t status;
- amd_comgr_metadata_kind_t kind;
- size_t size = 0;
- std::string buf;
-
- // get the key of the argument field
- status = amd::Comgr::get_metadata_kind(key, &kind);
- if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
- status = getMetaBuf(key, &buf);
- }
-
- if (status != AMD_COMGR_STATUS_SUCCESS) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- auto itAttrField = AttrFieldMap.find(buf);
- if (itAttrField == AttrFieldMap.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- KernelMD* kernelMD = static_cast(data);
- switch (itAttrField->second) {
- case AttrField::ReqdWorkGroupSize:
- {
- status = amd::Comgr::get_metadata_list_size(value, &size);
- if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
- for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
- amd_comgr_metadata_node_t workgroupSize;
- status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
-
- if (status == AMD_COMGR_STATUS_SUCCESS &&
- getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
- kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
- }
- amd::Comgr::destroy_metadata(workgroupSize);
- }
- }
- }
- break;
- case AttrField::WorkGroupSizeHint:
- {
- status = amd::Comgr::get_metadata_list_size(value, &size);
- if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
- for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
- amd_comgr_metadata_node_t workgroupSizeHint;
- status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
-
- if (status == AMD_COMGR_STATUS_SUCCESS &&
- getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
- kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
- }
- amd::Comgr::destroy_metadata(workgroupSizeHint);
- }
- }
- }
- break;
- case AttrField::VecTypeHint:
- {
- if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
- kernelMD->mAttrs.mVecTypeHint = buf;
- }
- }
- break;
- case AttrField::RuntimeHandle:
- {
- if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
- kernelMD->mAttrs.mRuntimeHandle = buf;
- }
- }
- break;
- default:
- return AMD_COMGR_STATUS_ERROR;
- }
-
- return status;
-}
-
-static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
- const amd_comgr_metadata_node_t value,
- void *data) {
- amd_comgr_status_t status;
- amd_comgr_metadata_kind_t kind;
- std::string buf;
-
- // get the key of the argument field
- status = amd::Comgr::get_metadata_kind(key, &kind);
- if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
- status = getMetaBuf(key, &buf);
- }
-
- if (status != AMD_COMGR_STATUS_SUCCESS) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- auto itCodePropField = CodePropFieldMap.find(buf);
- if (itCodePropField == CodePropFieldMap.end()) {
- return AMD_COMGR_STATUS_ERROR;
- }
-
- // get the value of the argument field
- if (status == AMD_COMGR_STATUS_SUCCESS) {
- status = getMetaBuf(value, &buf);
- }
-
- KernelMD* kernelMD = static_cast(data);
- switch (itCodePropField->second) {
- case CodePropField::KernargSegmentSize:
- kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
- break;
- case CodePropField::GroupSegmentFixedSize:
- kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
- break;
- case CodePropField::PrivateSegmentFixedSize:
- kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
- break;
- case CodePropField::KernargSegmentAlign:
- kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
- break;
- case CodePropField::WavefrontSize:
- kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
- break;
- case CodePropField::NumSGPRs:
- kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
- break;
- case CodePropField::NumVGPRs:
- kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
- break;
- case CodePropField::MaxFlatWorkGroupSize:
- kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
- break;
- case CodePropField::IsDynamicCallStack:
- kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
- break;
- case CodePropField::IsXNACKEnabled:
- kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
- break;
- case CodePropField::NumSpilledSGPRs:
- kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
- break;
- case CodePropField::NumSpilledVGPRs:
- kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
- break;
- default:
- return AMD_COMGR_STATUS_ERROR;
- }
- return AMD_COMGR_STATUS_SUCCESS;
-}
-#endif
-
+#endif // defined(USE_COMGR_LIBRARY)
} // namespace device
diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp
index d43d8daedc..f40ae71adb 100644
--- a/projects/clr/rocclr/runtime/device/devprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/devprogram.cpp
@@ -706,8 +706,6 @@ bool Program::compileImplLC(const std::string& sourceCode,
driverOptions.append(options->llvmOptions);
driverOptions.append(ProcessOptions(options));
- // Force object code v2.
- driverOptions.append(" -mno-code-object-v3");
// Set whole program mode
driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink");
@@ -1532,8 +1530,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
codegenOptions.append(" -mno-sram-ecc");
}
- // Force object code v2.
- codegenOptions.append(" -mno-code-object-v3");
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
@@ -1764,8 +1760,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
std::ostream_iterator(ostrstr, " "));
codegenOptions.append(" ").append(ostrstr.str());
- // Force object code v2.
- codegenOptions.append(" -mno-code-object-v3");
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
@@ -2880,8 +2874,19 @@ bool Program::createKernelMetadataMap() {
status = amd::Comgr::metadata_lookup(*metadata_, "Kernels", &kernelsMD);
if (status == AMD_COMGR_STATUS_SUCCESS) {
+ LogInfo("Using Code Object V2.");
hasKernelMD = true;
- status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
+ codeObjectVer_ = 2;
+ }
+ else {
+ status = amd::Comgr::metadata_lookup(*metadata_, "amdhsa.kernels", &kernelsMD);
+
+ if (status == AMD_COMGR_STATUS_SUCCESS) {
+ LogInfo("Using Code Object V3.");
+ hasKernelMD = true;
+ codeObjectVer_ = 3;
+ status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
+ }
}
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
@@ -2896,7 +2901,9 @@ bool Program::createKernelMetadataMap() {
if (status == AMD_COMGR_STATUS_SUCCESS) {
hasKernelNode = true;
- status = amd::Comgr::metadata_lookup(kernelNode, "Name", &nameMeta);
+ status = amd::Comgr::metadata_lookup(kernelNode,
+ (codeObjectVer() == 2) ? "Name" : ".name",
+ &nameMeta);
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
@@ -2970,9 +2977,10 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) {
buildLog_ += "Error: object code with old metadata is not supported\n";
return false;
}
- else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
- note->n_namesz == sizeof "AMD" &&
- !memcmp(name, "AMD", note->n_namesz)) {
+ else if ((note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
+ note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) ||
+ (note->n_type == 32 /* NT_AMD_AMDGPU_HSA_METADATA V3 */ &&
+ note->n_namesz == sizeof "AMDGPU" && !memcmp(name, "AMDGPU", note->n_namesz))) {
#if defined(USE_COMGR_LIBRARY)
amd_comgr_status_t status;
amd_comgr_data_t binaryData;
diff --git a/projects/clr/rocclr/runtime/device/devprogram.hpp b/projects/clr/rocclr/runtime/device/devprogram.hpp
index 8de0653a64..b35cb0771e 100644
--- a/projects/clr/rocclr/runtime/device/devprogram.hpp
+++ b/projects/clr/rocclr/runtime/device/devprogram.hpp
@@ -118,6 +118,7 @@ class Program : public amd::HeapObject {
#if defined(USE_COMGR_LIBRARY)
amd_comgr_metadata_node_t* metadata_; //!< COMgr metadata
+ uint32_t codeObjectVer_; //!< version of code object
std::map kernelMetadataMap_; //!< Map of kernel metadata
#else
CodeObjectMD* metadata_; //!< Runtime metadata
@@ -211,6 +212,8 @@ class Program : public amd::HeapObject {
auto it = kernelMetadataMap_.find(name);
return (it == kernelMetadataMap_.end()) ? nullptr : &(it->second);
}
+
+ const uint32_t codeObjectVer() const { return codeObjectVer_; }
#else
const CodeObjectMD* metadata() const { return metadata_; }
#endif
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index 8eb574eed6..5f861f2a6a 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -515,10 +515,9 @@ clk_value_type_t KernelArg::type() const {
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
const NullProgram& nullprog)
- : device::Kernel(gpuNullDev, name),
+ : device::Kernel(gpuNullDev, name, nullprog),
buildError_(CL_BUILD_PROGRAM_FAILURE),
gpuDev_(gpuNullDev),
- prog_(nullprog),
calRef_(NULL),
internal_(false),
flags_(0),
@@ -3027,9 +3026,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
uint extraArgsNum)
- : device::Kernel(prog->dev(), name),
+ : device::Kernel(prog->dev(), name, *prog),
compileOptions_(compileOptions),
- prog_(*prog),
index_(0),
code_(NULL),
codeSize_(0),
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index a60ada3dad..98d660c470 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -406,7 +406,7 @@ class NullKernel : public device::Kernel {
const NullDevice& nullDev() const { return gpuDev_; }
//! Returns GPU device object, associated with this kernel
- const NullProgram& nullProg() const { return prog_; }
+ const NullProgram& nullProg() const { return reinterpret_cast(prog_); }
//! Returns the kernel's build error
const cl_int buildError() const { return buildError_; }
@@ -455,7 +455,6 @@ class NullKernel : public device::Kernel {
std::string ilSource_; //!< IL source code of this kernel
const NullDevice& gpuDev_; //!< GPU device object
- const NullProgram& prog_; //!< Reference to the parent program
CalImageReference* calRef_; //!< CAL image reference for this kernel
bool internal_; //!< Runtime internal ker
@@ -847,7 +846,6 @@ class HSAILKernel : public device::Kernel {
std::vector arguments_; //!< Vector list of HSAIL Arguments
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
- const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
gpu::Memory* code_; //!< Memory object with ISA code
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index 7bbdbfbb5b..e23389876b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -26,37 +26,17 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
namespace pal {
-bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
- if (!sym) {
- return false;
- }
- if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) {
- return false;
- }
-
- if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
- reinterpret_cast(&codeSize_))) {
- return false;
- }
-
- amd_kernel_code_t* akc = &akc_;
- // Copy codeobject of this kernel from the program CPU segment
- memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
-
- size_t akc_align = 0;
- if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
- reinterpret_cast(&akc_align))) {
- return false;
- }
-
+void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
+ const uint32_t groupSegmentSize,
+ const uint16_t numSGPRs,
+ const uint16_t numVGPRs) {
workGroupInfo_.scratchRegs_ =
- amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
- workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
- workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
- akc->workgroup_group_segment_byte_size;
- workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
+ amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
+ workGroupInfo_.privateMemSize_ = privateSegmentSize;
+ workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
+ workGroupInfo_.usedSGPRs_ = numSGPRs;
workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
+ workGroupInfo_.usedVGPRs_ = numVGPRs;
if (!prog().isNull()) {
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
@@ -72,16 +52,57 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
workGroupInfo_.availableVGPRs_ = 256;
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
}
+}
+
+bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
+ if (!sym) {
+ return false;
+ }
+ if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) {
+ return false;
+ }
+
+ // Copy code object of this kernel from the program CPU segment
+ memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
+
+ return true;
+}
+
+bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
+
+ amd_kernel_code_t* akc = &akc_;
+
+ if (!setKernelCode(sym, akc)) {
+ return false;
+ }
+
+ if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
+ reinterpret_cast(&codeSize_))) {
+ return false;
+ }
+
+ // Setup the the workgroup info
+ setWorkGroupInfo(akc->workitem_private_segment_byte_size,
+ akc->workgroup_group_segment_byte_size,
+ akc->wavefront_sgpr_count,
+ akc->workitem_vgpr_count);
+
+ workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
+ kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
+ spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
+
return true;
}
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
- : device::Kernel(prog->dev(), name),
+ : device::Kernel(prog->dev(), name, *prog),
compileOptions_(compileOptions),
- prog_(*prog),
index_(0),
code_(0),
- codeSize_(0)
+ codeSize_(0),
+ workgroupGroupSegmentByteSize_(0),
+ kernargSegmentByteSize_(0),
+ spillSegmentByteSize_(0)
{
flags_.hsa_ = true;
}
@@ -388,37 +409,54 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
-bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
-#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#if defined(USE_COMGR_LIBRARY)
+bool LightningKernel::init() {
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
- aqlCreateHWInfo(symbol);
-
-#if defined(USE_COMGR_LIBRARY)
const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
if (kernelMetaNode == nullptr) {
return false;
}
KernelMD kernelMD;
- if (!GetAttrCodePropMetadata(*kernelMetaNode, argsBufferSize(), &kernelMD)) {
+ if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
+ symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
+
+ workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
+ spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
+ sizeof(uint32_t));
+ kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
+
+ // Copy codeobject of this kernel from the program CPU segment
+ hsa_agent_t agent;
+ agent.handle = 1;
+
+ auto sym = prog().GetSymbol(symbolName().c_str(), const_cast(&agent));
+
+ if (!setKernelCode(sym, &akc_)) {
+ return false;
+ }
+
+ codeSize_ = prog().codeSegGpu().owner()->getSize();
+
+ // handle device enqueue
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
hsa_agent_t agent;
agent.handle = 1;
amd::hsa::loader::Symbol* rth_symbol;
// Get the runtime handle symbol GPU address
- rth_symbol = prog_.GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()),
+ rth_symbol = prog().GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()),
const_cast(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
// Copy the kernel_object pointer to the runtime handle symbol GPU address
- const Memory& codeSegGpu = prog_.codeSegGpu();
+ const Memory& codeSegGpu = prog().codeSegGpu();
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
uint64_t kernel_object = gpuAqlCode();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
@@ -432,6 +470,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
+ // Setup the the workgroup info
+ setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
+ kernelMD.mCodeProps.mGroupSegmentFixedSize,
+ kernelMD.mCodeProps.mNumSGPRs,
+ kernelMD.mCodeProps.mNumVGPRs);
+
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
@@ -452,7 +496,18 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
if (!printfStr.empty()) {
InitPrintf(printfStr);
}
-#else
+
+ return true;
+}
+#endif // defined(USE_COMGR_LIBRARY)
+
+bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
+#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+ flags_.internalKernel_ =
+ (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
+
+ aqlCreateHWInfo(symbol);
+
const CodeObjectMD* programMD = prog().metadata();
assert(programMD != nullptr);
@@ -489,13 +544,13 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
amd::hsa::loader::Symbol* rth_symbol;
// Get the runtime handle symbol GPU address
- rth_symbol = prog_.GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()),
+ rth_symbol = prog().GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()),
const_cast(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
// Copy the kernel_object pointer to the runtime handle symbol GPU address
- const Memory& codeSegGpu = prog_.codeSegGpu();
+ const Memory& codeSegGpu = prog().codeSegGpu();
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
@@ -529,8 +584,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
waveLimiter_.enable();
*/
-#endif // defined(USE_COMGR_LIBRARY)
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
return true;
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index e5041f3f92..5a1abe07d4 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -59,7 +59,7 @@ class HSAILKernel : public device::Kernel {
const HSAILProgram& prog() const;
//! Returns LDS size used in this kernel
- uint32_t ldsSize() const { return akc_.workgroup_group_segment_byte_size; }
+ uint32_t ldsSize() const { return workgroupGroupSegmentByteSize_; }
//! Returns pointer on CPU to AQL code info
const amd_kernel_code_t* cpuAqlCode() const { return &akc_; }
@@ -71,10 +71,10 @@ class HSAILKernel : public device::Kernel {
size_t aqlCodeSize() const { return codeSize_; }
//! Returns the size of argument buffer
- size_t argsBufferSize() const { return akc_.kernarg_segment_byte_size; }
+ size_t argsBufferSize() const { return kernargSegmentByteSize_; }
//! Returns spill reg size per workitem
- uint32_t spillSegSize() const { return amd::alignUp(akc_.workitem_private_segment_byte_size, sizeof(uint32_t)); }
+ uint32_t spillSegSize() const { return spillSegmentByteSize_; }
//! Returns AQL packet in CPU memory
//! if the kernel arguments were successfully loaded, otherwise NULL
@@ -102,13 +102,25 @@ class HSAILKernel : public device::Kernel {
//! Creates AQL kernel HW info
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
+ //! Get the kernel code and copy the code object from the program CPU segment
+ bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
+
+ //! Set up the workgroup info based on the kernel metadata
+ void setWorkGroupInfo(const uint32_t privateSegmentSize,
+ const uint32_t groupSegmentSize,
+ const uint16_t numSGPRs,
+ const uint16_t numVGPRs);
+
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
- const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
+
+ uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
+ uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
+ uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
};
class LightningKernel : public HSAILKernel {
@@ -121,6 +133,11 @@ class LightningKernel : public HSAILKernel {
//! Initializes the metadata required for this kernel,
bool init(amd::hsa::loader::Symbol* symbol);
+
+#if defined(USE_COMGR_LIBRARY)
+ //! Initializes the metadata required for this kernel,
+ bool init();
+#endif
};
/*@}*/} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index 1098f8864d..ea2762c0ff 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -747,6 +747,25 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
return false;
}
+#if defined(USE_COMGR_LIBRARY)
+ for (const auto &kernelMeta : kernelMetadataMap_) {
+ auto kernelName = kernelMeta.first;
+ auto kernel = new LightningKernel(kernelName, this,
+ options->origOptionStr + ProcessOptions(options));
+ kernels()[kernelName] = kernel;
+
+ if (!kernel->init()) {
+ return false;
+ }
+
+ kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+
+ // Find max scratch regs used in the program. It's used for scratch buffer preallocation
+ // with dynamic parallelism, since runtime doesn't know which child kernel will be called
+ maxScratchRegs_ =
+ std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
+ }
+#else
// Get the list of kernels
std::vector kernelNameList;
status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList);
@@ -781,7 +800,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
maxScratchRegs_ =
std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
-
+#endif // defined(USE_COMGR_LIBRARY)
DestroySegmentCpuAccess();
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
index 65d14d2411..7118a6f3c7 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
@@ -166,7 +166,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
}
// Update GPU specific settings and info structure if we have any
+#if defined(_WIN32)
+ ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
+#else
ModifyMaxWorkload modifyMaxWorkload = {0};
+#endif
// APU systems
if (palProp.gpuType == Pal::GpuType::Integrated) {
@@ -326,18 +330,22 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
return false;
}
-#if defined(_WIN32)
- if (modifyMaxWorkload.time > 0) {
- OSVERSIONINFOEX versionInfo = {0};
- versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
- versionInfo.dwMajorVersion = 6;
- versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
+ splitSizeForWin7_ = false;
- DWORDLONG conditionMask = 0;
- VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
- VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
- if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
- maxWorkloadTime_ = modifyMaxWorkload.time;
+#if defined(_WIN32)
+ OSVERSIONINFOEX versionInfo = {0};
+ versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+ versionInfo.dwMajorVersion = 6;
+ versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
+
+ DWORDLONG conditionMask = 0;
+ VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
+ VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
+
+ if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
+ splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
+ if (modifyMaxWorkload.time > 0) {
+ maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
}
}
#endif // defined(_WIN32)
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
index 63f61ae99a..b6e1d95441 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
@@ -60,7 +60,8 @@ class Settings : public device::Settings {
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
uint rgpSqttForceDisable_: 1; //!< Disables SQTT
- uint reserved_ : 12;
+ uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
+ uint reserved_ : 11;
};
uint value_;
};
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 359331cc85..13d107baa2 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -553,6 +553,11 @@ void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
uint instructions) {
+ if (!dev.settings().splitSizeForWin7_) {
+ dispatchSplitSize_ = 0;
+ return;
+ }
+
uint64_t workload = threads * instructions;
if (maxDispatchWorkload_ < workload) {
dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions);
@@ -2258,6 +2263,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
+ dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
+ dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
// Run AQL dispatch in HW
eventBegin(MainEngine);
iCmd()->CmdDispatchAql(dispatchParam);
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index c05197fe5d..8a28accfe4 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -25,41 +25,58 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
- : device::Kernel(prog->dev(), name),
- program_(prog),
+ : device::Kernel(prog->dev(), name, *prog),
kernelCodeHandle_(kernelCodeHandle),
workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize),
kernargSegmentByteSize_(kernargSegmentByteSize),
kernargSegmentAlignment_(kernargSegmentAlignment) {}
+Kernel::Kernel(std::string name, Program* prog)
+ : device::Kernel(prog->dev(), name, *prog),
+ kernelCodeHandle_(0),
+ workgroupGroupSegmentByteSize_(0),
+ workitemPrivateSegmentByteSize_(0),
+ kernargSegmentByteSize_(0),
+ kernargSegmentAlignment_(0) {}
+
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
- hsa_agent_t hsaDevice = program_->hsaDevice();
+ hsa_agent_t hsaDevice = program()->hsaDevice();
const amd_comgr_metadata_node_t* kernelMetaNode =
- static_cast(program_)->getKernelMetadata(name());
+ static_cast(program())->getKernelMetadata(name());
if (kernelMetaNode == nullptr) {
return false;
}
KernelMD kernelMD;
- if (!GetAttrCodePropMetadata(*kernelMetaNode, KernargSegmentByteSize(), &kernelMD)) {
+ if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
+ // Set the kernel symbol name and size/alignment based on the kernel metadata
+ // NOTE: kernel name is used to get the kernel code handle in V2,
+ // but kernel symbol name is used in V3
+ symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
+ workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
+ workitemPrivateSegmentByteSize_ = kernelMD.mCodeProps.mPrivateSegmentFixedSize;
+ kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
+ kernargSegmentAlignment_ = amd::alignUp(std::max(kernelMD.mCodeProps.mKernargSegmentAlign, 128u),
+ dev().info().globalMemCacheLineSize_);
+
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
// Get the available SGPRs and VGPRs
- std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget();
- if (program_->xnackEnable()) {
+ std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program()->machineTarget();
+ if (program()->xnackEnable()) {
targetIdent.append("+xnack");
}
- if (program_->sramEccEnable()) {
+ if (program()->sramEccEnable()) {
targetIdent.append("+sram-ecc");
}
@@ -67,10 +84,23 @@ bool LightningKernel::init() {
return false;
}
+ // Get the kernel code handle
+ hsa_status_t hsaStatus;
+ hsa_executable_symbol_t symbol;
+ hsa_agent_t agent = program()->hsaDevice();
+ hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
+ symbolName().c_str(),
+ &agent, &symbol);
+ if (hsaStatus == HSA_STATUS_SUCCESS) {
+ hsaStatus = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+ &kernelCodeHandle_);
+ }
+ if (hsaStatus != HSA_STATUS_SUCCESS) {
+ return false;
+ }
+
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
- hsa_agent_t agent = program_->hsaDevice();
hsa_executable_symbol_t kernelSymbol;
- hsa_status_t hsaStatus;
int variable_size;
uint64_t variable_address;
@@ -79,7 +109,7 @@ bool LightningKernel::init() {
// object handle of such a kernel. The address of the variable and the kernel code object handle are known
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
// address of the variable.
- hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(),
+ hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
kernelMD.mAttrs.mRuntimeHandle.c_str(),
&agent, &kernelSymbol);
if (hsaStatus == HSA_STATUS_SUCCESS) {
@@ -87,11 +117,6 @@ bool LightningKernel::init() {
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
&variable_size);
}
- if (hsaStatus == HSA_STATUS_SUCCESS) {
- hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
- HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
- &variable_size);
- }
if (hsaStatus == HSA_STATUS_SUCCESS) {
hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
@@ -114,7 +139,7 @@ bool LightningKernel::init() {
}
uint32_t wavefront_size = 0;
- if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
+ if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
HSA_STATUS_SUCCESS) {
return false;
}
@@ -127,7 +152,7 @@ bool LightningKernel::init() {
workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs;
workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs;
workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+ workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
if (workGroupInfo_.size_ == 0) {
@@ -135,7 +160,7 @@ bool LightningKernel::init() {
}
// handle the printf metadata if any
- const amd_comgr_metadata_node_t* programMD = static_cast(program_)->metadata();
+ const amd_comgr_metadata_node_t* programMD = static_cast(program())->metadata();
assert(programMD != nullptr);
std::vector printfStr;
@@ -159,10 +184,10 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
bool LightningKernel::init() {
- hsa_agent_t hsaDevice = program_->hsaDevice();
+ hsa_agent_t hsaDevice = program()->hsaDevice();
// Pull out metadata from the ELF
- const CodeObjectMD* programMD = static_cast(program_)->metadata();
+ const CodeObjectMD* programMD = static_cast(program())->metadata();
assert(programMD != nullptr);
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
@@ -172,7 +197,7 @@ bool LightningKernel::init() {
InitParameters(*kernelMD, KernargSegmentByteSize());
// Set the workgroup information for the kernel
- workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
+ workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
workGroupInfo_.availableSGPRs_ = 104;
workGroupInfo_.availableVGPRs_ = 256;
@@ -196,7 +221,7 @@ bool LightningKernel::init() {
}
if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
- hsa_agent_t agent = program_->hsaDevice();
+ hsa_agent_t agent = program()->hsaDevice();
hsa_executable_symbol_t kernelSymbol;
hsa_status_t status;
int variable_size;
@@ -208,7 +233,7 @@ bool LightningKernel::init() {
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
// address of the variable.
- status = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
+ status = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
&agent, &kernelSymbol);
if (status != HSA_STATUS_SUCCESS) {
return false;
@@ -239,7 +264,7 @@ bool LightningKernel::init() {
}
uint32_t wavefront_size = 0;
- if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
+ if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
HSA_STATUS_SUCCESS) {
return false;
}
@@ -258,7 +283,7 @@ bool LightningKernel::init() {
workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+ workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
@@ -278,18 +303,18 @@ bool LightningKernel::init() {
bool HSAILKernel::init() {
acl_error errorCode;
// compile kernel down to ISA
- hsa_agent_t hsaDevice = program_->hsaDevice();
+ hsa_agent_t hsaDevice = program()->hsaDevice();
// Pull out metadata from the ELF
size_t sizeOfArgList;
- aclCompiler* compileHandle = program_->dev().compiler();
+ aclCompiler* compileHandle = program()->dev().compiler();
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfArgList);
if (errorCode != ACL_SUCCESS) {
return false;
}
std::unique_ptr argList(new char[sizeOfArgList]);
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), argList.get(), &sizeOfArgList);
if (errorCode != ACL_SUCCESS) {
return false;
@@ -300,17 +325,17 @@ bool HSAILKernel::init() {
// Set the workgroup information for the kernel
memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
- workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
+ workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
workGroupInfo_.availableSGPRs_ = 104;
workGroupInfo_.availableVGPRs_ = 256;
size_t sizeOfWorkGroupSize;
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
if (errorCode != ACL_SUCCESS) {
return false;
}
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), workGroupInfo_.compileSize_,
&sizeOfWorkGroupSize);
if (errorCode != ACL_SUCCESS) {
@@ -319,7 +344,7 @@ bool HSAILKernel::init() {
uint32_t wavefront_size = 0;
if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
+ hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
return false;
}
assert(wavefront_size > 0);
@@ -344,18 +369,18 @@ bool HSAILKernel::init() {
}
workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+ workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
} else {
- workGroupInfo_.size_ = program_->dev().info().preferredWorkGroupSize_;
+ workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
if (errorCode != ACL_SUCCESS) {
return false;
@@ -367,7 +392,7 @@ bool HSAILKernel::init() {
if (!aclPrintfList) {
return false;
}
- errorCode = aclQueryInfo(compileHandle, program_->binaryElf(),
+ errorCode = aclQueryInfo(compileHandle, program()->binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(),
aclPrintfList.get(), &sizeOfPrintfList);
if (errorCode != ACL_SUCCESS) {
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
index 8bf6ed674c..46fc1c2339 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
@@ -22,6 +22,8 @@ class Kernel : public device::Kernel {
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment);
+ Kernel(std::string name, Program* prog);
+
const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; }
const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; }
@@ -37,15 +39,15 @@ class Kernel : public device::Kernel {
//! Initializes the metadata required for this kernel
virtual bool init() = 0;
- const Program* program() const { return static_cast(program_); }
+ const Program* program() const { return static_cast(&prog_); }
protected:
- Program* program_; //!< The roc::Program context
+// Program* program_; //!< The roc::Program context
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
- const uint32_t workgroupGroupSegmentByteSize_;
- const uint32_t workitemPrivateSegmentByteSize_;
- const uint32_t kernargSegmentByteSize_;
- const uint32_t kernargSegmentAlignment_;
+ uint32_t workgroupGroupSegmentByteSize_;
+ uint32_t workitemPrivateSegmentByteSize_;
+ uint32_t kernargSegmentByteSize_;
+ uint32_t kernargSegmentAlignment_;
size_t kernelDirectiveOffset_;
};
@@ -74,6 +76,10 @@ class LightningKernel : public roc::Kernel {
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
}
+
+ LightningKernel(std::string name, Program* prog)
+ : roc::Kernel(name, prog) {}
+
//! Initializes the metadata required for this kernel
virtual bool init() final;
};
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
index 444377aeb2..191e58c0e7 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
@@ -488,6 +488,19 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
return false;
}
+#if defined(USE_COMGR_LIBRARY)
+ for (const auto &kernelMeta : kernelMetadataMap_) {
+ const std::string kernelName = kernelMeta.first;
+ Kernel* aKernel = new roc::LightningKernel(kernelName, this);
+ if (!aKernel->init()) {
+ return false;
+ }
+ aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+ aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") !=
+ std::string::npos);
+ kernels()[kernelName] = aKernel;
+ }
+#else
// Get the list of kernels
std::vector kernelNameList;
status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback,
@@ -582,6 +595,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
std::string::npos);
kernels()[kernelName] = aKernel;
}
+#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}