P4 to Git Change 1767752 by wchau@wchau_OCL_Linux on 2019/04/09 22:58:03

SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata
	- Add support for the V3 code objects

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#336 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#134 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/inc/core/palCmdBuffer.h#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx6/gfx6ComputeCmdBuffer.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx9/gfx9ComputeCmdBuffer.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#77 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#130 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#103 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#47 edit


[ROCm/clr commit: 36a5f2a85f]
이 커밋은 다음에 포함됨:
foreman
2019-04-09 23:24:10 -04:00
부모 293408dca8
커밋 d1f62b92c1
15개의 변경된 파일951개의 추가작업 그리고 417개의 파일을 삭제
+539 -29
파일 보기
@@ -29,10 +29,487 @@ using llvm::AMDGPU::HSAMD::ValueType;
namespace device {
#if defined(USE_COMGR_LIBRARY)
static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
std::string buf;
// get the key of the argument field
size_t size = 0;
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itArgField = ArgFieldMap.find(buf);
if (itArgField == ArgFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
// get the value of the argument field
status = getMetaBuf(value, &buf);
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
switch (itArgField->second) {
case ArgField::Name:
lcArg->mName = buf;
break;
case ArgField::TypeName:
lcArg->mTypeName = buf;
break;
case ArgField::Size:
lcArg->mSize = atoi(buf.c_str());
break;
case ArgField::Align:
lcArg->mAlign = atoi(buf.c_str());
break;
case ArgField::ValueKind:
{
auto itValueKind = ArgValueKind.find(buf);
if (itValueKind == ArgValueKind.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueKind = itValueKind->second;
}
break;
case ArgField::ValueType:
{
auto itValueType = ArgValueType.find(buf);
if (itValueType == ArgValueType.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueType = itValueType->second;
}
break;
case ArgField::PointeeAlign:
lcArg->mPointeeAlign = atoi(buf.c_str());
break;
case ArgField::AddrSpaceQual:
{
auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
}
break;
case ArgField::AccQual:
{
auto itAccQual = ArgAccQual.find(buf);
if (itAccQual == ArgAccQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAccQual = itAccQual->second;
}
break;
case ArgField::ActualAccQual:
{
auto itAccQual = ArgAccQual.find(buf);
if (itAccQual == ArgAccQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mActualAccQual = itAccQual->second;
}
break;
case ArgField::IsConst:
lcArg->mIsConst = (buf.compare("true") == 0);
break;
case ArgField::IsRestrict:
lcArg->mIsRestrict = (buf.compare("true") == 0);
break;
case ArgField::IsVolatile:
lcArg->mIsVolatile = (buf.compare("true") == 0);
break;
case ArgField::IsPipe:
lcArg->mIsPipe = (buf.compare("true") == 0);
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return AMD_COMGR_STATUS_SUCCESS;
}
static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
size_t size = 0;
std::string buf;
// get the key of the argument field
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itAttrField = AttrFieldMap.find(buf);
if (itAttrField == AttrFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
KernelMD* kernelMD = static_cast<KernelMD*>(data);
switch (itAttrField->second) {
case AttrField::ReqdWorkGroupSize:
{
status = amd::Comgr::get_metadata_list_size(value, &size);
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSize;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSize);
}
}
}
break;
case AttrField::WorkGroupSizeHint:
{
status = amd::Comgr::get_metadata_list_size(value, &size);
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSizeHint;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSizeHint);
}
}
}
break;
case AttrField::VecTypeHint:
{
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mVecTypeHint = buf;
}
}
break;
case AttrField::RuntimeHandle:
{
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mRuntimeHandle = buf;
}
}
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return status;
}
static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
std::string buf;
// get the key of the argument field
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itCodePropField = CodePropFieldMap.find(buf);
if (itCodePropField == CodePropFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
// get the value of the argument field
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(value, &buf);
}
KernelMD* kernelMD = static_cast<KernelMD*>(data);
switch (itCodePropField->second) {
case CodePropField::KernargSegmentSize:
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
break;
case CodePropField::GroupSegmentFixedSize:
kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
break;
case CodePropField::PrivateSegmentFixedSize:
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
break;
case CodePropField::KernargSegmentAlign:
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
break;
case CodePropField::WavefrontSize:
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
break;
case CodePropField::NumSGPRs:
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
break;
case CodePropField::NumVGPRs:
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
break;
case CodePropField::MaxFlatWorkGroupSize:
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
break;
case CodePropField::IsDynamicCallStack:
kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
break;
case CodePropField::IsXNACKEnabled:
kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
break;
case CodePropField::NumSpilledSGPRs:
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
break;
case CodePropField::NumSpilledVGPRs:
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return AMD_COMGR_STATUS_SUCCESS;
}
static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
std::string buf;
// get the key of the argument field
size_t size = 0;
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itArgField = ArgFieldMapV3.find(buf);
if (itArgField == ArgFieldMapV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
// get the value of the argument field
status = getMetaBuf(value, &buf);
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
switch (itArgField->second) {
case ArgField::Name:
lcArg->mName = buf;
break;
case ArgField::TypeName:
lcArg->mTypeName = buf;
break;
case ArgField::Size:
lcArg->mSize = atoi(buf.c_str());
break;
case ArgField::Offset:
lcArg->mOffset = atoi(buf.c_str());
break;
case ArgField::ValueKind:
{
auto itValueKind = ArgValueKindV3.find(buf);
if (itValueKind == ArgValueKindV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueKind = itValueKind->second;
}
break;
case ArgField::ValueType:
{
auto itValueType = ArgValueTypeV3.find(buf);
if (itValueType == ArgValueTypeV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueType = itValueType->second;
}
break;
case ArgField::PointeeAlign:
lcArg->mPointeeAlign = atoi(buf.c_str());
break;
case ArgField::AddrSpaceQual:
{
auto itAddrSpaceQual = ArgAddrSpaceQualV3.find(buf);
if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
}
break;
case ArgField::AccQual:
{
auto itAccQual = ArgAccQualV3.find(buf);
if (itAccQual == ArgAccQualV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAccQual = itAccQual->second;
}
break;
case ArgField::ActualAccQual:
{
auto itAccQual = ArgAccQualV3.find(buf);
if (itAccQual == ArgAccQualV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mActualAccQual = itAccQual->second;
}
break;
case ArgField::IsConst:
lcArg->mIsConst = (buf.compare("true") == 0);
break;
case ArgField::IsRestrict:
lcArg->mIsRestrict = (buf.compare("true") == 0);
break;
case ArgField::IsVolatile:
lcArg->mIsVolatile = (buf.compare("true") == 0);
break;
case ArgField::IsPipe:
lcArg->mIsPipe = (buf.compare("true") == 0);
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return AMD_COMGR_STATUS_SUCCESS;
}
static amd_comgr_status_t populateKernelMetaV3(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
size_t size = 0;
std::string buf;
// get the key of the argument field
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itKernelField = KernelFieldMapV3.find(buf);
if (itKernelField == KernelFieldMapV3.end()) {
return AMD_COMGR_STATUS_ERROR;
}
if (itKernelField->second != KernelField::ReqdWorkGroupSize &&
itKernelField->second != KernelField::WorkGroupSizeHint) {
status = getMetaBuf(value,&buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
KernelMD* kernelMD = static_cast<KernelMD*>(data);
switch (itKernelField->second) {
case KernelField::ReqdWorkGroupSize:
status = amd::Comgr::get_metadata_list_size(value, &size);
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSize;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSize);
}
}
break;
case KernelField::WorkGroupSizeHint:
status = amd::Comgr::get_metadata_list_size(value, &size);
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSizeHint;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSizeHint);
}
}
break;
case KernelField::VecTypeHint:
kernelMD->mAttrs.mVecTypeHint = buf;
break;
case KernelField::DeviceEnqueueSymbol:
kernelMD->mAttrs.mRuntimeHandle = buf;
break;
case KernelField::KernargSegmentSize:
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
break;
case KernelField::GroupSegmentFixedSize:
kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
break;
case KernelField::PrivateSegmentFixedSize:
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
break;
case KernelField::KernargSegmentAlign:
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
break;
case KernelField::WavefrontSize:
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
break;
case KernelField::NumSGPRs:
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
break;
case KernelField::NumVGPRs:
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
break;
case KernelField::MaxFlatWorkGroupSize:
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
break;
case KernelField::NumSpilledSGPRs:
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
break;
case KernelField::NumSpilledVGPRs:
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
break;
case KernelField::SymbolName:
kernelMD->mSymbolName = buf;
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return status;
}
#endif
// ================================================================================================
Kernel::Kernel(const amd::Device& dev, const std::string& name)
Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& prog)
: dev_(dev)
, name_(name)
, prog_(prog)
, signature_(nullptr)
, waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
@@ -482,6 +959,8 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; }
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
#endif
@@ -771,11 +1250,10 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
// ================================================================================================
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
const uint32_t kernargSegmentByteSize,
KernelMD* kernelMD) {
bool Kernel::GetAttrCodePropMetadata( const amd_comgr_metadata_node_t kernelMetaNode,
KernelMD* kernelMD) {
InitParameters(kernelMetaNode, kernargSegmentByteSize);
InitParameters(kernelMetaNode);
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
@@ -784,25 +1262,49 @@ bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaN
workGroupInfo_.availableVGPRs_ = 256;
// extract the attribute metadata if there is any
amd_comgr_metadata_node_t attrMeta;
amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS;
if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::iterate_map_metadata(attrMeta, device::populateAttrs,
static_cast<void*>(kernelMD));
amd::Comgr::destroy_metadata(attrMeta);
switch (codeObjectVer()) {
case 2: {
amd_comgr_metadata_node_t symbolName;
status = amd::Comgr::metadata_lookup(kernelMetaNode, "SymbolName", &symbolName);
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(symbolName, &(kernelMD->mSymbolName));
amd::Comgr::destroy_metadata(symbolName);
}
amd_comgr_metadata_node_t attrMeta;
if (status == AMD_COMGR_STATUS_SUCCESS) {
if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) ==
AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::iterate_map_metadata(attrMeta, populateAttrs,
static_cast<void*>(kernelMD));
amd::Comgr::destroy_metadata(attrMeta);
}
}
// extract the code properties metadata
amd_comgr_metadata_node_t codePropsMeta;
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::iterate_map_metadata(codePropsMeta, populateCodeProps,
static_cast<void*>(kernelMD));
amd::Comgr::destroy_metadata(codePropsMeta);
}
}
break;
case 3: {
status = amd::Comgr::iterate_map_metadata(kernelMetaNode, populateKernelMetaV3,
static_cast<void*>(kernelMD));
}
break;
default:
return false;
}
// extract the code properties metadata
amd_comgr_metadata_node_t codePropsMeta;
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::iterate_map_metadata(codePropsMeta, device::populateCodeProps,
static_cast<void*>(kernelMD));
amd::Comgr::destroy_metadata(codePropsMeta);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return false;
@@ -917,19 +1419,21 @@ bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD,
return (status == AMD_COMGR_STATUS_SUCCESS);
}
void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) {
void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) {
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
size_t offsetStruct = argBufferSize;
amd_comgr_metadata_node_t argsMeta;
bool hsaArgsMeta = false;
size_t argsSize;
amd_comgr_status_t status = amd::Comgr::metadata_lookup(kernelMD, "Args", &argsMeta);
amd_comgr_status_t status = amd::Comgr::metadata_lookup(
kernelMD,
(codeObjectVer() == 2) ? "Args" : ".args",
&argsMeta);
if (status == AMD_COMGR_STATUS_SUCCESS) {
hsaArgsMeta = true;
status = amd::Comgr::get_metadata_list_size(argsMeta, &argsSize);
@@ -940,7 +1444,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
}
for (size_t i = 0; i < argsSize; ++i) {
KernelArgMD lcArg;
KernelArgMD lcArg;
amd_comgr_metadata_node_t argsNode;
amd_comgr_metadata_kind_t kind;
@@ -956,7 +1460,13 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
status = AMD_COMGR_STATUS_ERROR;
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, static_cast<void*>(&lcArg));
void *data = static_cast<void*>(&lcArg);
if (codeObjectVer() == 2) {
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data);
}
else if (codeObjectVer() == 3) {
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgsV3, data);
}
}
if (hsaArgsNode) {
@@ -971,7 +1481,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
}
size_t size = GetArgSizeOCL(lcArg);
size_t alignment = GetArgAlignmentOCL(lcArg);
size_t alignment = (codeObjectVer() == 2) ? GetArgAlignmentOCL(lcArg) : 0;
bool isHidden = false;
desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
@@ -980,7 +1490,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
setDynamicParallelFlag(true);
}
offset = amd::alignUp(offset, alignment);
offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
desc.offset_ = offset;
desc.size_ = size;
offset += size;
@@ -1007,7 +1517,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
offset += sizeof(uint64_t);
}
else {
offset = amd::alignUp(offset, alignment);
offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
desc.offset_ = offset;
offset += size;
}
+130 -264
파일 보기
@@ -34,6 +34,7 @@ using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
using llvm::AMDGPU::HSAMD::ValueKind;
using llvm::AMDGPU::HSAMD::ValueType;
// for Code Object V3
enum class ArgField : uint8_t {
Name = 0,
TypeName = 1,
@@ -48,7 +49,8 @@ enum class ArgField : uint8_t {
IsConst = 10,
IsRestrict = 11,
IsVolatile = 12,
IsPipe = 13
IsPipe = 13,
Offset = 14
};
enum class AttrField : uint8_t {
@@ -167,6 +169,116 @@ static const std::map<std::string,CodePropField> CodePropFieldMap =
{"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs},
{"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs}
};
// for Code Object V3
enum class KernelField : uint8_t {
SymbolName = 0,
ReqdWorkGroupSize = 1,
WorkGroupSizeHint = 2,
VecTypeHint = 3,
DeviceEnqueueSymbol = 4,
KernargSegmentSize = 5,
GroupSegmentFixedSize = 6,
PrivateSegmentFixedSize = 7,
KernargSegmentAlign = 8,
WavefrontSize = 9,
NumSGPRs = 10,
NumVGPRs = 11,
MaxFlatWorkGroupSize = 12,
NumSpilledSGPRs = 13,
NumSpilledVGPRs = 14
};
static const std::map<std::string,ArgField> ArgFieldMapV3 =
{
{".name", ArgField::Name},
{".type_name", ArgField::TypeName},
{".size", ArgField::Size},
{".offset", ArgField::Offset},
{".value_kind", ArgField::ValueKind},
{".vaule_type", ArgField::ValueType},
{".pointee_align", ArgField::PointeeAlign},
{".address_space", ArgField::AddrSpaceQual},
{".access", ArgField::AccQual},
{".actual_access", ArgField::ActualAccQual},
{".is_const", ArgField::IsConst},
{".is_restrict", ArgField::IsRestrict},
{".is_volatile", ArgField::IsVolatile},
{".is_pipe", ArgField::IsPipe}
};
static const std::map<std::string,ValueKind> ArgValueKindV3 =
{
{"by_value", ValueKind::ByValue},
{"global_buffer", ValueKind::GlobalBuffer},
{"dynamic_shared_pointer", ValueKind::DynamicSharedPointer},
{"sampler", ValueKind::Sampler},
{"image", ValueKind::Image},
{"pipe", ValueKind::Pipe},
{"queue", ValueKind::Queue},
{"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX},
{"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY},
{"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ},
{"hidden_none", ValueKind::HiddenNone},
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
{"hidden_completion_action", ValueKind::HiddenCompletionAction}
};
static const std::map<std::string,ValueType> ArgValueTypeV3 =
{
{"struct", ValueType::Struct},
{"i8", ValueType::I8},
{"u8", ValueType::U8},
{"i16", ValueType::I16},
{"u16", ValueType::U16},
{"f16", ValueType::F16},
{"i32", ValueType::I32},
{"u32", ValueType::U32},
{"f32", ValueType::F32},
{"i64", ValueType::I64},
{"u64", ValueType::U64},
{"f64", ValueType::F64}
};
static const std::map<std::string,AccessQualifier> ArgAccQualV3 =
{
{"default", AccessQualifier::Default},
{"read_only", AccessQualifier::ReadOnly},
{"write_only", AccessQualifier::WriteOnly},
{"read_write", AccessQualifier::ReadWrite}
};
static const std::map<std::string,AddressSpaceQualifier> ArgAddrSpaceQualV3 =
{
{"private", AddressSpaceQualifier::Private},
{"global", AddressSpaceQualifier::Global},
{"constant", AddressSpaceQualifier::Constant},
{"local", AddressSpaceQualifier::Local},
{"generic", AddressSpaceQualifier::Generic},
{"region", AddressSpaceQualifier::Region}
};
static const std::map<std::string,KernelField> KernelFieldMapV3 =
{
{".symbol", KernelField::SymbolName},
{".reqd_workgroup_size", KernelField::ReqdWorkGroupSize},
{".workgorup_size_hint", KernelField::WorkGroupSizeHint},
{".vec_type_hint", KernelField::VecTypeHint},
{".device_enqueue_symbol", KernelField::DeviceEnqueueSymbol},
{".kernarg_segment_size", KernelField::KernargSegmentSize},
{".group_segment_fixed_size", KernelField::GroupSegmentFixedSize},
{".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize},
{".kernarg_segment_align", KernelField::KernargSegmentAlign},
{".wavefront_size", KernelField::WavefrontSize},
{".sgpr_count", KernelField::NumSGPRs},
{".vgpr_count", KernelField::NumVGPRs},
{".max_flat_workgroup_size", KernelField::MaxFlatWorkGroupSize},
{".sgpr_spill_count", KernelField::NumSpilledSGPRs},
{".vgpr_spill_count", KernelField::NumSpilledVGPRs}
};
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -234,6 +346,8 @@ struct KernelParameterDescriptor {
namespace device {
class Program;
//! Printf info structure
struct PrintfInfo {
std::string fmtString_; //!< formated string for printf
@@ -272,7 +386,7 @@ class Kernel : public amd::HeapObject {
};
//! Default constructor
Kernel(const amd::Device& dev, const std::string& name);
Kernel(const amd::Device& dev, const std::string& name, const Program& prog);
//! Default destructor
virtual ~Kernel();
@@ -372,7 +486,7 @@ class Kernel : public amd::HeapObject {
//! Initializes the abstraction layer kernel parameters
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize);
void InitParameters(const amd_comgr_metadata_node_t kernelMD);
//! Get ther kernel metadata
bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD,
@@ -381,7 +495,6 @@ class Kernel : public amd::HeapObject {
//! Retrieve kernel attribute and code properties metadata
bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
const uint32_t kernargSegmentByteSize,
KernelMD* kernelMD);
//! Retrieve the available SGPRs and VGPRs
@@ -390,6 +503,12 @@ class Kernel : public amd::HeapObject {
//! Retrieve the printf string metadata
bool GetPrintfStr(const amd_comgr_metadata_node_t programMD,
std::vector<std::string>* printfStr);
//! Returns the kernel symbol name
const std::string& symbolName() const { return symbolName_; }
//! Returns the kernel code object version
const uint32_t codeObjectVer() const { return prog().codeObjectVer(); }
#else
void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
#endif
@@ -404,8 +523,13 @@ class Kernel : public amd::HeapObject {
//! Initializes HSAIL Printf metadata and info
void InitPrintf(const aclPrintfFmt* aclPrintf);
#endif
//! Returns program associated with this kernel
const Program& prog() const { return prog_; }
const amd::Device& dev_; //!< GPU device object
std::string name_; //!< kernel name
const Program& prog_; //!< Reference to the parent program
std::string symbolName_; //!< kernel symbol name
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
amd::KernelSignature* signature_; //!< kernel signature
std::string buildLog_; //!< build log
@@ -424,6 +548,7 @@ class Kernel : public amd::HeapObject {
Flags() : value_(0) {}
} flags_;
private:
//! Disable default copy constructor
Kernel(const Kernel&);
@@ -447,264 +572,5 @@ static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta,
return status;
}
static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
std::string buf;
// get the key of the argument field
size_t size = 0;
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itArgField = ArgFieldMap.find(buf);
if (itArgField == ArgFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
// get the value of the argument field
status = getMetaBuf(value, &buf);
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
switch (itArgField->second) {
case ArgField::Name:
lcArg->mName = buf;
break;
case ArgField::TypeName:
lcArg->mTypeName = buf;
break;
case ArgField::Size:
lcArg->mSize = atoi(buf.c_str());
break;
case ArgField::Align:
lcArg->mAlign = atoi(buf.c_str());
break;
case ArgField::ValueKind:
{
auto itValueKind = ArgValueKind.find(buf);
if (itValueKind == ArgValueKind.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueKind = itValueKind->second;
}
break;
case ArgField::ValueType:
{
auto itValueType = ArgValueType.find(buf);
if (itValueType == ArgValueType.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mValueType = itValueType->second;
}
break;
case ArgField::PointeeAlign:
lcArg->mPointeeAlign = atoi(buf.c_str());
break;
case ArgField::AddrSpaceQual:
{
auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
}
break;
case ArgField::AccQual:
{
auto itAccQual = ArgAccQual.find(buf);
if (itAccQual == ArgAccQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mAccQual = itAccQual->second;
}
break;
case ArgField::ActualAccQual:
{
auto itAccQual = ArgAccQual.find(buf);
if (itAccQual == ArgAccQual.end()) {
return AMD_COMGR_STATUS_ERROR;
}
lcArg->mActualAccQual = itAccQual->second;
}
break;
case ArgField::IsConst:
lcArg->mIsConst = (buf.compare("true") == 0);
break;
case ArgField::IsRestrict:
lcArg->mIsRestrict = (buf.compare("true") == 0);
break;
case ArgField::IsVolatile:
lcArg->mIsVolatile = (buf.compare("true") == 0);
break;
case ArgField::IsPipe:
lcArg->mIsPipe = (buf.compare("true") == 0);
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return AMD_COMGR_STATUS_SUCCESS;
}
static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
size_t size = 0;
std::string buf;
// get the key of the argument field
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itAttrField = AttrFieldMap.find(buf);
if (itAttrField == AttrFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
KernelMD* kernelMD = static_cast<KernelMD*>(data);
switch (itAttrField->second) {
case AttrField::ReqdWorkGroupSize:
{
status = amd::Comgr::get_metadata_list_size(value, &size);
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSize;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSize);
}
}
}
break;
case AttrField::WorkGroupSizeHint:
{
status = amd::Comgr::get_metadata_list_size(value, &size);
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
amd_comgr_metadata_node_t workgroupSizeHint;
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
if (status == AMD_COMGR_STATUS_SUCCESS &&
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
}
amd::Comgr::destroy_metadata(workgroupSizeHint);
}
}
}
break;
case AttrField::VecTypeHint:
{
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mVecTypeHint = buf;
}
}
break;
case AttrField::RuntimeHandle:
{
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
kernelMD->mAttrs.mRuntimeHandle = buf;
}
}
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return status;
}
static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
const amd_comgr_metadata_node_t value,
void *data) {
amd_comgr_status_t status;
amd_comgr_metadata_kind_t kind;
std::string buf;
// get the key of the argument field
status = amd::Comgr::get_metadata_kind(key, &kind);
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(key, &buf);
}
if (status != AMD_COMGR_STATUS_SUCCESS) {
return AMD_COMGR_STATUS_ERROR;
}
auto itCodePropField = CodePropFieldMap.find(buf);
if (itCodePropField == CodePropFieldMap.end()) {
return AMD_COMGR_STATUS_ERROR;
}
// get the value of the argument field
if (status == AMD_COMGR_STATUS_SUCCESS) {
status = getMetaBuf(value, &buf);
}
KernelMD* kernelMD = static_cast<KernelMD*>(data);
switch (itCodePropField->second) {
case CodePropField::KernargSegmentSize:
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
break;
case CodePropField::GroupSegmentFixedSize:
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
break;
case CodePropField::PrivateSegmentFixedSize:
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
break;
case CodePropField::KernargSegmentAlign:
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
break;
case CodePropField::WavefrontSize:
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
break;
case CodePropField::NumSGPRs:
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
break;
case CodePropField::NumVGPRs:
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
break;
case CodePropField::MaxFlatWorkGroupSize:
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
break;
case CodePropField::IsDynamicCallStack:
kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
break;
case CodePropField::IsXNACKEnabled:
kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
break;
case CodePropField::NumSpilledSGPRs:
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
break;
case CodePropField::NumSpilledVGPRs:
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
break;
default:
return AMD_COMGR_STATUS_ERROR;
}
return AMD_COMGR_STATUS_SUCCESS;
}
#endif
#endif // defined(USE_COMGR_LIBRARY)
} // namespace device
+19 -11
파일 보기
@@ -706,8 +706,6 @@ bool Program::compileImplLC(const std::string& sourceCode,
driverOptions.append(options->llvmOptions);
driverOptions.append(ProcessOptions(options));
// Force object code v2.
driverOptions.append(" -mno-code-object-v3");
// Set whole program mode
driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink");
@@ -1532,8 +1530,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
codegenOptions.append(" -mno-sram-ecc");
}
// Force object code v2.
codegenOptions.append(" -mno-code-object-v3");
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
@@ -1764,8 +1760,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
std::ostream_iterator<std::string>(ostrstr, " "));
codegenOptions.append(" ").append(ostrstr.str());
// Force object code v2.
codegenOptions.append(" -mno-code-object-v3");
// Set whole program mode
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
@@ -2880,8 +2874,19 @@ bool Program::createKernelMetadataMap() {
status = amd::Comgr::metadata_lookup(*metadata_, "Kernels", &kernelsMD);
if (status == AMD_COMGR_STATUS_SUCCESS) {
LogInfo("Using Code Object V2.");
hasKernelMD = true;
status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
codeObjectVer_ = 2;
}
else {
status = amd::Comgr::metadata_lookup(*metadata_, "amdhsa.kernels", &kernelsMD);
if (status == AMD_COMGR_STATUS_SUCCESS) {
LogInfo("Using Code Object V3.");
hasKernelMD = true;
codeObjectVer_ = 3;
status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
}
}
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
@@ -2896,7 +2901,9 @@ bool Program::createKernelMetadataMap() {
if (status == AMD_COMGR_STATUS_SUCCESS) {
hasKernelNode = true;
status = amd::Comgr::metadata_lookup(kernelNode, "Name", &nameMeta);
status = amd::Comgr::metadata_lookup(kernelNode,
(codeObjectVer() == 2) ? "Name" : ".name",
&nameMeta);
}
if (status == AMD_COMGR_STATUS_SUCCESS) {
@@ -2970,9 +2977,10 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) {
buildLog_ += "Error: object code with old metadata is not supported\n";
return false;
}
else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
note->n_namesz == sizeof "AMD" &&
!memcmp(name, "AMD", note->n_namesz)) {
else if ((note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) ||
(note->n_type == 32 /* NT_AMD_AMDGPU_HSA_METADATA V3 */ &&
note->n_namesz == sizeof "AMDGPU" && !memcmp(name, "AMDGPU", note->n_namesz))) {
#if defined(USE_COMGR_LIBRARY)
amd_comgr_status_t status;
amd_comgr_data_t binaryData;
+3
파일 보기
@@ -118,6 +118,7 @@ class Program : public amd::HeapObject {
#if defined(USE_COMGR_LIBRARY)
amd_comgr_metadata_node_t* metadata_; //!< COMgr metadata
uint32_t codeObjectVer_; //!< version of code object
std::map<std::string,amd_comgr_metadata_node_t> kernelMetadataMap_; //!< Map of kernel metadata
#else
CodeObjectMD* metadata_; //!< Runtime metadata
@@ -211,6 +212,8 @@ class Program : public amd::HeapObject {
auto it = kernelMetadataMap_.find(name);
return (it == kernelMetadataMap_.end()) ? nullptr : &(it->second);
}
const uint32_t codeObjectVer() const { return codeObjectVer_; }
#else
const CodeObjectMD* metadata() const { return metadata_; }
#endif
+2 -4
파일 보기
@@ -515,10 +515,9 @@ clk_value_type_t KernelArg::type() const {
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
const NullProgram& nullprog)
: device::Kernel(gpuNullDev, name),
: device::Kernel(gpuNullDev, name, nullprog),
buildError_(CL_BUILD_PROGRAM_FAILURE),
gpuDev_(gpuNullDev),
prog_(nullprog),
calRef_(NULL),
internal_(false),
flags_(0),
@@ -3027,9 +3026,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
uint extraArgsNum)
: device::Kernel(prog->dev(), name),
: device::Kernel(prog->dev(), name, *prog),
compileOptions_(compileOptions),
prog_(*prog),
index_(0),
code_(NULL),
codeSize_(0),
+1 -3
파일 보기
@@ -406,7 +406,7 @@ class NullKernel : public device::Kernel {
const NullDevice& nullDev() const { return gpuDev_; }
//! Returns GPU device object, associated with this kernel
const NullProgram& nullProg() const { return prog_; }
const NullProgram& nullProg() const { return reinterpret_cast<const NullProgram&>(prog_); }
//! Returns the kernel's build error
const cl_int buildError() const { return buildError_; }
@@ -455,7 +455,6 @@ class NullKernel : public device::Kernel {
std::string ilSource_; //!< IL source code of this kernel
const NullDevice& gpuDev_; //!< GPU device object
const NullProgram& prog_; //!< Reference to the parent program
CalImageReference* calRef_; //!< CAL image reference for this kernel
bool internal_; //!< Runtime internal ker
@@ -847,7 +846,6 @@ class HSAILKernel : public device::Kernel {
std::vector<Argument*> arguments_; //!< Vector list of HSAIL Arguments
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
gpu::Memory* code_; //!< Memory object with ISA code
+99 -45
파일 보기
@@ -26,37 +26,17 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
namespace pal {
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
if (!sym) {
return false;
}
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
return false;
}
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
reinterpret_cast<void*>(&codeSize_))) {
return false;
}
amd_kernel_code_t* akc = &akc_;
// Copy codeobject of this kernel from the program CPU segment
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
size_t akc_align = 0;
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
reinterpret_cast<void*>(&akc_align))) {
return false;
}
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
const uint32_t groupSegmentSize,
const uint16_t numSGPRs,
const uint16_t numVGPRs) {
workGroupInfo_.scratchRegs_ =
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
akc->workgroup_group_segment_byte_size;
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
workGroupInfo_.privateMemSize_ = privateSegmentSize;
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
workGroupInfo_.usedSGPRs_ = numSGPRs;
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
workGroupInfo_.usedVGPRs_ = numVGPRs;
if (!prog().isNull()) {
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
@@ -72,16 +52,57 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
workGroupInfo_.availableVGPRs_ = 256;
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
}
}
bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
if (!sym) {
return false;
}
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
return false;
}
// Copy code object of this kernel from the program CPU segment
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
return true;
}
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
amd_kernel_code_t* akc = &akc_;
if (!setKernelCode(sym, akc)) {
return false;
}
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
reinterpret_cast<void*>(&codeSize_))) {
return false;
}
// Setup the the workgroup info
setWorkGroupInfo(akc->workitem_private_segment_byte_size,
akc->workgroup_group_segment_byte_size,
akc->wavefront_sgpr_count,
akc->workitem_vgpr_count);
workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
return true;
}
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
: device::Kernel(prog->dev(), name),
: device::Kernel(prog->dev(), name, *prog),
compileOptions_(compileOptions),
prog_(*prog),
index_(0),
code_(0),
codeSize_(0)
codeSize_(0),
workgroupGroupSegmentByteSize_(0),
kernargSegmentByteSize_(0),
spillSegmentByteSize_(0)
{
flags_.hsa_ = true;
}
@@ -388,37 +409,54 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
aqlCreateHWInfo(symbol);
#if defined(USE_COMGR_LIBRARY)
const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
if (kernelMetaNode == nullptr) {
return false;
}
KernelMD kernelMD;
if (!GetAttrCodePropMetadata(*kernelMetaNode, argsBufferSize(), &kernelMD)) {
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
sizeof(uint32_t));
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
// Copy codeobject of this kernel from the program CPU segment
hsa_agent_t agent;
agent.handle = 1;
auto sym = prog().GetSymbol(symbolName().c_str(), const_cast<hsa_agent_t*>(&agent));
if (!setKernelCode(sym, &akc_)) {
return false;
}
codeSize_ = prog().codeSegGpu().owner()->getSize();
// handle device enqueue
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
hsa_agent_t agent;
agent.handle = 1;
amd::hsa::loader::Symbol* rth_symbol;
// Get the runtime handle symbol GPU address
rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
const_cast<hsa_agent_t*>(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
// Copy the kernel_object pointer to the runtime handle symbol GPU address
const Memory& codeSegGpu = prog_.codeSegGpu();
const Memory& codeSegGpu = prog().codeSegGpu();
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
uint64_t kernel_object = gpuAqlCode();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
@@ -432,6 +470,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
}
// Setup the the workgroup info
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
kernelMD.mCodeProps.mGroupSegmentFixedSize,
kernelMD.mCodeProps.mNumSGPRs,
kernelMD.mCodeProps.mNumVGPRs);
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
@@ -452,7 +496,18 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
if (!printfStr.empty()) {
InitPrintf(printfStr);
}
#else
return true;
}
#endif // defined(USE_COMGR_LIBRARY)
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
aqlCreateHWInfo(symbol);
const CodeObjectMD* programMD = prog().metadata();
assert(programMD != nullptr);
@@ -489,13 +544,13 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
amd::hsa::loader::Symbol* rth_symbol;
// Get the runtime handle symbol GPU address
rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
const_cast<hsa_agent_t*>(&agent));
uint64_t symbol_address;
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
// Copy the kernel_object pointer to the runtime handle symbol GPU address
const Memory& codeSegGpu = prog_.codeSegGpu();
const Memory& codeSegGpu = prog().codeSegGpu();
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
@@ -529,8 +584,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
waveLimiter_.enable();
*/
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
return true;
}
+21 -4
파일 보기
@@ -59,7 +59,7 @@ class HSAILKernel : public device::Kernel {
const HSAILProgram& prog() const;
//! Returns LDS size used in this kernel
uint32_t ldsSize() const { return akc_.workgroup_group_segment_byte_size; }
uint32_t ldsSize() const { return workgroupGroupSegmentByteSize_; }
//! Returns pointer on CPU to AQL code info
const amd_kernel_code_t* cpuAqlCode() const { return &akc_; }
@@ -71,10 +71,10 @@ class HSAILKernel : public device::Kernel {
size_t aqlCodeSize() const { return codeSize_; }
//! Returns the size of argument buffer
size_t argsBufferSize() const { return akc_.kernarg_segment_byte_size; }
size_t argsBufferSize() const { return kernargSegmentByteSize_; }
//! Returns spill reg size per workitem
uint32_t spillSegSize() const { return amd::alignUp(akc_.workitem_private_segment_byte_size, sizeof(uint32_t)); }
uint32_t spillSegSize() const { return spillSegmentByteSize_; }
//! Returns AQL packet in CPU memory
//! if the kernel arguments were successfully loaded, otherwise NULL
@@ -102,13 +102,25 @@ class HSAILKernel : public device::Kernel {
//! Creates AQL kernel HW info
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
//! Get the kernel code and copy the code object from the program CPU segment
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
//! Set up the workgroup info based on the kernel metadata
void setWorkGroupInfo(const uint32_t privateSegmentSize,
const uint32_t groupSegmentSize,
const uint16_t numSGPRs,
const uint16_t numVGPRs);
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
const HSAILProgram& prog_; //!< Reference to the parent program
uint index_; //!< Kernel index in the program
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
};
class LightningKernel : public HSAILKernel {
@@ -121,6 +133,11 @@ class LightningKernel : public HSAILKernel {
//! Initializes the metadata required for this kernel,
bool init(amd::hsa::loader::Symbol* symbol);
#if defined(USE_COMGR_LIBRARY)
//! Initializes the metadata required for this kernel,
bool init();
#endif
};
/*@}*/} // namespace pal
+20 -1
파일 보기
@@ -747,6 +747,25 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
return false;
}
#if defined(USE_COMGR_LIBRARY)
for (const auto &kernelMeta : kernelMetadataMap_) {
auto kernelName = kernelMeta.first;
auto kernel = new LightningKernel(kernelName, this,
options->origOptionStr + ProcessOptions(options));
kernels()[kernelName] = kernel;
if (!kernel->init()) {
return false;
}
kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
maxScratchRegs_ =
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
#else
// Get the list of kernels
std::vector<std::string> kernelNameList;
status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList);
@@ -781,7 +800,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
maxScratchRegs_ =
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
#endif // defined(USE_COMGR_LIBRARY)
DestroySegmentCpuAccess();
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
+19 -11
파일 보기
@@ -166,7 +166,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
}
// Update GPU specific settings and info structure if we have any
#if defined(_WIN32)
ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
#else
ModifyMaxWorkload modifyMaxWorkload = {0};
#endif
// APU systems
if (palProp.gpuType == Pal::GpuType::Integrated) {
@@ -326,18 +330,22 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
return false;
}
#if defined(_WIN32)
if (modifyMaxWorkload.time > 0) {
OSVERSIONINFOEX versionInfo = {0};
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
versionInfo.dwMajorVersion = 6;
versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
splitSizeForWin7_ = false;
DWORDLONG conditionMask = 0;
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
maxWorkloadTime_ = modifyMaxWorkload.time;
#if defined(_WIN32)
OSVERSIONINFOEX versionInfo = {0};
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
versionInfo.dwMajorVersion = 6;
versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
DWORDLONG conditionMask = 0;
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
if (modifyMaxWorkload.time > 0) {
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
}
}
#endif // defined(_WIN32)
+2 -1
파일 보기
@@ -60,7 +60,8 @@ class Settings : public device::Settings {
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
uint rgpSqttForceDisable_: 1; //!< Disables SQTT
uint reserved_ : 12;
uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
uint reserved_ : 11;
};
uint value_;
};
+7
파일 보기
@@ -553,6 +553,11 @@ void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
uint instructions) {
if (!dev.settings().splitSizeForWin7_) {
dispatchSplitSize_ = 0;
return;
}
uint64_t workload = threads * instructions;
if (maxDispatchWorkload_ < workload) {
dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
@@ -2258,6 +2263,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
// Run AQL dispatch in HW
eventBegin(MainEngine);
iCmd()->CmdDispatchAql(dispatchParam);
+63 -38
파일 보기
@@ -25,41 +25,58 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
: device::Kernel(prog->dev(), name),
program_(prog),
: device::Kernel(prog->dev(), name, *prog),
kernelCodeHandle_(kernelCodeHandle),
workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize),
kernargSegmentByteSize_(kernargSegmentByteSize),
kernargSegmentAlignment_(kernargSegmentAlignment) {}
Kernel::Kernel(std::string name, Program* prog)
: device::Kernel(prog->dev(), name, *prog),
kernelCodeHandle_(0),
workgroupGroupSegmentByteSize_(0),
workitemPrivateSegmentByteSize_(0),
kernargSegmentByteSize_(0),
kernargSegmentAlignment_(0) {}
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
hsa_agent_t hsaDevice = program_->hsaDevice();
hsa_agent_t hsaDevice = program()->hsaDevice();
const amd_comgr_metadata_node_t* kernelMetaNode =
static_cast<LightningProgram*>(program_)->getKernelMetadata(name());
static_cast<const LightningProgram*>(program())->getKernelMetadata(name());
if (kernelMetaNode == nullptr) {
return false;
}
KernelMD kernelMD;
if (!GetAttrCodePropMetadata(*kernelMetaNode, KernargSegmentByteSize(), &kernelMD)) {
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
return false;
}
// Set the kernel symbol name and size/alignment based on the kernel metadata
// NOTE: kernel name is used to get the kernel code handle in V2,
// but kernel symbol name is used in V3
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
workitemPrivateSegmentByteSize_ = kernelMD.mCodeProps.mPrivateSegmentFixedSize;
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
kernargSegmentAlignment_ = amd::alignUp(std::max(kernelMD.mCodeProps.mKernargSegmentAlign, 128u),
dev().info().globalMemCacheLineSize_);
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
// Get the available SGPRs and VGPRs
std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget();
if (program_->xnackEnable()) {
std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program()->machineTarget();
if (program()->xnackEnable()) {
targetIdent.append("+xnack");
}
if (program_->sramEccEnable()) {
if (program()->sramEccEnable()) {
targetIdent.append("+sram-ecc");
}
@@ -67,10 +84,23 @@ bool LightningKernel::init() {
return false;
}
// Get the kernel code handle
hsa_status_t hsaStatus;
hsa_executable_symbol_t symbol;
hsa_agent_t agent = program()->hsaDevice();
hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
symbolName().c_str(),
&agent, &symbol);
if (hsaStatus == HSA_STATUS_SUCCESS) {
hsaStatus = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&kernelCodeHandle_);
}
if (hsaStatus != HSA_STATUS_SUCCESS) {
return false;
}
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
hsa_agent_t agent = program_->hsaDevice();
hsa_executable_symbol_t kernelSymbol;
hsa_status_t hsaStatus;
int variable_size;
uint64_t variable_address;
@@ -79,7 +109,7 @@ bool LightningKernel::init() {
// object handle of such a kernel. The address of the variable and the kernel code object handle are known
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
// address of the variable.
hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(),
hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
kernelMD.mAttrs.mRuntimeHandle.c_str(),
&agent, &kernelSymbol);
if (hsaStatus == HSA_STATUS_SUCCESS) {
@@ -87,11 +117,6 @@ bool LightningKernel::init() {
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
&variable_size);
}
if (hsaStatus == HSA_STATUS_SUCCESS) {
hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
&variable_size);
}
if (hsaStatus == HSA_STATUS_SUCCESS) {
hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
@@ -114,7 +139,7 @@ bool LightningKernel::init() {
}
uint32_t wavefront_size = 0;
if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
HSA_STATUS_SUCCESS) {
return false;
}
@@ -127,7 +152,7 @@ bool LightningKernel::init() {
workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs;
workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs;
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
if (workGroupInfo_.size_ == 0) {
@@ -135,7 +160,7 @@ bool LightningKernel::init() {
}
// handle the printf metadata if any
const amd_comgr_metadata_node_t* programMD = static_cast<LightningProgram*>(program_)->metadata();
const amd_comgr_metadata_node_t* programMD = static_cast<const LightningProgram*>(program())->metadata();
assert(programMD != nullptr);
std::vector<std::string> printfStr;
@@ -159,10 +184,10 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
}
bool LightningKernel::init() {
hsa_agent_t hsaDevice = program_->hsaDevice();
hsa_agent_t hsaDevice = program()->hsaDevice();
// Pull out metadata from the ELF
const CodeObjectMD* programMD = static_cast<LightningProgram*>(program_)->metadata();
const CodeObjectMD* programMD = static_cast<const LightningProgram*>(program())->metadata();
assert(programMD != nullptr);
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
@@ -172,7 +197,7 @@ bool LightningKernel::init() {
InitParameters(*kernelMD, KernargSegmentByteSize());
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
workGroupInfo_.availableSGPRs_ = 104;
workGroupInfo_.availableVGPRs_ = 256;
@@ -196,7 +221,7 @@ bool LightningKernel::init() {
}
if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
hsa_agent_t agent = program_->hsaDevice();
hsa_agent_t agent = program()->hsaDevice();
hsa_executable_symbol_t kernelSymbol;
hsa_status_t status;
int variable_size;
@@ -208,7 +233,7 @@ bool LightningKernel::init() {
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
// address of the variable.
status = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
status = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
&agent, &kernelSymbol);
if (status != HSA_STATUS_SUCCESS) {
return false;
@@ -239,7 +264,7 @@ bool LightningKernel::init() {
}
uint32_t wavefront_size = 0;
if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
HSA_STATUS_SUCCESS) {
return false;
}
@@ -258,7 +283,7 @@ bool LightningKernel::init() {
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
@@ -278,18 +303,18 @@ bool LightningKernel::init() {
bool HSAILKernel::init() {
acl_error errorCode;
// compile kernel down to ISA
hsa_agent_t hsaDevice = program_->hsaDevice();
hsa_agent_t hsaDevice = program()->hsaDevice();
// Pull out metadata from the ELF
size_t sizeOfArgList;
aclCompiler* compileHandle = program_->dev().compiler();
aclCompiler* compileHandle = program()->dev().compiler();
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfArgList);
if (errorCode != ACL_SUCCESS) {
return false;
}
std::unique_ptr<char[]> argList(new char[sizeOfArgList]);
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), argList.get(), &sizeOfArgList);
if (errorCode != ACL_SUCCESS) {
return false;
@@ -300,17 +325,17 @@ bool HSAILKernel::init() {
// Set the workgroup information for the kernel
memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
assert(workGroupInfo_.availableLDSSize_ > 0);
workGroupInfo_.availableSGPRs_ = 104;
workGroupInfo_.availableVGPRs_ = 256;
size_t sizeOfWorkGroupSize;
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
if (errorCode != ACL_SUCCESS) {
return false;
}
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), workGroupInfo_.compileSize_,
&sizeOfWorkGroupSize);
if (errorCode != ACL_SUCCESS) {
@@ -319,7 +344,7 @@ bool HSAILKernel::init() {
uint32_t wavefront_size = 0;
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
return false;
}
assert(wavefront_size > 0);
@@ -344,18 +369,18 @@ bool HSAILKernel::init() {
}
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
workGroupInfo_.wavefrontSize_ = wavefront_size;
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
} else {
workGroupInfo_.size_ = program_->dev().info().preferredWorkGroupSize_;
workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
if (errorCode != ACL_SUCCESS) {
return false;
@@ -367,7 +392,7 @@ bool HSAILKernel::init() {
if (!aclPrintfList) {
return false;
}
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(),
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(),
aclPrintfList.get(), &sizeOfPrintfList);
if (errorCode != ACL_SUCCESS) {
+12 -6
파일 보기
@@ -22,6 +22,8 @@ class Kernel : public device::Kernel {
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment);
Kernel(std::string name, Program* prog);
const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; }
const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; }
@@ -37,15 +39,15 @@ class Kernel : public device::Kernel {
//! Initializes the metadata required for this kernel
virtual bool init() = 0;
const Program* program() const { return static_cast<const Program*>(program_); }
const Program* program() const { return static_cast<const Program*>(&prog_); }
protected:
Program* program_; //!< The roc::Program context
// Program* program_; //!< The roc::Program context
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
const uint32_t workgroupGroupSegmentByteSize_;
const uint32_t workitemPrivateSegmentByteSize_;
const uint32_t kernargSegmentByteSize_;
const uint32_t kernargSegmentAlignment_;
uint32_t workgroupGroupSegmentByteSize_;
uint32_t workitemPrivateSegmentByteSize_;
uint32_t kernargSegmentByteSize_;
uint32_t kernargSegmentAlignment_;
size_t kernelDirectiveOffset_;
};
@@ -74,6 +76,10 @@ class LightningKernel : public roc::Kernel {
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
}
LightningKernel(std::string name, Program* prog)
: roc::Kernel(name, prog) {}
//! Initializes the metadata required for this kernel
virtual bool init() final;
};
+14
파일 보기
@@ -488,6 +488,19 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
return false;
}
#if defined(USE_COMGR_LIBRARY)
for (const auto &kernelMeta : kernelMetadataMap_) {
const std::string kernelName = kernelMeta.first;
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
if (!aKernel->init()) {
return false;
}
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") !=
std::string::npos);
kernels()[kernelName] = aKernel;
}
#else
// Get the list of kernels
std::vector<std::string> kernelNameList;
status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback,
@@ -582,6 +595,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
std::string::npos);
kernels()[kernelName] = aKernel;
}
#endif // defined(USE_COMGR_LIBRARY)
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
return true;
}