P4 to Git Change 1767752 by wchau@wchau_OCL_Linux on 2019/04/09 22:58:03
SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata
- Add support for the V3 code objects
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#336 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#134 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/inc/core/palCmdBuffer.h#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx6/gfx6ComputeCmdBuffer.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx9/gfx9ComputeCmdBuffer.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#77 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#130 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#103 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#47 edit
[ROCm/clr commit: 36a5f2a85f]
이 커밋은 다음에 포함됨:
@@ -29,10 +29,487 @@ using llvm::AMDGPU::HSAMD::ValueType;
|
||||
|
||||
namespace device {
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
size_t size = 0;
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itArgField = ArgFieldMap.find(buf);
|
||||
if (itArgField == ArgFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// get the value of the argument field
|
||||
status = getMetaBuf(value, &buf);
|
||||
|
||||
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
|
||||
|
||||
switch (itArgField->second) {
|
||||
case ArgField::Name:
|
||||
lcArg->mName = buf;
|
||||
break;
|
||||
case ArgField::TypeName:
|
||||
lcArg->mTypeName = buf;
|
||||
break;
|
||||
case ArgField::Size:
|
||||
lcArg->mSize = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::Align:
|
||||
lcArg->mAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::ValueKind:
|
||||
{
|
||||
auto itValueKind = ArgValueKind.find(buf);
|
||||
if (itValueKind == ArgValueKind.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueKind = itValueKind->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ValueType:
|
||||
{
|
||||
auto itValueType = ArgValueType.find(buf);
|
||||
if (itValueType == ArgValueType.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueType = itValueType->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::PointeeAlign:
|
||||
lcArg->mPointeeAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::AddrSpaceQual:
|
||||
{
|
||||
auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
|
||||
if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::AccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQual.find(buf);
|
||||
if (itAccQual == ArgAccQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ActualAccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQual.find(buf);
|
||||
if (itAccQual == ArgAccQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mActualAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::IsConst:
|
||||
lcArg->mIsConst = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsRestrict:
|
||||
lcArg->mIsRestrict = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsVolatile:
|
||||
lcArg->mIsVolatile = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsPipe:
|
||||
lcArg->mIsPipe = (buf.compare("true") == 0);
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
size_t size = 0;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itAttrField = AttrFieldMap.find(buf);
|
||||
if (itAttrField == AttrFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
KernelMD* kernelMD = static_cast<KernelMD*>(data);
|
||||
switch (itAttrField->second) {
|
||||
case AttrField::ReqdWorkGroupSize:
|
||||
{
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSize;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::WorkGroupSizeHint:
|
||||
{
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSizeHint;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSizeHint);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::VecTypeHint:
|
||||
{
|
||||
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mVecTypeHint = buf;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::RuntimeHandle:
|
||||
{
|
||||
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mRuntimeHandle = buf;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itCodePropField = CodePropFieldMap.find(buf);
|
||||
if (itCodePropField == CodePropFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// get the value of the argument field
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(value, &buf);
|
||||
}
|
||||
|
||||
KernelMD* kernelMD = static_cast<KernelMD*>(data);
|
||||
switch (itCodePropField->second) {
|
||||
case CodePropField::KernargSegmentSize:
|
||||
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::GroupSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::PrivateSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::KernargSegmentAlign:
|
||||
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::WavefrontSize:
|
||||
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumSGPRs:
|
||||
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumVGPRs:
|
||||
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::MaxFlatWorkGroupSize:
|
||||
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::IsDynamicCallStack:
|
||||
kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
|
||||
break;
|
||||
case CodePropField::IsXNACKEnabled:
|
||||
kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
|
||||
break;
|
||||
case CodePropField::NumSpilledSGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumSpilledVGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
size_t size = 0;
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itArgField = ArgFieldMapV3.find(buf);
|
||||
if (itArgField == ArgFieldMapV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// get the value of the argument field
|
||||
status = getMetaBuf(value, &buf);
|
||||
|
||||
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
|
||||
|
||||
switch (itArgField->second) {
|
||||
case ArgField::Name:
|
||||
lcArg->mName = buf;
|
||||
break;
|
||||
case ArgField::TypeName:
|
||||
lcArg->mTypeName = buf;
|
||||
break;
|
||||
case ArgField::Size:
|
||||
lcArg->mSize = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::Offset:
|
||||
lcArg->mOffset = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::ValueKind:
|
||||
{
|
||||
auto itValueKind = ArgValueKindV3.find(buf);
|
||||
if (itValueKind == ArgValueKindV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueKind = itValueKind->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ValueType:
|
||||
{
|
||||
auto itValueType = ArgValueTypeV3.find(buf);
|
||||
if (itValueType == ArgValueTypeV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueType = itValueType->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::PointeeAlign:
|
||||
lcArg->mPointeeAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::AddrSpaceQual:
|
||||
{
|
||||
auto itAddrSpaceQual = ArgAddrSpaceQualV3.find(buf);
|
||||
if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::AccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQualV3.find(buf);
|
||||
if (itAccQual == ArgAccQualV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ActualAccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQualV3.find(buf);
|
||||
if (itAccQual == ArgAccQualV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mActualAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::IsConst:
|
||||
lcArg->mIsConst = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsRestrict:
|
||||
lcArg->mIsRestrict = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsVolatile:
|
||||
lcArg->mIsVolatile = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsPipe:
|
||||
lcArg->mIsPipe = (buf.compare("true") == 0);
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateKernelMetaV3(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
size_t size = 0;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itKernelField = KernelFieldMapV3.find(buf);
|
||||
if (itKernelField == KernelFieldMapV3.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
if (itKernelField->second != KernelField::ReqdWorkGroupSize &&
|
||||
itKernelField->second != KernelField::WorkGroupSizeHint) {
|
||||
status = getMetaBuf(value,&buf);
|
||||
}
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
KernelMD* kernelMD = static_cast<KernelMD*>(data);
|
||||
switch (itKernelField->second) {
|
||||
case KernelField::ReqdWorkGroupSize:
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSize;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case KernelField::WorkGroupSizeHint:
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSizeHint;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSizeHint);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case KernelField::VecTypeHint:
|
||||
kernelMD->mAttrs.mVecTypeHint = buf;
|
||||
break;
|
||||
case KernelField::DeviceEnqueueSymbol:
|
||||
kernelMD->mAttrs.mRuntimeHandle = buf;
|
||||
break;
|
||||
case KernelField::KernargSegmentSize:
|
||||
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::GroupSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::PrivateSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::KernargSegmentAlign:
|
||||
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::WavefrontSize:
|
||||
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::NumSGPRs:
|
||||
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::NumVGPRs:
|
||||
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::MaxFlatWorkGroupSize:
|
||||
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::NumSpilledSGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::NumSpilledVGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case KernelField::SymbolName:
|
||||
kernelMD->mSymbolName = buf;
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ================================================================================================
|
||||
Kernel::Kernel(const amd::Device& dev, const std::string& name)
|
||||
Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& prog)
|
||||
: dev_(dev)
|
||||
, name_(name)
|
||||
, prog_(prog)
|
||||
, signature_(nullptr)
|
||||
, waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
|
||||
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
|
||||
@@ -482,6 +959,8 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s
|
||||
|
||||
// ================================================================================================
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; }
|
||||
|
||||
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
|
||||
#endif
|
||||
|
||||
@@ -771,11 +1250,10 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
|
||||
// ================================================================================================
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
|
||||
const uint32_t kernargSegmentByteSize,
|
||||
KernelMD* kernelMD) {
|
||||
bool Kernel::GetAttrCodePropMetadata( const amd_comgr_metadata_node_t kernelMetaNode,
|
||||
KernelMD* kernelMD) {
|
||||
|
||||
InitParameters(kernelMetaNode, kernargSegmentByteSize);
|
||||
InitParameters(kernelMetaNode);
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
|
||||
@@ -784,25 +1262,49 @@ bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaN
|
||||
workGroupInfo_.availableVGPRs_ = 256;
|
||||
|
||||
// extract the attribute metadata if there is any
|
||||
amd_comgr_metadata_node_t attrMeta;
|
||||
amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS;
|
||||
if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::iterate_map_metadata(attrMeta, device::populateAttrs,
|
||||
static_cast<void*>(kernelMD));
|
||||
amd::Comgr::destroy_metadata(attrMeta);
|
||||
|
||||
switch (codeObjectVer()) {
|
||||
case 2: {
|
||||
amd_comgr_metadata_node_t symbolName;
|
||||
status = amd::Comgr::metadata_lookup(kernelMetaNode, "SymbolName", &symbolName);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(symbolName, &(kernelMD->mSymbolName));
|
||||
amd::Comgr::destroy_metadata(symbolName);
|
||||
}
|
||||
|
||||
amd_comgr_metadata_node_t attrMeta;
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) ==
|
||||
AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::iterate_map_metadata(attrMeta, populateAttrs,
|
||||
static_cast<void*>(kernelMD));
|
||||
amd::Comgr::destroy_metadata(attrMeta);
|
||||
}
|
||||
}
|
||||
|
||||
// extract the code properties metadata
|
||||
amd_comgr_metadata_node_t codePropsMeta;
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
|
||||
}
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::iterate_map_metadata(codePropsMeta, populateCodeProps,
|
||||
static_cast<void*>(kernelMD));
|
||||
amd::Comgr::destroy_metadata(codePropsMeta);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3: {
|
||||
status = amd::Comgr::iterate_map_metadata(kernelMetaNode, populateKernelMetaV3,
|
||||
static_cast<void*>(kernelMD));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
// extract the code properties metadata
|
||||
amd_comgr_metadata_node_t codePropsMeta;
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
|
||||
}
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::iterate_map_metadata(codePropsMeta, device::populateCodeProps,
|
||||
static_cast<void*>(kernelMD));
|
||||
amd::Comgr::destroy_metadata(codePropsMeta);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return false;
|
||||
@@ -917,19 +1419,21 @@ bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD,
|
||||
return (status == AMD_COMGR_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) {
|
||||
void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) {
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argBufferSize;
|
||||
|
||||
amd_comgr_metadata_node_t argsMeta;
|
||||
bool hsaArgsMeta = false;
|
||||
size_t argsSize;
|
||||
|
||||
amd_comgr_status_t status = amd::Comgr::metadata_lookup(kernelMD, "Args", &argsMeta);
|
||||
amd_comgr_status_t status = amd::Comgr::metadata_lookup(
|
||||
kernelMD,
|
||||
(codeObjectVer() == 2) ? "Args" : ".args",
|
||||
&argsMeta);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
hsaArgsMeta = true;
|
||||
status = amd::Comgr::get_metadata_list_size(argsMeta, &argsSize);
|
||||
@@ -940,7 +1444,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < argsSize; ++i) {
|
||||
KernelArgMD lcArg;
|
||||
KernelArgMD lcArg;
|
||||
|
||||
amd_comgr_metadata_node_t argsNode;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
@@ -956,7 +1460,13 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
|
||||
status = AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, static_cast<void*>(&lcArg));
|
||||
void *data = static_cast<void*>(&lcArg);
|
||||
if (codeObjectVer() == 2) {
|
||||
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data);
|
||||
}
|
||||
else if (codeObjectVer() == 3) {
|
||||
status = amd::Comgr::iterate_map_metadata(argsNode, populateArgsV3, data);
|
||||
}
|
||||
}
|
||||
|
||||
if (hsaArgsNode) {
|
||||
@@ -971,7 +1481,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
|
||||
}
|
||||
|
||||
size_t size = GetArgSizeOCL(lcArg);
|
||||
size_t alignment = GetArgAlignmentOCL(lcArg);
|
||||
size_t alignment = (codeObjectVer() == 2) ? GetArgAlignmentOCL(lcArg) : 0;
|
||||
bool isHidden = false;
|
||||
desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
|
||||
|
||||
@@ -980,7 +1490,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
|
||||
setDynamicParallelFlag(true);
|
||||
}
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = size;
|
||||
offset += size;
|
||||
@@ -1007,7 +1517,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
|
||||
offset += sizeof(uint64_t);
|
||||
}
|
||||
else {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
||||
using llvm::AMDGPU::HSAMD::ValueKind;
|
||||
using llvm::AMDGPU::HSAMD::ValueType;
|
||||
|
||||
// for Code Object V3
|
||||
enum class ArgField : uint8_t {
|
||||
Name = 0,
|
||||
TypeName = 1,
|
||||
@@ -48,7 +49,8 @@ enum class ArgField : uint8_t {
|
||||
IsConst = 10,
|
||||
IsRestrict = 11,
|
||||
IsVolatile = 12,
|
||||
IsPipe = 13
|
||||
IsPipe = 13,
|
||||
Offset = 14
|
||||
};
|
||||
|
||||
enum class AttrField : uint8_t {
|
||||
@@ -167,6 +169,116 @@ static const std::map<std::string,CodePropField> CodePropFieldMap =
|
||||
{"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs},
|
||||
{"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs}
|
||||
};
|
||||
|
||||
// for Code Object V3
|
||||
enum class KernelField : uint8_t {
|
||||
SymbolName = 0,
|
||||
ReqdWorkGroupSize = 1,
|
||||
WorkGroupSizeHint = 2,
|
||||
VecTypeHint = 3,
|
||||
DeviceEnqueueSymbol = 4,
|
||||
KernargSegmentSize = 5,
|
||||
GroupSegmentFixedSize = 6,
|
||||
PrivateSegmentFixedSize = 7,
|
||||
KernargSegmentAlign = 8,
|
||||
WavefrontSize = 9,
|
||||
NumSGPRs = 10,
|
||||
NumVGPRs = 11,
|
||||
MaxFlatWorkGroupSize = 12,
|
||||
NumSpilledSGPRs = 13,
|
||||
NumSpilledVGPRs = 14
|
||||
};
|
||||
|
||||
static const std::map<std::string,ArgField> ArgFieldMapV3 =
|
||||
{
|
||||
{".name", ArgField::Name},
|
||||
{".type_name", ArgField::TypeName},
|
||||
{".size", ArgField::Size},
|
||||
{".offset", ArgField::Offset},
|
||||
{".value_kind", ArgField::ValueKind},
|
||||
{".vaule_type", ArgField::ValueType},
|
||||
{".pointee_align", ArgField::PointeeAlign},
|
||||
{".address_space", ArgField::AddrSpaceQual},
|
||||
{".access", ArgField::AccQual},
|
||||
{".actual_access", ArgField::ActualAccQual},
|
||||
{".is_const", ArgField::IsConst},
|
||||
{".is_restrict", ArgField::IsRestrict},
|
||||
{".is_volatile", ArgField::IsVolatile},
|
||||
{".is_pipe", ArgField::IsPipe}
|
||||
};
|
||||
|
||||
static const std::map<std::string,ValueKind> ArgValueKindV3 =
|
||||
{
|
||||
{"by_value", ValueKind::ByValue},
|
||||
{"global_buffer", ValueKind::GlobalBuffer},
|
||||
{"dynamic_shared_pointer", ValueKind::DynamicSharedPointer},
|
||||
{"sampler", ValueKind::Sampler},
|
||||
{"image", ValueKind::Image},
|
||||
{"pipe", ValueKind::Pipe},
|
||||
{"queue", ValueKind::Queue},
|
||||
{"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX},
|
||||
{"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY},
|
||||
{"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ},
|
||||
{"hidden_none", ValueKind::HiddenNone},
|
||||
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
|
||||
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
|
||||
{"hidden_completion_action", ValueKind::HiddenCompletionAction}
|
||||
};
|
||||
|
||||
static const std::map<std::string,ValueType> ArgValueTypeV3 =
|
||||
{
|
||||
{"struct", ValueType::Struct},
|
||||
{"i8", ValueType::I8},
|
||||
{"u8", ValueType::U8},
|
||||
{"i16", ValueType::I16},
|
||||
{"u16", ValueType::U16},
|
||||
{"f16", ValueType::F16},
|
||||
{"i32", ValueType::I32},
|
||||
{"u32", ValueType::U32},
|
||||
{"f32", ValueType::F32},
|
||||
{"i64", ValueType::I64},
|
||||
{"u64", ValueType::U64},
|
||||
{"f64", ValueType::F64}
|
||||
};
|
||||
|
||||
static const std::map<std::string,AccessQualifier> ArgAccQualV3 =
|
||||
{
|
||||
{"default", AccessQualifier::Default},
|
||||
{"read_only", AccessQualifier::ReadOnly},
|
||||
{"write_only", AccessQualifier::WriteOnly},
|
||||
{"read_write", AccessQualifier::ReadWrite}
|
||||
};
|
||||
|
||||
static const std::map<std::string,AddressSpaceQualifier> ArgAddrSpaceQualV3 =
|
||||
{
|
||||
{"private", AddressSpaceQualifier::Private},
|
||||
{"global", AddressSpaceQualifier::Global},
|
||||
{"constant", AddressSpaceQualifier::Constant},
|
||||
{"local", AddressSpaceQualifier::Local},
|
||||
{"generic", AddressSpaceQualifier::Generic},
|
||||
{"region", AddressSpaceQualifier::Region}
|
||||
};
|
||||
|
||||
static const std::map<std::string,KernelField> KernelFieldMapV3 =
|
||||
{
|
||||
{".symbol", KernelField::SymbolName},
|
||||
{".reqd_workgroup_size", KernelField::ReqdWorkGroupSize},
|
||||
{".workgorup_size_hint", KernelField::WorkGroupSizeHint},
|
||||
{".vec_type_hint", KernelField::VecTypeHint},
|
||||
{".device_enqueue_symbol", KernelField::DeviceEnqueueSymbol},
|
||||
{".kernarg_segment_size", KernelField::KernargSegmentSize},
|
||||
{".group_segment_fixed_size", KernelField::GroupSegmentFixedSize},
|
||||
{".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize},
|
||||
{".kernarg_segment_align", KernelField::KernargSegmentAlign},
|
||||
{".wavefront_size", KernelField::WavefrontSize},
|
||||
{".sgpr_count", KernelField::NumSGPRs},
|
||||
{".vgpr_count", KernelField::NumVGPRs},
|
||||
{".max_flat_workgroup_size", KernelField::MaxFlatWorkGroupSize},
|
||||
{".sgpr_spill_count", KernelField::NumSpilledSGPRs},
|
||||
{".vgpr_spill_count", KernelField::NumSpilledVGPRs}
|
||||
};
|
||||
|
||||
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
|
||||
@@ -234,6 +346,8 @@ struct KernelParameterDescriptor {
|
||||
|
||||
namespace device {
|
||||
|
||||
class Program;
|
||||
|
||||
//! Printf info structure
|
||||
struct PrintfInfo {
|
||||
std::string fmtString_; //!< formated string for printf
|
||||
@@ -272,7 +386,7 @@ class Kernel : public amd::HeapObject {
|
||||
};
|
||||
|
||||
//! Default constructor
|
||||
Kernel(const amd::Device& dev, const std::string& name);
|
||||
Kernel(const amd::Device& dev, const std::string& name, const Program& prog);
|
||||
|
||||
//! Default destructor
|
||||
virtual ~Kernel();
|
||||
@@ -372,7 +486,7 @@ class Kernel : public amd::HeapObject {
|
||||
//! Initializes the abstraction layer kernel parameters
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize);
|
||||
void InitParameters(const amd_comgr_metadata_node_t kernelMD);
|
||||
|
||||
//! Get ther kernel metadata
|
||||
bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD,
|
||||
@@ -381,7 +495,6 @@ class Kernel : public amd::HeapObject {
|
||||
|
||||
//! Retrieve kernel attribute and code properties metadata
|
||||
bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
|
||||
const uint32_t kernargSegmentByteSize,
|
||||
KernelMD* kernelMD);
|
||||
|
||||
//! Retrieve the available SGPRs and VGPRs
|
||||
@@ -390,6 +503,12 @@ class Kernel : public amd::HeapObject {
|
||||
//! Retrieve the printf string metadata
|
||||
bool GetPrintfStr(const amd_comgr_metadata_node_t programMD,
|
||||
std::vector<std::string>* printfStr);
|
||||
|
||||
//! Returns the kernel symbol name
|
||||
const std::string& symbolName() const { return symbolName_; }
|
||||
|
||||
//! Returns the kernel code object version
|
||||
const uint32_t codeObjectVer() const { return prog().codeObjectVer(); }
|
||||
#else
|
||||
void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
|
||||
#endif
|
||||
@@ -404,8 +523,13 @@ class Kernel : public amd::HeapObject {
|
||||
//! Initializes HSAIL Printf metadata and info
|
||||
void InitPrintf(const aclPrintfFmt* aclPrintf);
|
||||
#endif
|
||||
//! Returns program associated with this kernel
|
||||
const Program& prog() const { return prog_; }
|
||||
|
||||
const amd::Device& dev_; //!< GPU device object
|
||||
std::string name_; //!< kernel name
|
||||
const Program& prog_; //!< Reference to the parent program
|
||||
std::string symbolName_; //!< kernel symbol name
|
||||
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
|
||||
amd::KernelSignature* signature_; //!< kernel signature
|
||||
std::string buildLog_; //!< build log
|
||||
@@ -424,6 +548,7 @@ class Kernel : public amd::HeapObject {
|
||||
Flags() : value_(0) {}
|
||||
} flags_;
|
||||
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
Kernel(const Kernel&);
|
||||
@@ -447,264 +572,5 @@ static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta,
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
size_t size = 0;
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itArgField = ArgFieldMap.find(buf);
|
||||
if (itArgField == ArgFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// get the value of the argument field
|
||||
status = getMetaBuf(value, &buf);
|
||||
|
||||
KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
|
||||
|
||||
switch (itArgField->second) {
|
||||
case ArgField::Name:
|
||||
lcArg->mName = buf;
|
||||
break;
|
||||
case ArgField::TypeName:
|
||||
lcArg->mTypeName = buf;
|
||||
break;
|
||||
case ArgField::Size:
|
||||
lcArg->mSize = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::Align:
|
||||
lcArg->mAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::ValueKind:
|
||||
{
|
||||
auto itValueKind = ArgValueKind.find(buf);
|
||||
if (itValueKind == ArgValueKind.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueKind = itValueKind->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ValueType:
|
||||
{
|
||||
auto itValueType = ArgValueType.find(buf);
|
||||
if (itValueType == ArgValueType.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mValueType = itValueType->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::PointeeAlign:
|
||||
lcArg->mPointeeAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case ArgField::AddrSpaceQual:
|
||||
{
|
||||
auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
|
||||
if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::AccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQual.find(buf);
|
||||
if (itAccQual == ArgAccQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::ActualAccQual:
|
||||
{
|
||||
auto itAccQual = ArgAccQual.find(buf);
|
||||
if (itAccQual == ArgAccQual.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
lcArg->mActualAccQual = itAccQual->second;
|
||||
}
|
||||
break;
|
||||
case ArgField::IsConst:
|
||||
lcArg->mIsConst = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsRestrict:
|
||||
lcArg->mIsRestrict = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsVolatile:
|
||||
lcArg->mIsVolatile = (buf.compare("true") == 0);
|
||||
break;
|
||||
case ArgField::IsPipe:
|
||||
lcArg->mIsPipe = (buf.compare("true") == 0);
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
size_t size = 0;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itAttrField = AttrFieldMap.find(buf);
|
||||
if (itAttrField == AttrFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
KernelMD* kernelMD = static_cast<KernelMD*>(data);
|
||||
switch (itAttrField->second) {
|
||||
case AttrField::ReqdWorkGroupSize:
|
||||
{
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSize;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::WorkGroupSizeHint:
|
||||
{
|
||||
status = amd::Comgr::get_metadata_list_size(value, &size);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
amd_comgr_metadata_node_t workgroupSizeHint;
|
||||
status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS &&
|
||||
getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
|
||||
}
|
||||
amd::Comgr::destroy_metadata(workgroupSizeHint);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::VecTypeHint:
|
||||
{
|
||||
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mVecTypeHint = buf;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case AttrField::RuntimeHandle:
|
||||
{
|
||||
if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
|
||||
kernelMD->mAttrs.mRuntimeHandle = buf;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
|
||||
const amd_comgr_metadata_node_t value,
|
||||
void *data) {
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_metadata_kind_t kind;
|
||||
std::string buf;
|
||||
|
||||
// get the key of the argument field
|
||||
status = amd::Comgr::get_metadata_kind(key, &kind);
|
||||
if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(key, &buf);
|
||||
}
|
||||
|
||||
if (status != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
auto itCodePropField = CodePropFieldMap.find(buf);
|
||||
if (itCodePropField == CodePropFieldMap.end()) {
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// get the value of the argument field
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = getMetaBuf(value, &buf);
|
||||
}
|
||||
|
||||
KernelMD* kernelMD = static_cast<KernelMD*>(data);
|
||||
switch (itCodePropField->second) {
|
||||
case CodePropField::KernargSegmentSize:
|
||||
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::GroupSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::PrivateSegmentFixedSize:
|
||||
kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::KernargSegmentAlign:
|
||||
kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::WavefrontSize:
|
||||
kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumSGPRs:
|
||||
kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumVGPRs:
|
||||
kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::MaxFlatWorkGroupSize:
|
||||
kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::IsDynamicCallStack:
|
||||
kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
|
||||
break;
|
||||
case CodePropField::IsXNACKEnabled:
|
||||
kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
|
||||
break;
|
||||
case CodePropField::NumSpilledSGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
case CodePropField::NumSpilledVGPRs:
|
||||
kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
|
||||
break;
|
||||
default:
|
||||
return AMD_COMGR_STATUS_ERROR;
|
||||
}
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
} // namespace device
|
||||
|
||||
@@ -706,8 +706,6 @@ bool Program::compileImplLC(const std::string& sourceCode,
|
||||
driverOptions.append(options->llvmOptions);
|
||||
driverOptions.append(ProcessOptions(options));
|
||||
|
||||
// Force object code v2.
|
||||
driverOptions.append(" -mno-code-object-v3");
|
||||
// Set whole program mode
|
||||
driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink");
|
||||
|
||||
@@ -1532,8 +1530,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
|
||||
codegenOptions.append(" -mno-sram-ecc");
|
||||
}
|
||||
|
||||
// Force object code v2.
|
||||
codegenOptions.append(" -mno-code-object-v3");
|
||||
// Set whole program mode
|
||||
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
|
||||
|
||||
@@ -1764,8 +1760,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
|
||||
std::ostream_iterator<std::string>(ostrstr, " "));
|
||||
codegenOptions.append(" ").append(ostrstr.str());
|
||||
|
||||
// Force object code v2.
|
||||
codegenOptions.append(" -mno-code-object-v3");
|
||||
// Set whole program mode
|
||||
codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");
|
||||
|
||||
@@ -2880,8 +2874,19 @@ bool Program::createKernelMetadataMap() {
|
||||
|
||||
status = amd::Comgr::metadata_lookup(*metadata_, "Kernels", &kernelsMD);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
LogInfo("Using Code Object V2.");
|
||||
hasKernelMD = true;
|
||||
status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
|
||||
codeObjectVer_ = 2;
|
||||
}
|
||||
else {
|
||||
status = amd::Comgr::metadata_lookup(*metadata_, "amdhsa.kernels", &kernelsMD);
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
LogInfo("Using Code Object V3.");
|
||||
hasKernelMD = true;
|
||||
codeObjectVer_ = 3;
|
||||
status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
|
||||
@@ -2896,7 +2901,9 @@ bool Program::createKernelMetadataMap() {
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
hasKernelNode = true;
|
||||
status = amd::Comgr::metadata_lookup(kernelNode, "Name", &nameMeta);
|
||||
status = amd::Comgr::metadata_lookup(kernelNode,
|
||||
(codeObjectVer() == 2) ? "Name" : ".name",
|
||||
&nameMeta);
|
||||
}
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
@@ -2970,9 +2977,10 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) {
|
||||
buildLog_ += "Error: object code with old metadata is not supported\n";
|
||||
return false;
|
||||
}
|
||||
else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
|
||||
note->n_namesz == sizeof "AMD" &&
|
||||
!memcmp(name, "AMD", note->n_namesz)) {
|
||||
else if ((note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
|
||||
note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) ||
|
||||
(note->n_type == 32 /* NT_AMD_AMDGPU_HSA_METADATA V3 */ &&
|
||||
note->n_namesz == sizeof "AMDGPU" && !memcmp(name, "AMDGPU", note->n_namesz))) {
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
amd_comgr_status_t status;
|
||||
amd_comgr_data_t binaryData;
|
||||
|
||||
@@ -118,6 +118,7 @@ class Program : public amd::HeapObject {
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
amd_comgr_metadata_node_t* metadata_; //!< COMgr metadata
|
||||
uint32_t codeObjectVer_; //!< version of code object
|
||||
std::map<std::string,amd_comgr_metadata_node_t> kernelMetadataMap_; //!< Map of kernel metadata
|
||||
#else
|
||||
CodeObjectMD* metadata_; //!< Runtime metadata
|
||||
@@ -211,6 +212,8 @@ class Program : public amd::HeapObject {
|
||||
auto it = kernelMetadataMap_.find(name);
|
||||
return (it == kernelMetadataMap_.end()) ? nullptr : &(it->second);
|
||||
}
|
||||
|
||||
const uint32_t codeObjectVer() const { return codeObjectVer_; }
|
||||
#else
|
||||
const CodeObjectMD* metadata() const { return metadata_; }
|
||||
#endif
|
||||
|
||||
@@ -515,10 +515,9 @@ clk_value_type_t KernelArg::type() const {
|
||||
|
||||
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
|
||||
const NullProgram& nullprog)
|
||||
: device::Kernel(gpuNullDev, name),
|
||||
: device::Kernel(gpuNullDev, name, nullprog),
|
||||
buildError_(CL_BUILD_PROGRAM_FAILURE),
|
||||
gpuDev_(gpuNullDev),
|
||||
prog_(nullprog),
|
||||
calRef_(NULL),
|
||||
internal_(false),
|
||||
flags_(0),
|
||||
@@ -3027,9 +3026,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
||||
|
||||
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
|
||||
uint extraArgsNum)
|
||||
: device::Kernel(prog->dev(), name),
|
||||
: device::Kernel(prog->dev(), name, *prog),
|
||||
compileOptions_(compileOptions),
|
||||
prog_(*prog),
|
||||
index_(0),
|
||||
code_(NULL),
|
||||
codeSize_(0),
|
||||
|
||||
@@ -406,7 +406,7 @@ class NullKernel : public device::Kernel {
|
||||
const NullDevice& nullDev() const { return gpuDev_; }
|
||||
|
||||
//! Returns GPU device object, associated with this kernel
|
||||
const NullProgram& nullProg() const { return prog_; }
|
||||
const NullProgram& nullProg() const { return reinterpret_cast<const NullProgram&>(prog_); }
|
||||
|
||||
//! Returns the kernel's build error
|
||||
const cl_int buildError() const { return buildError_; }
|
||||
@@ -455,7 +455,6 @@ class NullKernel : public device::Kernel {
|
||||
std::string ilSource_; //!< IL source code of this kernel
|
||||
|
||||
const NullDevice& gpuDev_; //!< GPU device object
|
||||
const NullProgram& prog_; //!< Reference to the parent program
|
||||
|
||||
CalImageReference* calRef_; //!< CAL image reference for this kernel
|
||||
bool internal_; //!< Runtime internal ker
|
||||
@@ -847,7 +846,6 @@ class HSAILKernel : public device::Kernel {
|
||||
std::vector<Argument*> arguments_; //!< Vector list of HSAIL Arguments
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
|
||||
const HSAILProgram& prog_; //!< Reference to the parent program
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
gpu::Memory* code_; //!< Memory object with ISA code
|
||||
|
||||
@@ -26,37 +26,17 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
|
||||
|
||||
namespace pal {
|
||||
|
||||
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
if (!sym) {
|
||||
return false;
|
||||
}
|
||||
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
||||
reinterpret_cast<void*>(&codeSize_))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
amd_kernel_code_t* akc = &akc_;
|
||||
// Copy codeobject of this kernel from the program CPU segment
|
||||
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
|
||||
|
||||
size_t akc_align = 0;
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
|
||||
reinterpret_cast<void*>(&akc_align))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
|
||||
const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs,
|
||||
const uint16_t numVGPRs) {
|
||||
workGroupInfo_.scratchRegs_ =
|
||||
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
||||
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
||||
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
|
||||
akc->workgroup_group_segment_byte_size;
|
||||
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
||||
amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
|
||||
workGroupInfo_.privateMemSize_ = privateSegmentSize;
|
||||
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
|
||||
workGroupInfo_.usedSGPRs_ = numSGPRs;
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
||||
workGroupInfo_.usedVGPRs_ = numVGPRs;
|
||||
|
||||
if (!prog().isNull()) {
|
||||
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
|
||||
@@ -72,16 +52,57 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
workGroupInfo_.availableVGPRs_ = 256;
|
||||
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
|
||||
}
|
||||
}
|
||||
|
||||
bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
|
||||
if (!sym) {
|
||||
return false;
|
||||
}
|
||||
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy code object of this kernel from the program CPU segment
|
||||
memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
|
||||
amd_kernel_code_t* akc = &akc_;
|
||||
|
||||
if (!setKernelCode(sym, akc)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
||||
reinterpret_cast<void*>(&codeSize_))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Setup the the workgroup info
|
||||
setWorkGroupInfo(akc->workitem_private_segment_byte_size,
|
||||
akc->workgroup_group_segment_byte_size,
|
||||
akc->wavefront_sgpr_count,
|
||||
akc->workitem_vgpr_count);
|
||||
|
||||
workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
|
||||
kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
|
||||
spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
|
||||
: device::Kernel(prog->dev(), name),
|
||||
: device::Kernel(prog->dev(), name, *prog),
|
||||
compileOptions_(compileOptions),
|
||||
prog_(*prog),
|
||||
index_(0),
|
||||
code_(0),
|
||||
codeSize_(0)
|
||||
codeSize_(0),
|
||||
workgroupGroupSegmentByteSize_(0),
|
||||
kernargSegmentByteSize_(0),
|
||||
spillSegmentByteSize_(0)
|
||||
{
|
||||
flags_.hsa_ = true;
|
||||
}
|
||||
@@ -388,37 +409,54 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
|
||||
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
flags_.internalKernel_ =
|
||||
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
|
||||
aqlCreateHWInfo(symbol);
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
|
||||
if (kernelMetaNode == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
KernelMD kernelMD;
|
||||
if (!GetAttrCodePropMetadata(*kernelMetaNode, argsBufferSize(), &kernelMD)) {
|
||||
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
|
||||
|
||||
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
|
||||
spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
|
||||
sizeof(uint32_t));
|
||||
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
|
||||
|
||||
// Copy codeobject of this kernel from the program CPU segment
|
||||
hsa_agent_t agent;
|
||||
agent.handle = 1;
|
||||
|
||||
auto sym = prog().GetSymbol(symbolName().c_str(), const_cast<hsa_agent_t*>(&agent));
|
||||
|
||||
if (!setKernelCode(sym, &akc_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
codeSize_ = prog().codeSegGpu().owner()->getSize();
|
||||
|
||||
// handle device enqueue
|
||||
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
|
||||
hsa_agent_t agent;
|
||||
agent.handle = 1;
|
||||
amd::hsa::loader::Symbol* rth_symbol;
|
||||
|
||||
// Get the runtime handle symbol GPU address
|
||||
rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
|
||||
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
uint64_t symbol_address;
|
||||
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
||||
|
||||
// Copy the kernel_object pointer to the runtime handle symbol GPU address
|
||||
const Memory& codeSegGpu = prog_.codeSegGpu();
|
||||
const Memory& codeSegGpu = prog().codeSegGpu();
|
||||
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
|
||||
uint64_t kernel_object = gpuAqlCode();
|
||||
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
|
||||
@@ -432,6 +470,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
|
||||
}
|
||||
|
||||
// Setup the the workgroup info
|
||||
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
|
||||
kernelMD.mCodeProps.mGroupSegmentFixedSize,
|
||||
kernelMD.mCodeProps.mNumSGPRs,
|
||||
kernelMD.mCodeProps.mNumVGPRs);
|
||||
|
||||
// Copy wavefront size
|
||||
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
||||
|
||||
@@ -452,7 +496,18 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
if (!printfStr.empty()) {
|
||||
InitPrintf(printfStr);
|
||||
}
|
||||
#else
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
|
||||
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
flags_.internalKernel_ =
|
||||
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
|
||||
aqlCreateHWInfo(symbol);
|
||||
|
||||
const CodeObjectMD* programMD = prog().metadata();
|
||||
assert(programMD != nullptr);
|
||||
|
||||
@@ -489,13 +544,13 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
amd::hsa::loader::Symbol* rth_symbol;
|
||||
|
||||
// Get the runtime handle symbol GPU address
|
||||
rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
|
||||
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
|
||||
const_cast<hsa_agent_t*>(&agent));
|
||||
uint64_t symbol_address;
|
||||
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
||||
|
||||
// Copy the kernel_object pointer to the runtime handle symbol GPU address
|
||||
const Memory& codeSegGpu = prog_.codeSegGpu();
|
||||
const Memory& codeSegGpu = prog().codeSegGpu();
|
||||
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
|
||||
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
|
||||
|
||||
@@ -529,8 +584,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
|
||||
waveLimiter_.enable();
|
||||
*/
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ class HSAILKernel : public device::Kernel {
|
||||
const HSAILProgram& prog() const;
|
||||
|
||||
//! Returns LDS size used in this kernel
|
||||
uint32_t ldsSize() const { return akc_.workgroup_group_segment_byte_size; }
|
||||
uint32_t ldsSize() const { return workgroupGroupSegmentByteSize_; }
|
||||
|
||||
//! Returns pointer on CPU to AQL code info
|
||||
const amd_kernel_code_t* cpuAqlCode() const { return &akc_; }
|
||||
@@ -71,10 +71,10 @@ class HSAILKernel : public device::Kernel {
|
||||
size_t aqlCodeSize() const { return codeSize_; }
|
||||
|
||||
//! Returns the size of argument buffer
|
||||
size_t argsBufferSize() const { return akc_.kernarg_segment_byte_size; }
|
||||
size_t argsBufferSize() const { return kernargSegmentByteSize_; }
|
||||
|
||||
//! Returns spill reg size per workitem
|
||||
uint32_t spillSegSize() const { return amd::alignUp(akc_.workitem_private_segment_byte_size, sizeof(uint32_t)); }
|
||||
uint32_t spillSegSize() const { return spillSegmentByteSize_; }
|
||||
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! if the kernel arguments were successfully loaded, otherwise NULL
|
||||
@@ -102,13 +102,25 @@ class HSAILKernel : public device::Kernel {
|
||||
//! Creates AQL kernel HW info
|
||||
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
|
||||
|
||||
//! Get the kernel code and copy the code object from the program CPU segment
|
||||
bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
|
||||
|
||||
//! Set up the workgroup info based on the kernel metadata
|
||||
void setWorkGroupInfo(const uint32_t privateSegmentSize,
|
||||
const uint32_t groupSegmentSize,
|
||||
const uint16_t numSGPRs,
|
||||
const uint16_t numVGPRs);
|
||||
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t akc_; //!< AQL kernel code on CPU
|
||||
const HSAILProgram& prog_; //!< Reference to the parent program
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
uint32_t workgroupGroupSegmentByteSize_; //!< LDS size used in the kernel
|
||||
uint32_t kernargSegmentByteSize_; //!< Size of kernel argument buffer
|
||||
uint32_t spillSegmentByteSize_; //!< Spill reg size per workitem
|
||||
};
|
||||
|
||||
class LightningKernel : public HSAILKernel {
|
||||
@@ -121,6 +133,11 @@ class LightningKernel : public HSAILKernel {
|
||||
|
||||
//! Initializes the metadata required for this kernel,
|
||||
bool init(amd::hsa::loader::Symbol* symbol);
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
//! Initializes the metadata required for this kernel,
|
||||
bool init();
|
||||
#endif
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
@@ -747,6 +747,25 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
auto kernelName = kernelMeta.first;
|
||||
auto kernel = new LightningKernel(kernelName, this,
|
||||
options->origOptionStr + ProcessOptions(options));
|
||||
kernels()[kernelName] = kernel;
|
||||
|
||||
if (!kernel->init()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
|
||||
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
||||
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
||||
maxScratchRegs_ =
|
||||
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
}
|
||||
#else
|
||||
// Get the list of kernels
|
||||
std::vector<std::string> kernelNameList;
|
||||
status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList);
|
||||
@@ -781,7 +800,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
maxScratchRegs_ =
|
||||
std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
}
|
||||
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
DestroySegmentCpuAccess();
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
|
||||
@@ -166,7 +166,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
}
|
||||
|
||||
// Update GPU specific settings and info structure if we have any
|
||||
#if defined(_WIN32)
|
||||
ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
|
||||
#else
|
||||
ModifyMaxWorkload modifyMaxWorkload = {0};
|
||||
#endif
|
||||
|
||||
// APU systems
|
||||
if (palProp.gpuType == Pal::GpuType::Integrated) {
|
||||
@@ -326,18 +330,22 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
if (modifyMaxWorkload.time > 0) {
|
||||
OSVERSIONINFOEX versionInfo = {0};
|
||||
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
|
||||
versionInfo.dwMajorVersion = 6;
|
||||
versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
|
||||
splitSizeForWin7_ = false;
|
||||
|
||||
DWORDLONG conditionMask = 0;
|
||||
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time;
|
||||
#if defined(_WIN32)
|
||||
OSVERSIONINFOEX versionInfo = {0};
|
||||
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
|
||||
versionInfo.dwMajorVersion = 6;
|
||||
versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
|
||||
|
||||
DWORDLONG conditionMask = 0;
|
||||
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
|
||||
|
||||
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
|
||||
splitSizeForWin7_ = true; // Update flag of DMA flush split size for Win 7
|
||||
if (modifyMaxWorkload.time > 0) {
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
|
||||
}
|
||||
}
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
@@ -60,7 +60,8 @@ class Settings : public device::Settings {
|
||||
uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround
|
||||
uint rgpSqttWaitIdle_: 1; //!< Wait for idle after SQTT trace
|
||||
uint rgpSqttForceDisable_: 1; //!< Disables SQTT
|
||||
uint reserved_ : 12;
|
||||
uint splitSizeForWin7_: 1; //!< DMA flush split size for Win 7
|
||||
uint reserved_ : 11;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -553,6 +553,11 @@ void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
|
||||
|
||||
void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
|
||||
uint instructions) {
|
||||
if (!dev.settings().splitSizeForWin7_) {
|
||||
dispatchSplitSize_ = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t workload = threads * instructions;
|
||||
if (maxDispatchWorkload_ < workload) {
|
||||
dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
|
||||
@@ -2258,6 +2263,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
|
||||
enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
// Run AQL dispatch in HW
|
||||
eventBegin(MainEngine);
|
||||
iCmd()->CmdDispatchAql(dispatchParam);
|
||||
|
||||
@@ -25,41 +25,58 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment)
|
||||
: device::Kernel(prog->dev(), name),
|
||||
program_(prog),
|
||||
: device::Kernel(prog->dev(), name, *prog),
|
||||
kernelCodeHandle_(kernelCodeHandle),
|
||||
workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
|
||||
workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize),
|
||||
kernargSegmentByteSize_(kernargSegmentByteSize),
|
||||
kernargSegmentAlignment_(kernargSegmentAlignment) {}
|
||||
|
||||
Kernel::Kernel(std::string name, Program* prog)
|
||||
: device::Kernel(prog->dev(), name, *prog),
|
||||
kernelCodeHandle_(0),
|
||||
workgroupGroupSegmentByteSize_(0),
|
||||
workitemPrivateSegmentByteSize_(0),
|
||||
kernargSegmentByteSize_(0),
|
||||
kernargSegmentAlignment_(0) {}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
|
||||
hsa_agent_t hsaDevice = program_->hsaDevice();
|
||||
hsa_agent_t hsaDevice = program()->hsaDevice();
|
||||
|
||||
const amd_comgr_metadata_node_t* kernelMetaNode =
|
||||
static_cast<LightningProgram*>(program_)->getKernelMetadata(name());
|
||||
static_cast<const LightningProgram*>(program())->getKernelMetadata(name());
|
||||
if (kernelMetaNode == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
KernelMD kernelMD;
|
||||
if (!GetAttrCodePropMetadata(*kernelMetaNode, KernargSegmentByteSize(), &kernelMD)) {
|
||||
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Set the kernel symbol name and size/alignment based on the kernel metadata
|
||||
// NOTE: kernel name is used to get the kernel code handle in V2,
|
||||
// but kernel symbol name is used in V3
|
||||
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
|
||||
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
|
||||
workitemPrivateSegmentByteSize_ = kernelMD.mCodeProps.mPrivateSegmentFixedSize;
|
||||
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
|
||||
kernargSegmentAlignment_ = amd::alignUp(std::max(kernelMD.mCodeProps.mKernargSegmentAlign, 128u),
|
||||
dev().info().globalMemCacheLineSize_);
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
|
||||
assert(workGroupInfo_.availableLDSSize_ > 0);
|
||||
|
||||
// Get the available SGPRs and VGPRs
|
||||
std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget();
|
||||
if (program_->xnackEnable()) {
|
||||
std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program()->machineTarget();
|
||||
if (program()->xnackEnable()) {
|
||||
targetIdent.append("+xnack");
|
||||
}
|
||||
if (program_->sramEccEnable()) {
|
||||
if (program()->sramEccEnable()) {
|
||||
targetIdent.append("+sram-ecc");
|
||||
}
|
||||
|
||||
@@ -67,10 +84,23 @@ bool LightningKernel::init() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the kernel code handle
|
||||
hsa_status_t hsaStatus;
|
||||
hsa_executable_symbol_t symbol;
|
||||
hsa_agent_t agent = program()->hsaDevice();
|
||||
hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
|
||||
symbolName().c_str(),
|
||||
&agent, &symbol);
|
||||
if (hsaStatus == HSA_STATUS_SUCCESS) {
|
||||
hsaStatus = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
|
||||
&kernelCodeHandle_);
|
||||
}
|
||||
if (hsaStatus != HSA_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
|
||||
hsa_agent_t agent = program_->hsaDevice();
|
||||
hsa_executable_symbol_t kernelSymbol;
|
||||
hsa_status_t hsaStatus;
|
||||
int variable_size;
|
||||
uint64_t variable_address;
|
||||
|
||||
@@ -79,7 +109,7 @@ bool LightningKernel::init() {
|
||||
// object handle of such a kernel. The address of the variable and the kernel code object handle are known
|
||||
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
|
||||
// address of the variable.
|
||||
hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(),
|
||||
hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
|
||||
kernelMD.mAttrs.mRuntimeHandle.c_str(),
|
||||
&agent, &kernelSymbol);
|
||||
if (hsaStatus == HSA_STATUS_SUCCESS) {
|
||||
@@ -87,11 +117,6 @@ bool LightningKernel::init() {
|
||||
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
|
||||
&variable_size);
|
||||
}
|
||||
if (hsaStatus == HSA_STATUS_SUCCESS) {
|
||||
hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
|
||||
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
|
||||
&variable_size);
|
||||
}
|
||||
if (hsaStatus == HSA_STATUS_SUCCESS) {
|
||||
hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
|
||||
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
|
||||
@@ -114,7 +139,7 @@ bool LightningKernel::init() {
|
||||
}
|
||||
|
||||
uint32_t wavefront_size = 0;
|
||||
if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
|
||||
if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
@@ -127,7 +152,7 @@ bool LightningKernel::init() {
|
||||
workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs;
|
||||
workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs;
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
workGroupInfo_.wavefrontSize_ = wavefront_size;
|
||||
workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
|
||||
if (workGroupInfo_.size_ == 0) {
|
||||
@@ -135,7 +160,7 @@ bool LightningKernel::init() {
|
||||
}
|
||||
|
||||
// handle the printf metadata if any
|
||||
const amd_comgr_metadata_node_t* programMD = static_cast<LightningProgram*>(program_)->metadata();
|
||||
const amd_comgr_metadata_node_t* programMD = static_cast<const LightningProgram*>(program())->metadata();
|
||||
assert(programMD != nullptr);
|
||||
|
||||
std::vector<std::string> printfStr;
|
||||
@@ -159,10 +184,10 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
|
||||
}
|
||||
|
||||
bool LightningKernel::init() {
|
||||
hsa_agent_t hsaDevice = program_->hsaDevice();
|
||||
hsa_agent_t hsaDevice = program()->hsaDevice();
|
||||
|
||||
// Pull out metadata from the ELF
|
||||
const CodeObjectMD* programMD = static_cast<LightningProgram*>(program_)->metadata();
|
||||
const CodeObjectMD* programMD = static_cast<const LightningProgram*>(program())->metadata();
|
||||
assert(programMD != nullptr);
|
||||
|
||||
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
|
||||
@@ -172,7 +197,7 @@ bool LightningKernel::init() {
|
||||
InitParameters(*kernelMD, KernargSegmentByteSize());
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
|
||||
workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
|
||||
assert(workGroupInfo_.availableLDSSize_ > 0);
|
||||
workGroupInfo_.availableSGPRs_ = 104;
|
||||
workGroupInfo_.availableVGPRs_ = 256;
|
||||
@@ -196,7 +221,7 @@ bool LightningKernel::init() {
|
||||
}
|
||||
|
||||
if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
|
||||
hsa_agent_t agent = program_->hsaDevice();
|
||||
hsa_agent_t agent = program()->hsaDevice();
|
||||
hsa_executable_symbol_t kernelSymbol;
|
||||
hsa_status_t status;
|
||||
int variable_size;
|
||||
@@ -208,7 +233,7 @@ bool LightningKernel::init() {
|
||||
// only after the hsa executable is loaded. The below code copies the kernel code object handle to the
|
||||
// address of the variable.
|
||||
|
||||
status = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
|
||||
status = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
|
||||
&agent, &kernelSymbol);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return false;
|
||||
@@ -239,7 +264,7 @@ bool LightningKernel::init() {
|
||||
}
|
||||
|
||||
uint32_t wavefront_size = 0;
|
||||
if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
|
||||
if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
@@ -258,7 +283,7 @@ bool LightningKernel::init() {
|
||||
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
|
||||
workGroupInfo_.wavefrontSize_ = wavefront_size;
|
||||
|
||||
@@ -278,18 +303,18 @@ bool LightningKernel::init() {
|
||||
bool HSAILKernel::init() {
|
||||
acl_error errorCode;
|
||||
// compile kernel down to ISA
|
||||
hsa_agent_t hsaDevice = program_->hsaDevice();
|
||||
hsa_agent_t hsaDevice = program()->hsaDevice();
|
||||
// Pull out metadata from the ELF
|
||||
size_t sizeOfArgList;
|
||||
aclCompiler* compileHandle = program_->dev().compiler();
|
||||
aclCompiler* compileHandle = program()->dev().compiler();
|
||||
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
|
||||
openClKernelName.c_str(), nullptr, &sizeOfArgList);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
std::unique_ptr<char[]> argList(new char[sizeOfArgList]);
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
|
||||
openClKernelName.c_str(), argList.get(), &sizeOfArgList);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
return false;
|
||||
@@ -300,17 +325,17 @@ bool HSAILKernel::init() {
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
|
||||
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
|
||||
workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
|
||||
assert(workGroupInfo_.availableLDSSize_ > 0);
|
||||
workGroupInfo_.availableSGPRs_ = 104;
|
||||
workGroupInfo_.availableVGPRs_ = 256;
|
||||
size_t sizeOfWorkGroupSize;
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
|
||||
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
|
||||
openClKernelName.c_str(), workGroupInfo_.compileSize_,
|
||||
&sizeOfWorkGroupSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
@@ -319,7 +344,7 @@ bool HSAILKernel::init() {
|
||||
|
||||
uint32_t wavefront_size = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
|
||||
hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
|
||||
return false;
|
||||
}
|
||||
assert(wavefront_size > 0);
|
||||
@@ -344,18 +369,18 @@ bool HSAILKernel::init() {
|
||||
}
|
||||
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
|
||||
workGroupInfo_.wavefrontSize_ = wavefront_size;
|
||||
if (workGroupInfo_.compileSize_[0] != 0) {
|
||||
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
||||
workGroupInfo_.compileSize_[2];
|
||||
} else {
|
||||
workGroupInfo_.size_ = program_->dev().info().preferredWorkGroupSize_;
|
||||
workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_;
|
||||
}
|
||||
|
||||
// Pull out printf metadata from the ELF
|
||||
size_t sizeOfPrintfList;
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY,
|
||||
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
return false;
|
||||
@@ -367,7 +392,7 @@ bool HSAILKernel::init() {
|
||||
if (!aclPrintfList) {
|
||||
return false;
|
||||
}
|
||||
errorCode = aclQueryInfo(compileHandle, program_->binaryElf(),
|
||||
errorCode = aclQueryInfo(compileHandle, program()->binaryElf(),
|
||||
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(),
|
||||
aclPrintfList.get(), &sizeOfPrintfList);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
|
||||
@@ -22,6 +22,8 @@ class Kernel : public device::Kernel {
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment);
|
||||
|
||||
Kernel(std::string name, Program* prog);
|
||||
|
||||
const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; }
|
||||
|
||||
const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; }
|
||||
@@ -37,15 +39,15 @@ class Kernel : public device::Kernel {
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() = 0;
|
||||
|
||||
const Program* program() const { return static_cast<const Program*>(program_); }
|
||||
const Program* program() const { return static_cast<const Program*>(&prog_); }
|
||||
|
||||
protected:
|
||||
Program* program_; //!< The roc::Program context
|
||||
// Program* program_; //!< The roc::Program context
|
||||
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
|
||||
const uint32_t workgroupGroupSegmentByteSize_;
|
||||
const uint32_t workitemPrivateSegmentByteSize_;
|
||||
const uint32_t kernargSegmentByteSize_;
|
||||
const uint32_t kernargSegmentAlignment_;
|
||||
uint32_t workgroupGroupSegmentByteSize_;
|
||||
uint32_t workitemPrivateSegmentByteSize_;
|
||||
uint32_t kernargSegmentByteSize_;
|
||||
uint32_t kernargSegmentAlignment_;
|
||||
size_t kernelDirectiveOffset_;
|
||||
};
|
||||
|
||||
@@ -74,6 +76,10 @@ class LightningKernel : public roc::Kernel {
|
||||
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
|
||||
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
|
||||
}
|
||||
|
||||
LightningKernel(std::string name, Program* prog)
|
||||
: roc::Kernel(name, prog) {}
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() final;
|
||||
};
|
||||
|
||||
@@ -488,6 +488,19 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
const std::string kernelName = kernelMeta.first;
|
||||
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
|
||||
if (!aKernel->init()) {
|
||||
return false;
|
||||
}
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
#else
|
||||
// Get the list of kernels
|
||||
std::vector<std::string> kernelNameList;
|
||||
status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback,
|
||||
@@ -582,6 +595,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
|
||||
std::string::npos);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
||||
return true;
|
||||
}
|
||||
|
||||
새 이슈에서 참조
사용자 차단