From fe73459442a23a66957956217e81fcb0ab315c19 Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 22 Nov 2018 14:04:51 -0500 Subject: [PATCH] P4 to Git Change 1710776 by wchau@wchau_OCL_boltzmann on 2018/11/22 12:54:51 SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#70 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#45 edit [ROCm/clr commit: 8bce4926b3a2c49745de6df0ed0aeb6c96cc3cb5] --- .../clr/rocclr/runtime/device/devkernel.cpp | 304 +++++++++++- .../clr/rocclr/runtime/device/devkernel.hpp | 437 ++++++++++++++++++ .../clr/rocclr/runtime/device/devprogram.cpp | 28 +- .../clr/rocclr/runtime/device/devprogram.hpp | 12 + .../rocclr/runtime/device/pal/palkernel.cpp | 48 ++ .../rocclr/runtime/device/rocm/rockernel.cpp | 97 ++++ 6 files changed, 919 insertions(+), 7 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp index 45f828e9af..dc7b8284e4 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.cpp +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -10,6 +10,7 @@ #include "utils/bif_section_labels.hpp" #include "utils/libUtils.h" +#include #include #include @@ -19,6 +20,11 @@ #include "llvm/Support/AMDGPUMetadata.h" typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; + +using llvm::AMDGPU::HSAMD::AccessQualifier; +using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; +using llvm::AMDGPU::HSAMD::ValueKind; +using llvm::AMDGPU::HSAMD::ValueType; #endif // 
defined(WITH_LIGHTNING_COMPILER) namespace device { @@ -223,11 +229,6 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, } // ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) -using llvm::AMDGPU::HSAMD::AccessQualifier; -using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; -using llvm::AMDGPU::HSAMD::ValueKind; -using llvm::AMDGPU::HSAMD::ValueType; - static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) { switch (lcArg.mValueKind) { case ValueKind::GlobalBuffer: @@ -769,6 +770,296 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a // ================================================================================================ #if defined(WITH_LIGHTNING_COMPILER) +#if defined(USE_COMGR_LIBRARY) +bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t programMD, + const uint32_t kernargSegmentByteSize, + KernelMD* kernelMD) { + + amd_comgr_metadata_node_t kernelMeta = {0}; + + if (!GetKernelMetadata(programMD, name(), &kernelMeta)) { + if (kernelMeta.handle != 0) { + amd_comgr_destroy_metadata(kernelMeta); + } + return false; + } + + InitParameters(kernelMeta, kernargSegmentByteSize); + + // Set the workgroup information for the kernel + workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_; + assert(workGroupInfo_.availableLDSSize_ > 0); + workGroupInfo_.availableSGPRs_ = 104; + workGroupInfo_.availableVGPRs_ = 256; + + // extract the attribute metadata if there is any + amd_comgr_metadata_node_t attrMeta; + amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS; + if (amd_comgr_metadata_lookup(kernelMeta, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_iterate_map_metadata(attrMeta, device::populateAttrs, + static_cast(kernelMD)); + amd_comgr_destroy_metadata(attrMeta); + } + + // extract the code properties metadata + 
amd_comgr_metadata_node_t codePropsMeta; + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_metadata_lookup(kernelMeta, "CodeProps", &codePropsMeta); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_iterate_map_metadata(codePropsMeta, device::populateCodeProps, + static_cast(kernelMD)); + amd_comgr_destroy_metadata(codePropsMeta); + } + + amd_comgr_destroy_metadata(kernelMeta); + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return false; + } + + // Setup the workgroup info based on the attributes and code properties + if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { + const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; + workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; + workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; + workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; + } + + if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { + const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; + workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; + workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; + workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; + } + + if (!kernelMD->mAttrs.mVecTypeHint.empty()) { + workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str(); + } + + return true; +} + +bool Kernel::GetKernelMetadata(const amd_comgr_metadata_node_t programMD, + const std::string& name, + amd_comgr_metadata_node_t* kernelNode) { + amd_comgr_status_t status; + amd_comgr_metadata_node_t kernelsMD; + amd_comgr_metadata_kind_t kind; + size_t size = 0; + + status = amd_comgr_metadata_lookup(programMD, "Kernels", &kernelsMD); + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_get_metadata_list_size(kernelsMD, &size); + } + + bool kernelFound = false; + for (size_t i = 0; i < size && !kernelFound && status == AMD_COMGR_STATUS_SUCCESS; i++) { + size_t nameSize; + std::string kernelName; + + amd_comgr_metadata_node_t 
nameMeta; + // Remember which handles were actually produced so that only valid nodes are destroyed + bool haveNode = false; + bool haveName = false; + + status = amd_comgr_index_list_metadata(kernelsMD, i, kernelNode); + if (status == AMD_COMGR_STATUS_SUCCESS) { + haveNode = true; + status = amd_comgr_metadata_lookup(*kernelNode, "Name", &nameMeta); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + haveName = true; + status = getMetaBuf(nameMeta, &kernelName); + } + + if ((status == AMD_COMGR_STATUS_SUCCESS) && (name.compare(kernelName) == 0)) { + kernelFound = true; + } + + if (haveName) { + amd_comgr_destroy_metadata(nameMeta); + } + + // Only the matching node is handed back through kernelNode; release every other node here and + // clear the handle so a caller that checks handle != 0 does not destroy it a second time + if (haveNode && !kernelFound) { + amd_comgr_destroy_metadata(*kernelNode); + kernelNode->handle = 0; + } + } + + amd_comgr_destroy_metadata(kernelsMD); + + return kernelFound; +} + +bool Kernel::SetAvailableSgprVgpr(const std::string& targetIdent) { + std::string buf; + + amd_comgr_metadata_node_t isaMeta; + amd_comgr_metadata_node_t sgprMeta; + amd_comgr_metadata_node_t vgprMeta; + + amd_comgr_status_t status = amd_comgr_get_isa_metadata(targetIdent.c_str(), &isaMeta); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_metadata_lookup(isaMeta, "AddressableNumSGPRs", &sgprMeta); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(sgprMeta, &buf); + } + + workGroupInfo_.availableSGPRs_ = (status == AMD_COMGR_STATUS_SUCCESS) ? atoi(buf.c_str()) : 0; + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_metadata_lookup(isaMeta, "AddressableNumVGPRs", &vgprMeta); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(vgprMeta, &buf); + } + workGroupInfo_.availableVGPRs_ = (status == AMD_COMGR_STATUS_SUCCESS) ? 
atoi(buf.c_str()) : 0; + + amd_comgr_destroy_metadata(vgprMeta); + amd_comgr_destroy_metadata(sgprMeta); + amd_comgr_destroy_metadata(isaMeta); + + return (status == AMD_COMGR_STATUS_SUCCESS); +} + +// Collects every string of the program-level "Printf" metadata list into printfStr. +// Returns true when the list is absent or was read completely, false on any COMGR failure. +bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD, + std::vector* printfStr) { + + amd_comgr_metadata_node_t printfMeta; + amd_comgr_status_t status = amd_comgr_metadata_lookup(programMD, "Printf", &printfMeta); + if (status != AMD_COMGR_STATUS_SUCCESS) { + return true; // printf string metadata is not provided so just exit + } + + // handle the printf string + size_t printfSize = 0; + status = amd_comgr_get_metadata_list_size(printfMeta, &printfSize); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + std::string buf; + for (size_t i = 0; i < printfSize; ++i) { + amd_comgr_metadata_node_t str; + status = amd_comgr_index_list_metadata(printfMeta, i, &str); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(str, &buf); + amd_comgr_destroy_metadata(str); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + break; // fall through to the common cleanup so printfMeta is released; the final return reports the failure + } + + printfStr->push_back(buf); + } + } + + amd_comgr_destroy_metadata(printfMeta); + return (status == AMD_COMGR_STATUS_SUCCESS); +} + +void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) { + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + size_t offsetStruct = argBufferSize; + + amd_comgr_metadata_node_t argsMeta; + size_t argsSize; + + amd_comgr_status_t status = amd_comgr_metadata_lookup(kernelMD, "Args", &argsMeta); + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_get_metadata_list_size(argsMeta, &argsSize); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return; + } + + for (size_t i = 0; i < argsSize; ++i) { + KernelArgMD lcArg; + + amd_comgr_metadata_node_t argsNode; + amd_comgr_metadata_kind_t 
kind; + + status = amd_comgr_index_list_metadata(argsMeta, i, &argsNode); + // argsNode is a valid handle only when the index lookup above succeeded + const bool haveArgsNode = (status == AMD_COMGR_STATUS_SUCCESS); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_get_metadata_kind(argsNode, &kind); + } + // kind is written only on success, so it must not be inspected after a failure + if ((status == AMD_COMGR_STATUS_SUCCESS) && (kind != AMD_COMGR_METADATA_KIND_MAP)) { + status = AMD_COMGR_STATUS_ERROR; + } + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_iterate_map_metadata(argsNode, populateArgs, static_cast(&lcArg)); + } + + if (haveArgsNode) { + amd_comgr_destroy_metadata(argsNode); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + amd_comgr_destroy_metadata(argsMeta); + return; + } + + size_t size = GetArgSizeOCL(lcArg); + size_t alignment = GetArgAlignmentOCL(lcArg); + bool isHidden = false; + desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden); + + // Allocate the hidden arguments, but abstraction layer will skip them + if (isHidden) { + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) { + setDynamicParallelFlag(true); + } + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + desc.size_ = size; + offset += size; + hiddenParams.push_back(desc); + continue; + } + + desc.name_ = lcArg.mName.c_str(); + desc.type_ = GetOclTypeOCL(lcArg, size); + desc.typeName_ = lcArg.mTypeName.c_str(); + + desc.addressQualifier_ = GetOclAddrQualOCL(lcArg); + desc.accessQualifier_ = GetOclAccessQualOCL(lcArg); + desc.typeQualifier_ = GetOclTypeQualOCL(lcArg); + desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg); + desc.size_ = size; + + // These objects have forced data size to uint64_t + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } + else { + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + offset += size; + } + + // Update read only flag + 
desc.info_.readOnly_ = GetReadOnlyOCL(lcArg); + + params.push_back(desc); + + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + flags_.imageWriteEna_ = true; + } + } + } + + amd_comgr_destroy_metadata(argsMeta); + + // Save the number of OCL arguments + uint32_t numParams = params.size(); + // Append the hidden arguments to the OCL arguments + params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); + createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); +} +#else // not define USE_COMGR_LIBRARY void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { // Iterate through the arguments and insert into parameterList device::Kernel::parameters_t params; @@ -843,7 +1134,8 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); } -#endif +#endif // defined(USE_COMGR_LIBRARY) +#endif // defined(WITH_LIGHTNING_COMPILER) // ================================================================================================ #if defined(WITH_COMPILER_LIB) diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp index 9f7a57436b..3e13441afe 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.hpp +++ b/projects/clr/rocclr/runtime/device/devkernel.hpp @@ -25,6 +25,149 @@ struct RuntimeHandle { uint32_t group_segment_size; //!< From GROUP_SEGMENT_FIXED_SIZE }; +#if defined(USE_COMGR_LIBRARY) +#include "llvm/Support/AMDGPUMetadata.h" +typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; + +using llvm::AMDGPU::HSAMD::AccessQualifier; +using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; +using llvm::AMDGPU::HSAMD::ValueKind; +using llvm::AMDGPU::HSAMD::ValueType; + +enum class 
ArgField : uint8_t { + Name = 0, + TypeName = 1, + Size = 2, + Align = 3, + ValueKind = 4, + ValueType = 5, + PointeeAlign = 6, + AddrSpaceQual = 7, + AccQual = 8, + ActualAccQual = 9, + IsConst = 10, + IsRestrict = 11, + IsVolatile = 12, + IsPipe = 13 +}; + +enum class AttrField : uint8_t { + ReqWorkGroupSize = 0, + WorkGroupSizeHint = 1, + VecTypeHint = 2, + RuntimeHandle = 3 +}; + +enum class CodePropField : uint8_t { + KernargSegmentSize = 0, + GroupSegmentFixedSize = 1, + PrivateSegmentFixedSize = 2, + KernargSegmentAlign = 3, + WavefrontSize = 4, + NumSGPRs = 5, + NumVGPRs = 6, + MaxFlatWorkGroupSize = 7, + IsDynamicCallStack = 8, + IsXNACKEnabled = 9, + NumSpilledSGPRs = 10, + NumSpilledVGPRs = 11 +}; + + +static const std::map ArgFieldMap = +{ + {"Name", ArgField::Name}, + {"TypeName", ArgField::TypeName}, + {"Size", ArgField::Size}, + {"Align", ArgField::Align}, + {"ValueKind", ArgField::ValueKind}, + {"ValueType", ArgField::ValueType}, + {"PointeeAlign", ArgField::PointeeAlign}, + {"AddrSpaceQual", ArgField::AddrSpaceQual}, + {"AccQual", ArgField::AccQual}, + {"ActualAccQual", ArgField::ActualAccQual}, + {"IsConst", ArgField::IsConst}, + {"IsRestrict", ArgField::IsRestrict}, + {"IsVolatile", ArgField::IsVolatile}, + {"IsPipe", ArgField::IsPipe} +}; + +static const std::map ArgValueKind = +{ + {"ByValue", ValueKind::ByValue}, + {"GlobalBuffer", ValueKind::GlobalBuffer}, + {"DynamicSharedPointer", ValueKind::DynamicSharedPointer}, + {"Sampler", ValueKind::Sampler}, + {"Image", ValueKind::Image}, + {"Pipe", ValueKind::Pipe}, + {"Queue", ValueKind::Queue}, + {"HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX}, + {"HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY}, + {"HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ}, + {"HiddenNone", ValueKind::HiddenNone}, + {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, + {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, + {"HiddenCompletionAction", ValueKind::HiddenCompletionAction} +}; + +static 
const std::map ArgValueType = +{ + {"Struct", ValueType::Struct}, + {"I8", ValueType::I8}, + {"U8", ValueType::U8}, + {"I16", ValueType::I16}, + {"U16", ValueType::U16}, + {"F16", ValueType::F16}, + {"I32", ValueType::I32}, + {"U32", ValueType::U32}, + {"F32", ValueType::F32}, + {"I64", ValueType::I64}, + {"U64", ValueType::U64}, + {"F64", ValueType::F64} +}; + +static const std::map ArgAccQual = +{ + {"Default", AccessQualifier::Default}, + {"ReadOnly", AccessQualifier::ReadOnly}, + {"WriteOnly", AccessQualifier::WriteOnly}, + {"ReadWrite", AccessQualifier::ReadWrite} +}; + +static const std::map ArgAddrSpaceQual = +{ + {"Private", AddressSpaceQualifier::Private}, + {"Global", AddressSpaceQualifier::Global}, + {"Constant", AddressSpaceQualifier::Constant}, + {"Local", AddressSpaceQualifier::Local}, + {"Generic", AddressSpaceQualifier::Generic}, + {"Region", AddressSpaceQualifier::Region} +}; + +static const std::map AttrFieldMap = +{ + {"ReqWorkGroupSize", AttrField::ReqWorkGroupSize}, + {"WorkGroupSizeHint", AttrField::WorkGroupSizeHint}, + {"VecTypeHint", AttrField::VecTypeHint}, + {"RuntimeHandle", AttrField::RuntimeHandle} +}; + +static const std::map CodePropFieldMap = +{ + {"KernargSegmentSize", CodePropField::KernargSegmentSize}, + {"GroupSegmentFixedSize", CodePropField::GroupSegmentFixedSize}, + {"PrivateSegmentFixedSize", CodePropField::PrivateSegmentFixedSize}, + {"KernargSegmentAlign", CodePropField::KernargSegmentAlign}, + {"WavefrontSize", CodePropField::WavefrontSize}, + {"NumSGPRs", CodePropField::NumSGPRs}, + {"NumVGPRs", CodePropField::NumVGPRs}, + {"MaxFlatWorkGroupSize", CodePropField::MaxFlatWorkGroupSize}, + {"IsDynamicCallStack", CodePropField::IsDynamicCallStack}, + {"IsXNACKEnabled", CodePropField::IsXNACKEnabled}, + {"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs}, + {"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs} +}; +#endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) namespace amd { @@ -228,7 +371,28 
@@ class Kernel : public amd::HeapObject { protected: //! Initializes the abstraction layer kernel parameters #if defined(WITH_LIGHTNING_COMPILER) +#if defined(USE_COMGR_LIBRARY) + void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize); + + //! Get ther kernel metadata + bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD, + const std::string& name, + amd_comgr_metadata_node_t* kernelNode); + + //! Retrieve kernel attribute and code properties metadata + bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t programMD, + const uint32_t kernargSegmentByteSize, + KernelMD* kernelMD); + + //! Retrieve the available SGPRs and VGPRs + bool SetAvailableSgprVgpr(const std::string& targetIdent); + + //! Retrieve the printf string metadata + bool GetPrintfStr(const amd_comgr_metadata_node_t programMD, + std::vector* printfStr); +#else void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize); +#endif //! Initializes HSAIL Printf metadata and info for LC void InitPrintf(const std::vector& printfInfoStrings); #endif @@ -270,4 +434,277 @@ class Kernel : public amd::HeapObject { std::unordered_map patchReferences_; //!< Patch table for references }; +#if defined(USE_COMGR_LIBRARY) +static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, + std::string* str) { + size_t size = 0; + amd_comgr_status_t status = amd_comgr_get_metadata_string(meta, &size, NULL); + + if (status == AMD_COMGR_STATUS_SUCCESS) { + str->resize(size-1); // minus one to discount the null character + status = amd_comgr_get_metadata_string(meta, &size, &((*str)[0])); + } + + return status; +} + +static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + std::string buf; + + // get the key of the argument field + size_t size = 0; + status = amd_comgr_get_metadata_kind(key, &kind); + if (kind == 
AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itArgField = ArgFieldMap.find(buf); + if (itArgField == ArgFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + // get the value of the argument field + status = getMetaBuf(value, &buf); + // on failure buf still holds the key text, so stop here rather than store it as the field value + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + KernelArgMD* lcArg = static_cast(data); + + switch (itArgField->second) { + case ArgField::Name: + lcArg->mName = buf; + break; + case ArgField::TypeName: + lcArg->mTypeName = buf; + break; + case ArgField::Size: + lcArg->mSize = atoi(buf.c_str()); + break; + case ArgField::Align: + lcArg->mAlign = atoi(buf.c_str()); + break; + case ArgField::ValueKind: + { + auto itValueKind = ArgValueKind.find(buf); + if (itValueKind == ArgValueKind.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueKind = itValueKind->second; + } + break; + case ArgField::ValueType: + { + auto itValueType = ArgValueType.find(buf); + if (itValueType == ArgValueType.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mValueType = itValueType->second; + } + break; + case ArgField::PointeeAlign: + lcArg->mPointeeAlign = atoi(buf.c_str()); + break; + case ArgField::AddrSpaceQual: + { + auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf); + if (itAddrSpaceQual == ArgAddrSpaceQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAddrSpaceQual = itAddrSpaceQual->second; + } + break; + case ArgField::AccQual: + { + auto itAccQual = ArgAccQual.find(buf); + if (itAccQual == ArgAccQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mAccQual = itAccQual->second; + } + break; + case ArgField::ActualAccQual: + { + auto itAccQual = ArgAccQual.find(buf); + if (itAccQual == ArgAccQual.end()) { + return AMD_COMGR_STATUS_ERROR; + } + lcArg->mActualAccQual = itAccQual->second; + } + break; + case ArgField::IsConst: + lcArg->mIsConst = (buf.compare("true") == 0); + break; + case ArgField::IsRestrict: 
+ lcArg->mIsRestrict = (buf.compare("true") == 0); + break; + case ArgField::IsVolatile: + lcArg->mIsVolatile = (buf.compare("true") == 0); + break; + case ArgField::IsPipe: + lcArg->mIsPipe = (buf.compare("true") == 0); + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + return AMD_COMGR_STATUS_SUCCESS; +} + +static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + size_t size = 0; + std::string buf; + + // get the key of the argument field + status = amd_comgr_get_metadata_kind(key, &kind); + if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itAttrField = AttrFieldMap.find(buf); + if (itAttrField == AttrFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + KernelMD* kernelMD = static_cast(data); + switch (itAttrField->second) { + case AttrField::ReqWorkGroupSize: + { + status = amd_comgr_get_metadata_list_size(value, &size); + if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSize; + status = amd_comgr_index_list_metadata(value, i, &workgroupSize); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str())); + } + amd_comgr_destroy_metadata(workgroupSize); + } + } + } + break; + case AttrField::WorkGroupSizeHint: + { + status = amd_comgr_get_metadata_list_size(value, &size); + if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) { + for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { + amd_comgr_metadata_node_t workgroupSizeHint; + status = amd_comgr_index_list_metadata(value, i, 
&workgroupSizeHint); + + if (status == AMD_COMGR_STATUS_SUCCESS && + getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str())); + } + amd_comgr_destroy_metadata(workgroupSizeHint); + } + } + } + break; + case AttrField::VecTypeHint: + { + if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mVecTypeHint = buf; + } + } + break; + case AttrField::RuntimeHandle: + { + if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { + kernelMD->mAttrs.mRuntimeHandle = buf; + } + } + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + + return status; +} + +// Map-iteration callback: stores one "CodeProps" key/value pair into the KernelMD passed via data. +// Returns AMD_COMGR_STATUS_ERROR for an unreadable or unknown key, or an unreadable value. +static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key, + const amd_comgr_metadata_node_t value, + void *data) { + amd_comgr_status_t status; + amd_comgr_metadata_kind_t kind; + std::string buf; + + // get the key of the argument field; status is tested first because kind is only set on success + status = amd_comgr_get_metadata_kind(key, &kind); + if (status == AMD_COMGR_STATUS_SUCCESS && kind == AMD_COMGR_METADATA_KIND_STRING) { + status = getMetaBuf(key, &buf); + } + + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + auto itCodePropField = CodePropFieldMap.find(buf); + if (itCodePropField == CodePropFieldMap.end()) { + return AMD_COMGR_STATUS_ERROR; + } + + // get the value of the argument field + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = getMetaBuf(value, &buf); + } + // on failure buf still holds the key text, so do not convert it into a property value + if (status != AMD_COMGR_STATUS_SUCCESS) { + return AMD_COMGR_STATUS_ERROR; + } + + KernelMD* kernelMD = static_cast(data); + switch (itCodePropField->second) { + case CodePropField::KernargSegmentSize: + kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); + break; + case CodePropField::GroupSegmentFixedSize: + kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str()); + break; + case CodePropField::PrivateSegmentFixedSize: + kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str()); + break; + case CodePropField::KernargSegmentAlign: + kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str()); + 
break; + case CodePropField::WavefrontSize: + kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str()); + break; + case CodePropField::NumSGPRs: + kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str()); + break; + case CodePropField::NumVGPRs: + kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str()); + break; + case CodePropField::MaxFlatWorkGroupSize: + kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str()); + break; + case CodePropField::IsDynamicCallStack: + kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0); + break; + case CodePropField::IsXNACKEnabled: + kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0); + break; + case CodePropField::NumSpilledSGPRs: + kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str()); + break; + case CodePropField::NumSpilledVGPRs: + kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str()); + break; + default: + return AMD_COMGR_STATUS_ERROR; + } + return AMD_COMGR_STATUS_SUCCESS; +} +#endif + } // namespace device diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp index 28d06e0fff..913668eeed 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.cpp +++ b/projects/clr/rocclr/runtime/device/devprogram.cpp @@ -197,7 +197,7 @@ void Program::extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, status = amd_comgr_action_data_get_data(inDataSet, dataKind, 0, &binaryData); } - size_t binarySize; + size_t binarySize = 0; if (status == AMD_COMGR_STATUS_SUCCESS) { status = amd_comgr_get_data(binaryData, &binarySize, NULL); } @@ -1224,6 +1224,9 @@ bool Program::linkImplLC(amd::option::Options* options) { acl_error errorCode; aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; + internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? 
+ true : false; + amd_comgr_data_set_t inputs; if (amd_comgr_create_data_set(&inputs) != AMD_COMGR_STATUS_SUCCESS) { buildLog_ += "Error: COMGR fails to create data set for linking.\n"; @@ -2672,6 +2675,28 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) { else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ && note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) { +#if defined(USE_COMGR_LIBRARY) + amd_comgr_status_t status; + amd_comgr_data_t binaryData; + + status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &binaryData); + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd_comgr_set_data(binaryData, binSize, + reinterpret_cast(binary)); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + metadata_ = new amd_comgr_metadata_node_t; + status = amd_comgr_get_data_metadata(binaryData, metadata_); + } + + amd_comgr_release_data(binaryData); + + if (status != AMD_COMGR_STATUS_SUCCESS) { + buildLog_ += "Error: COMGR fails to get the metadata.\n"; + return false; + } +#else std::string metadataStr((const char*)desc, (size_t)note->n_descsz); metadata_ = new CodeObjectMD(); if (llvm::AMDGPU::HSAMD::fromString(metadataStr, *metadata_)) { @@ -2680,6 +2705,7 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) { } // We've found and loaded the runtime metadata, exit the // note record loop now. 
+#endif break; } ptr += sizeof(*note) + amd::alignUp(note->n_namesz, sizeof(int)) + diff --git a/projects/clr/rocclr/runtime/device/devprogram.hpp b/projects/clr/rocclr/runtime/device/devprogram.hpp index e8d64d7cf3..39559c233b 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.hpp +++ b/projects/clr/rocclr/runtime/device/devprogram.hpp @@ -114,7 +114,12 @@ class Program : public amd::HeapObject { size_t globalVariableTotalSize_; amd::option::Options* programOptions_; + +#if defined(USE_COMGR_LIBRARY) + amd_comgr_metadata_node_t* metadata_; //!< COMgr metadata +#else CodeObjectMD* metadata_; //!< Runtime metadata +#endif public: //! Construct a section. @@ -196,7 +201,14 @@ class Program : public amd::HeapObject { //! Global variables are a part of the code segment bool hasGlobalStores() const { return hasGlobalStores_; } +#if defined(USE_COMGR_LIBRARY) + const amd_comgr_metadata_node_t* metadata() const { return metadata_; } +#else const CodeObjectMD* metadata() const { return metadata_; } +#endif + + //! Get the machine target for the program + const char* machineTarget() const { return machineTarget_; } protected: //! 
pre-compile setup diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index a679a9191e..c593b98c99 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -395,6 +395,53 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { aqlCreateHWInfo(symbol); +#if defined(USE_COMGR_LIBRARY) + const amd_comgr_metadata_node_t* programMD = prog().metadata(); + assert(programMD != nullptr); + + KernelMD kernelMD; + if (!GetAttrCodePropMetadata(*programMD, argsBufferSize(), &kernelMD)) { + return false; + } + + if (!kernelMD.mAttrs.mRuntimeHandle.empty()) { + hsa_agent_t agent; + agent.handle = 1; + amd::hsa::loader::Symbol* rth_symbol; + + // Get the runtime handle symbol GPU address + rth_symbol = prog_.GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()), + const_cast(&agent)); + uint64_t symbol_address; + rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); + + // Copy the kernel_object pointer to the runtime handle symbol GPU address + const Memory& codeSegGpu = prog_.codeSegGpu(); + uint64_t offset = symbol_address - codeSegGpu.vmAddress(); + uint64_t kernel_object = gpuAqlCode(); + VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); + + codeSegGpu.writeRawData(*gpu, offset, 8, &kernel_object, true); + } + + // Copy wavefront size + workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; + + workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize; + if (workGroupInfo_.size_ == 0) { + return false; + } + + // handle the printf metadata if any + std::vector printfStr; + if (!GetPrintfStr(*programMD, &printfStr)) { + return false; + } + + if (!printfStr.empty()) { + InitPrintf(printfStr); + } +#else const CodeObjectMD* programMD = prog().metadata(); assert(programMD != nullptr); @@ -471,6 +518,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { 
waveLimiter_.enable(); */ +#endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) return true; } diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index 04f2c18f02..514ec20913 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -32,6 +32,102 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle kernargSegmentAlignment_(kernargSegmentAlignment) {} #if defined(WITH_LIGHTNING_COMPILER)
+#if defined(USE_COMGR_LIBRARY)
+// COMGR variant: initializes the kernel from the program metadata. Fills in workGroupInfo_, copies
+// the kernel code handle into the RuntimeHandle variable when the metadata names one, and passes any
+// printf strings found in the metadata to InitPrintf(). Returns false when a metadata helper or an
+// HSA call fails, or when the metadata reports a maximum flat work-group size of zero.
+bool LightningKernel::init() {
+  const hsa_agent_t hsaDevice = program_->hsaDevice();
+
+  const amd_comgr_metadata_node_t* programMD =
+      static_cast<const LightningProgram*>(program_)->metadata();
+  assert(programMD != nullptr);
+
+  KernelMD kernelMD;
+  if (!GetAttrCodePropMetadata(*programMD, KernargSegmentByteSize(), &kernelMD)) {
+    return false;
+  }
+
+  // Set the workgroup information for the kernel
+  workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
+  assert(workGroupInfo_.availableLDSSize_ > 0);
+
+  // Get the available SGPRs and VGPRs
+  const std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget();
+  if (!SetAvailableSgprVgpr(targetIdent)) {
+    return false;
+  }
+
+  if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
+    hsa_executable_symbol_t kernelSymbol;
+    hsa_status_t hsaStatus;
+    uint32_t variable_size = 0;  // HSA reports the variable size as a uint32_t
+    uint64_t variable_address = 0;
+
+    // Only kernels that could be enqueued by another kernel have the RuntimeHandle metadata. The RuntimeHandle
+    // metadata is a string that represents a variable from which the library code can retrieve the kernel code
+    // object handle of such a kernel. The address of the variable and the kernel code object handle are known
+    // only after the hsa executable is loaded. The below code copies the kernel code object handle to the
+    // address of the variable.
+    hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(),
+                                                  kernelMD.mAttrs.mRuntimeHandle.c_str(),
+                                                  &hsaDevice, &kernelSymbol);
+    if (hsaStatus == HSA_STATUS_SUCCESS) {
+      hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
+                                                 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
+                                                 &variable_size);
+    }
+    if (hsaStatus == HSA_STATUS_SUCCESS) {
+      hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
+                                                 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+                                                 &variable_address);
+    }
+    if (hsaStatus == HSA_STATUS_SUCCESS) {
+      hsaStatus = hsa_memory_copy(reinterpret_cast<void*>(variable_address),
+                                  &kernelCodeHandle_, variable_size);
+    }
+
+    if (hsaStatus != HSA_STATUS_SUCCESS) {
+      return false;
+    }
+  }
+
+  // Query the wavefront size of the device; it feeds the workgroup information below
+  uint32_t wavefront_size = 0;
+  if (hsa_agent_get_info(hsaDevice, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
+      HSA_STATUS_SUCCESS) {
+    return false;
+  }
+  assert(wavefront_size > 0);
+
+  // Fill in the remaining workgroup information from the kernel and its code properties
+  workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_;
+  workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_;
+  workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_;
+  workGroupInfo_.preferredSizeMultiple_ = wavefront_size;
+  workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs;
+  workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs;
+  workGroupInfo_.usedStackSize_ = 0;
+  workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontSize_ = wavefront_size;
+  workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
+  if (workGroupInfo_.size_ == 0) {
+    return false;
+  }
+
+  // handle the printf metadata if any
+  std::vector<std::string> printfStr;
+  if (!GetPrintfStr(*programMD, &printfStr)) {
+    return false;
+  }
+
+  if (!printfStr.empty()) {
+    InitPrintf(printfStr);
+  }
+  return true;
+}
+#else
 static const KernelMD*
FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) { for (const KernelMD& kernelMD : programMD->mKernels) { if (kernelMD.mName == name) { @@ -154,6 +250,7 @@ bool LightningKernel::init() { return true; } +#endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) #if defined(WITH_COMPILER_LIB)