// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #pragma once #include "include/aclTypes.h" #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" #include "devwavelimiter.hpp" #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) namespace llvm { namespace AMDGPU { namespace HSAMD { namespace Kernel { struct Metadata; }}}} typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; //! Runtime handle structure for device enqueue struct RuntimeHandle { uint64_t kernel_handle; //!< Pointer to amd_kernel_code_s or kernel_descriptor_t uint32_t private_segment_size; //!< From PRIVATE_SEGMENT_FIXED_SIZE uint32_t group_segment_size; //!< From GROUP_SEGMENT_FIXED_SIZE }; #if defined(USE_COMGR_LIBRARY) #include "llvm/Support/AMDGPUMetadata.h" typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; using llvm::AMDGPU::HSAMD::AccessQualifier; using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; using llvm::AMDGPU::HSAMD::ValueKind; using llvm::AMDGPU::HSAMD::ValueType; enum class ArgField : uint8_t { Name = 0, TypeName = 1, Size = 2, Align = 3, ValueKind = 4, ValueType = 5, PointeeAlign = 6, AddrSpaceQual = 7, AccQual = 8, ActualAccQual = 9, IsConst = 10, IsRestrict = 11, IsVolatile = 12, IsPipe = 13 }; enum class AttrField : uint8_t { ReqdWorkGroupSize = 0, WorkGroupSizeHint = 1, VecTypeHint = 2, RuntimeHandle = 3 }; enum class CodePropField : uint8_t { KernargSegmentSize = 0, GroupSegmentFixedSize = 1, PrivateSegmentFixedSize = 2, KernargSegmentAlign = 3, WavefrontSize = 4, NumSGPRs = 5, NumVGPRs = 6, MaxFlatWorkGroupSize = 7, IsDynamicCallStack = 8, IsXNACKEnabled = 9, NumSpilledSGPRs = 10, NumSpilledVGPRs = 11 }; static const std::map ArgFieldMap = { {"Name", ArgField::Name}, {"TypeName", ArgField::TypeName}, {"Size", ArgField::Size}, {"Align", ArgField::Align}, {"ValueKind", ArgField::ValueKind}, {"ValueType", ArgField::ValueType}, {"PointeeAlign", ArgField::PointeeAlign}, {"AddrSpaceQual", ArgField::AddrSpaceQual}, {"AccQual", ArgField::AccQual}, {"ActualAccQual", ArgField::ActualAccQual}, {"IsConst", ArgField::IsConst}, {"IsRestrict", ArgField::IsRestrict}, {"IsVolatile", ArgField::IsVolatile}, {"IsPipe", ArgField::IsPipe} }; static const std::map ArgValueKind = { {"ByValue", ValueKind::ByValue}, {"GlobalBuffer", ValueKind::GlobalBuffer}, {"DynamicSharedPointer", ValueKind::DynamicSharedPointer}, {"Sampler", ValueKind::Sampler}, {"Image", ValueKind::Image}, {"Pipe", ValueKind::Pipe}, {"Queue", ValueKind::Queue}, {"HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX}, {"HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY}, {"HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ}, {"HiddenNone", ValueKind::HiddenNone}, {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, {"HiddenCompletionAction", ValueKind::HiddenCompletionAction} }; static const std::map ArgValueType = { {"Struct", ValueType::Struct}, {"I8", ValueType::I8}, {"U8", ValueType::U8}, {"I16", ValueType::I16}, {"U16", ValueType::U16}, {"F16", ValueType::F16}, {"I32", ValueType::I32}, {"U32", ValueType::U32}, {"F32", ValueType::F32}, {"I64", ValueType::I64}, {"U64", ValueType::U64}, {"F64", ValueType::F64} }; static const std::map ArgAccQual = { {"Default", AccessQualifier::Default}, {"ReadOnly", AccessQualifier::ReadOnly}, {"WriteOnly", AccessQualifier::WriteOnly}, {"ReadWrite", AccessQualifier::ReadWrite} }; static const std::map ArgAddrSpaceQual = { {"Private", AddressSpaceQualifier::Private}, {"Global", AddressSpaceQualifier::Global}, {"Constant", AddressSpaceQualifier::Constant}, {"Local", AddressSpaceQualifier::Local}, {"Generic", AddressSpaceQualifier::Generic}, {"Region", AddressSpaceQualifier::Region} }; static const std::map AttrFieldMap = { {"ReqdWorkGroupSize", AttrField::ReqdWorkGroupSize}, {"WorkGroupSizeHint", AttrField::WorkGroupSizeHint}, {"VecTypeHint", AttrField::VecTypeHint}, {"RuntimeHandle", AttrField::RuntimeHandle} }; static const std::map CodePropFieldMap = { {"KernargSegmentSize", CodePropField::KernargSegmentSize}, {"GroupSegmentFixedSize", CodePropField::GroupSegmentFixedSize}, {"PrivateSegmentFixedSize", CodePropField::PrivateSegmentFixedSize}, {"KernargSegmentAlign", CodePropField::KernargSegmentAlign}, {"WavefrontSize", CodePropField::WavefrontSize}, {"NumSGPRs", CodePropField::NumSGPRs}, {"NumVGPRs", CodePropField::NumVGPRs}, {"MaxFlatWorkGroupSize", CodePropField::MaxFlatWorkGroupSize}, {"IsDynamicCallStack", CodePropField::IsDynamicCallStack}, {"IsXNACKEnabled", CodePropField::IsXNACKEnabled}, {"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs}, {"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs} }; #endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) namespace amd { namespace hsa { namespace loader { class Symbol; } // loader namespace code { namespace Kernel { class Metadata; } // Kernel } // code } // hsa } // amd namespace amd { class Device; class KernelSignature; class NDRange; struct KernelParameterDescriptor { enum { Value = 0, HiddenNone = 1, HiddenGlobalOffsetX = 2, HiddenGlobalOffsetY = 3, HiddenGlobalOffsetZ = 4, HiddenPrintfBuffer = 5, HiddenDefaultQueue = 6, HiddenCompletionAction = 7, MemoryObject = 8, ReferenceObject = 9, ValueObject = 10, ImageObject = 11, SamplerObject = 12, QueueObject = 13 }; clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack size_t size_; //!< Its size in bytes union InfoData { struct { uint32_t oclObject_ : 4; //!< OCL object type uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA uint32_t defined_ : 1; //!< The argument was defined by the app uint32_t reserved_ : 1; //!< reserved uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment }; uint32_t allValues_; InfoData() : allValues_(0) {} } info_; cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier std::string name_; //!< The parameter's name in the source std::string typeName_; //!< Argument's type name }; } namespace device { //! Printf info structure struct PrintfInfo { std::string fmtString_; //!< formated string for printf std::vector arguments_; //!< passed arguments to the printf() call }; //! \class DeviceKernel, which will contain the common fields for any device class Kernel : public amd::HeapObject { public: typedef std::vector parameters_t; //! \struct The device kernel workgroup info structure struct WorkGroupInfo : public amd::EmbeddedObject { size_t size_; //!< kernel workgroup size size_t compileSize_[3]; //!< kernel compiled workgroup size cl_ulong localMemSize_; //!< amount of used local memory size_t preferredSizeMultiple_; //!< preferred multiple for launch cl_ulong privateMemSize_; //!< amount of used private memory size_t scratchRegs_; //!< amount of used scratch registers size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD size_t wavefrontSize_; //!< number of threads per wavefront size_t availableGPRs_; //!< GPRs available to the program size_t usedGPRs_; //!< GPRs used by the program size_t availableSGPRs_; //!< SGPRs available to the program size_t usedSGPRs_; //!< SGPRs used by the program size_t availableVGPRs_; //!< VGPRs available to the program size_t usedVGPRs_; //!< VGPRs used by the program size_t availableLDSSize_; //!< available LDS size size_t usedLDSSize_; //!< used LDS size size_t availableStackSize_; //!< available stack size size_t usedStackSize_; //!< used stack size size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint std::string compileVecTypeHint_; //!< kernel compiled vector type hint bool uniformWorkGroupSize_; //!< uniform work group size option size_t wavesPerSimdHint_; //!< waves per simd hit }; //! Default constructor Kernel(const amd::Device& dev, const std::string& name); //! Default destructor virtual ~Kernel(); //! Returns the kernel info structure const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } //! Returns the kernel signature const amd::KernelSignature& signature() const { return *signature_; } //! Returns the kernel name const std::string& name() const { return name_; } //! Initializes the kernel parameters for the abstraction layer bool createSignature( const parameters_t& params, uint32_t numParameters, uint32_t version); void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; } bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; } void setReqdWorkGroupSize(size_t x, size_t y, size_t z) { workGroupInfo_.compileSize_[0] = x; workGroupInfo_.compileSize_[1] = y; workGroupInfo_.compileSize_[2] = z; } size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } void setWorkGroupSizeHint(size_t x, size_t y, size_t z) { workGroupInfo_.compileSizeHint_[0] = x; workGroupInfo_.compileSizeHint_[1] = y; workGroupInfo_.compileSizeHint_[2] = z; } size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } //! Get profiling callback object amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { return waveLimiter_.getProfilingCallback(vdev); }; //! Get waves per shader array to be used for kernel execution. uint getWavesPerSH(const device::VirtualDevice* vdev) const { return waveLimiter_.getWavesPerSH(vdev); }; //! Returns GPU device object, associated with this kernel const amd::Device& dev() const { return dev_; } void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; } //! Return the build log const std::string& buildLog() const { return buildLog_; } static std::string openclMangledName(const std::string& name); const std::unordered_map& patch() const { return patchReferences_; } //! Returns TRUE if kernel uses dynamic parallelism bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } //! set dynamic parallelism flag void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; } //! Returns TRUE if kernel is internal kernel bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } //! set internal kernel flag void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } //! Return TRUE if kernel uses images bool imageEnable() const { return (flags_.imageEna_) ? true : false; } //! Return TRUE if kernel wirtes images bool imageWrite() const { return (flags_.imageWriteEna_) ? true : false; } //! Returns TRUE if it's a HSA kernel bool hsa() const { return (flags_.hsa_) ? true : false; } //! Return printf info array const std::vector& printfInfo() const { return printf_; } //! Finds local workgroup size void FindLocalWorkSize( size_t workDim, //!< Work dimension const amd::NDRange& gblWorkSize, //!< Global work size amd::NDRange& lclWorkSize //!< Calculated local work size ) const; protected: //! Initializes the abstraction layer kernel parameters #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize); //! Get ther kernel metadata bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD, const std::string& name, amd_comgr_metadata_node_t* kernelNode); //! Retrieve kernel attribute and code properties metadata bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode, const uint32_t kernargSegmentByteSize, KernelMD* kernelMD); //! Retrieve the available SGPRs and VGPRs bool SetAvailableSgprVgpr(const std::string& targetIdent); //! Retrieve the printf string metadata bool GetPrintfStr(const amd_comgr_metadata_node_t programMD, std::vector* printfStr); #else void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize); #endif //! Initializes HSAIL Printf metadata and info for LC void InitPrintf(const std::vector& printfInfoStrings); #endif #if defined(WITH_COMPILER_LIB) void InitParameters( const aclArgData* aclArg, //!< List of ACL arguments uint32_t argBufferSize ); //! Initializes HSAIL Printf metadata and info void InitPrintf(const aclPrintfFmt* aclPrintf); #endif const amd::Device& dev_; //!< GPU device object std::string name_; //!< kernel name WorkGroupInfo workGroupInfo_; //!< device kernel info structure amd::KernelSignature* signature_; //!< kernel signature std::string buildLog_; //!< build log std::vector printf_; //!< Format strings for GPU printf support WaveLimiterManager waveLimiter_; //!< adaptively control number of waves union Flags { struct { uint imageEna_ : 1; //!< Kernel uses images uint imageWriteEna_ : 1; //!< Kernel uses image writes uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled uint internalKernel_ : 1; //!< True: internal kernel uint hsa_ : 1; //!< HSA kernel }; uint value_; Flags() : value_(0) {} } flags_; private: //! Disable default copy constructor Kernel(const Kernel&); //! Disable operator= Kernel& operator=(const Kernel&); std::unordered_map patchReferences_; //!< Patch table for references }; #if defined(USE_COMGR_LIBRARY) static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, std::string* str) { size_t size = 0; amd_comgr_status_t status = amd::Comgr::get_metadata_string(meta, &size, NULL); if (status == AMD_COMGR_STATUS_SUCCESS) { str->resize(size-1); // minus one to discount the null character status = amd::Comgr::get_metadata_string(meta, &size, &((*str)[0])); } return status; } static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key, const amd_comgr_metadata_node_t value, void *data) { amd_comgr_status_t status; amd_comgr_metadata_kind_t kind; std::string buf; // get the key of the argument field size_t size = 0; status = amd::Comgr::get_metadata_kind(key, &kind); if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { status = getMetaBuf(key, &buf); } if (status != AMD_COMGR_STATUS_SUCCESS) { return AMD_COMGR_STATUS_ERROR; } auto itArgField = ArgFieldMap.find(buf); if (itArgField == ArgFieldMap.end()) { return AMD_COMGR_STATUS_ERROR; } // get the value of the argument field status = getMetaBuf(value, &buf); KernelArgMD* lcArg = static_cast(data); switch (itArgField->second) { case ArgField::Name: lcArg->mName = buf; break; case ArgField::TypeName: lcArg->mTypeName = buf; break; case ArgField::Size: lcArg->mSize = atoi(buf.c_str()); break; case ArgField::Align: lcArg->mAlign = atoi(buf.c_str()); break; case ArgField::ValueKind: { auto itValueKind = ArgValueKind.find(buf); if (itValueKind == ArgValueKind.end()) { return AMD_COMGR_STATUS_ERROR; } lcArg->mValueKind = itValueKind->second; } break; case ArgField::ValueType: { auto itValueType = ArgValueType.find(buf); if (itValueType == ArgValueType.end()) { return AMD_COMGR_STATUS_ERROR; } lcArg->mValueType = itValueType->second; } break; case ArgField::PointeeAlign: lcArg->mPointeeAlign = atoi(buf.c_str()); break; case ArgField::AddrSpaceQual: { auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf); if (itAddrSpaceQual == ArgAddrSpaceQual.end()) { return AMD_COMGR_STATUS_ERROR; } lcArg->mAddrSpaceQual = itAddrSpaceQual->second; } break; case ArgField::AccQual: { auto itAccQual = ArgAccQual.find(buf); if (itAccQual == ArgAccQual.end()) { return AMD_COMGR_STATUS_ERROR; } lcArg->mAccQual = itAccQual->second; } break; case ArgField::ActualAccQual: { auto itAccQual = ArgAccQual.find(buf); if (itAccQual == ArgAccQual.end()) { return AMD_COMGR_STATUS_ERROR; } lcArg->mActualAccQual = itAccQual->second; } break; case ArgField::IsConst: lcArg->mIsConst = (buf.compare("true") == 0); break; case ArgField::IsRestrict: lcArg->mIsRestrict = (buf.compare("true") == 0); break; case ArgField::IsVolatile: lcArg->mIsVolatile = (buf.compare("true") == 0); break; case ArgField::IsPipe: lcArg->mIsPipe = (buf.compare("true") == 0); break; default: return AMD_COMGR_STATUS_ERROR; } return AMD_COMGR_STATUS_SUCCESS; } static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key, const amd_comgr_metadata_node_t value, void *data) { amd_comgr_status_t status; amd_comgr_metadata_kind_t kind; size_t size = 0; std::string buf; // get the key of the argument field status = amd::Comgr::get_metadata_kind(key, &kind); if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { status = getMetaBuf(key, &buf); } if (status != AMD_COMGR_STATUS_SUCCESS) { return AMD_COMGR_STATUS_ERROR; } auto itAttrField = AttrFieldMap.find(buf); if (itAttrField == AttrFieldMap.end()) { return AMD_COMGR_STATUS_ERROR; } KernelMD* kernelMD = static_cast(data); switch (itAttrField->second) { case AttrField::ReqdWorkGroupSize: { status = amd::Comgr::get_metadata_list_size(value, &size); if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) { for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { amd_comgr_metadata_node_t workgroupSize; status = amd::Comgr::index_list_metadata(value, i, &workgroupSize); if (status == AMD_COMGR_STATUS_SUCCESS && getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) { kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str())); } amd::Comgr::destroy_metadata(workgroupSize); } } } break; case AttrField::WorkGroupSizeHint: { status = amd::Comgr::get_metadata_list_size(value, &size); if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) { for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) { amd_comgr_metadata_node_t workgroupSizeHint; status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint); if (status == AMD_COMGR_STATUS_SUCCESS && getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) { kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str())); } amd::Comgr::destroy_metadata(workgroupSizeHint); } } } break; case AttrField::VecTypeHint: { if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { kernelMD->mAttrs.mVecTypeHint = buf; } } break; case AttrField::RuntimeHandle: { if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) { kernelMD->mAttrs.mRuntimeHandle = buf; } } break; default: return AMD_COMGR_STATUS_ERROR; } return status; } static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key, const amd_comgr_metadata_node_t value, void *data) { amd_comgr_status_t status; amd_comgr_metadata_kind_t kind; std::string buf; // get the key of the argument field status = amd::Comgr::get_metadata_kind(key, &kind); if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) { status = getMetaBuf(key, &buf); } if (status != AMD_COMGR_STATUS_SUCCESS) { return AMD_COMGR_STATUS_ERROR; } auto itCodePropField = CodePropFieldMap.find(buf); if (itCodePropField == CodePropFieldMap.end()) { return AMD_COMGR_STATUS_ERROR; } // get the value of the argument field if (status == AMD_COMGR_STATUS_SUCCESS) { status = getMetaBuf(value, &buf); } KernelMD* kernelMD = static_cast(data); switch (itCodePropField->second) { case CodePropField::KernargSegmentSize: kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); break; case CodePropField::GroupSegmentFixedSize: kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str()); break; case CodePropField::PrivateSegmentFixedSize: kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str()); break; case CodePropField::KernargSegmentAlign: kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str()); break; case CodePropField::WavefrontSize: kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str()); break; case CodePropField::NumSGPRs: kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str()); break; case CodePropField::NumVGPRs: kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str()); break; case CodePropField::MaxFlatWorkGroupSize: kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str()); break; case CodePropField::IsDynamicCallStack: kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0); break; case CodePropField::IsXNACKEnabled: kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0); break; case CodePropField::NumSpilledSGPRs: kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str()); break; case CodePropField::NumSpilledVGPRs: kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str()); break; default: return AMD_COMGR_STATUS_ERROR; } return AMD_COMGR_STATUS_SUCCESS; } #endif } // namespace device