// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #pragma once #include "include/aclTypes.h" #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" #include "devwavelimiter.hpp" #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) namespace llvm { namespace AMDGPU { namespace HSAMD { namespace Kernel { struct Metadata; }}}} typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; //! Runtime handle structure for device enqueue struct RuntimeHandle { uint64_t kernel_handle; //!< Pointer to amd_kernel_code_s or kernel_descriptor_t uint32_t private_segment_size; //!< From PRIVATE_SEGMENT_FIXED_SIZE uint32_t group_segment_size; //!< From GROUP_SEGMENT_FIXED_SIZE }; #if defined(USE_COMGR_LIBRARY) #include "llvm/Support/AMDGPUMetadata.h" typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; using llvm::AMDGPU::HSAMD::AccessQualifier; using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; using llvm::AMDGPU::HSAMD::ValueKind; using llvm::AMDGPU::HSAMD::ValueType; // for Code Object V3 enum class ArgField : uint8_t { Name = 0, TypeName = 1, Size = 2, Align = 3, ValueKind = 4, ValueType = 5, PointeeAlign = 6, AddrSpaceQual = 7, AccQual = 8, ActualAccQual = 9, IsConst = 10, IsRestrict = 11, IsVolatile = 12, IsPipe = 13, Offset = 14 }; enum class AttrField : uint8_t { ReqdWorkGroupSize = 0, WorkGroupSizeHint = 1, VecTypeHint = 2, RuntimeHandle = 3 }; enum class CodePropField : uint8_t { KernargSegmentSize = 0, GroupSegmentFixedSize = 1, PrivateSegmentFixedSize = 2, KernargSegmentAlign = 3, WavefrontSize = 4, NumSGPRs = 5, NumVGPRs = 6, MaxFlatWorkGroupSize = 7, IsDynamicCallStack = 8, IsXNACKEnabled = 9, NumSpilledSGPRs = 10, NumSpilledVGPRs = 11 }; static const std::map ArgFieldMap = { {"Name", ArgField::Name}, {"TypeName", ArgField::TypeName}, {"Size", ArgField::Size}, {"Align", ArgField::Align}, {"ValueKind", ArgField::ValueKind}, {"ValueType", ArgField::ValueType}, {"PointeeAlign", ArgField::PointeeAlign}, {"AddrSpaceQual", ArgField::AddrSpaceQual}, {"AccQual", ArgField::AccQual}, {"ActualAccQual", ArgField::ActualAccQual}, {"IsConst", ArgField::IsConst}, {"IsRestrict", ArgField::IsRestrict}, {"IsVolatile", ArgField::IsVolatile}, {"IsPipe", ArgField::IsPipe} }; static const std::map ArgValueKind = { {"ByValue", ValueKind::ByValue}, {"GlobalBuffer", ValueKind::GlobalBuffer}, {"DynamicSharedPointer", ValueKind::DynamicSharedPointer}, {"Sampler", ValueKind::Sampler}, {"Image", ValueKind::Image}, {"Pipe", ValueKind::Pipe}, {"Queue", ValueKind::Queue}, {"HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX}, {"HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY}, {"HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ}, {"HiddenNone", ValueKind::HiddenNone}, {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, {"HiddenCompletionAction", ValueKind::HiddenCompletionAction} }; static const std::map ArgValueType = { {"Struct", ValueType::Struct}, {"I8", ValueType::I8}, {"U8", ValueType::U8}, {"I16", ValueType::I16}, {"U16", ValueType::U16}, {"F16", ValueType::F16}, {"I32", ValueType::I32}, {"U32", ValueType::U32}, {"F32", ValueType::F32}, {"I64", ValueType::I64}, {"U64", ValueType::U64}, {"F64", ValueType::F64} }; static const std::map ArgAccQual = { {"Default", AccessQualifier::Default}, {"ReadOnly", AccessQualifier::ReadOnly}, {"WriteOnly", AccessQualifier::WriteOnly}, {"ReadWrite", AccessQualifier::ReadWrite} }; static const std::map ArgAddrSpaceQual = { {"Private", AddressSpaceQualifier::Private}, {"Global", AddressSpaceQualifier::Global}, {"Constant", AddressSpaceQualifier::Constant}, {"Local", AddressSpaceQualifier::Local}, {"Generic", AddressSpaceQualifier::Generic}, {"Region", AddressSpaceQualifier::Region} }; static const std::map AttrFieldMap = { {"ReqdWorkGroupSize", AttrField::ReqdWorkGroupSize}, {"WorkGroupSizeHint", AttrField::WorkGroupSizeHint}, {"VecTypeHint", AttrField::VecTypeHint}, {"RuntimeHandle", AttrField::RuntimeHandle} }; static const std::map CodePropFieldMap = { {"KernargSegmentSize", CodePropField::KernargSegmentSize}, {"GroupSegmentFixedSize", CodePropField::GroupSegmentFixedSize}, {"PrivateSegmentFixedSize", CodePropField::PrivateSegmentFixedSize}, {"KernargSegmentAlign", CodePropField::KernargSegmentAlign}, {"WavefrontSize", CodePropField::WavefrontSize}, {"NumSGPRs", CodePropField::NumSGPRs}, {"NumVGPRs", CodePropField::NumVGPRs}, {"MaxFlatWorkGroupSize", CodePropField::MaxFlatWorkGroupSize}, {"IsDynamicCallStack", CodePropField::IsDynamicCallStack}, {"IsXNACKEnabled", CodePropField::IsXNACKEnabled}, {"NumSpilledSGPRs", CodePropField::NumSpilledSGPRs}, {"NumSpilledVGPRs", CodePropField::NumSpilledVGPRs} }; // for Code Object V3 enum class KernelField : uint8_t { SymbolName = 0, ReqdWorkGroupSize = 1, WorkGroupSizeHint = 2, VecTypeHint = 3, DeviceEnqueueSymbol = 4, KernargSegmentSize = 5, GroupSegmentFixedSize = 6, PrivateSegmentFixedSize = 7, KernargSegmentAlign = 8, WavefrontSize = 9, NumSGPRs = 10, NumVGPRs = 11, MaxFlatWorkGroupSize = 12, NumSpilledSGPRs = 13, NumSpilledVGPRs = 14 }; static const std::map ArgFieldMapV3 = { {".name", ArgField::Name}, {".type_name", ArgField::TypeName}, {".size", ArgField::Size}, {".offset", ArgField::Offset}, {".value_kind", ArgField::ValueKind}, {".value_type", ArgField::ValueType}, {".pointee_align", ArgField::PointeeAlign}, {".address_space", ArgField::AddrSpaceQual}, {".access", ArgField::AccQual}, {".actual_access", ArgField::ActualAccQual}, {".is_const", ArgField::IsConst}, {".is_restrict", ArgField::IsRestrict}, {".is_volatile", ArgField::IsVolatile}, {".is_pipe", ArgField::IsPipe} }; static const std::map ArgValueKindV3 = { {"by_value", ValueKind::ByValue}, {"global_buffer", ValueKind::GlobalBuffer}, {"dynamic_shared_pointer", ValueKind::DynamicSharedPointer}, {"sampler", ValueKind::Sampler}, {"image", ValueKind::Image}, {"pipe", ValueKind::Pipe}, {"queue", ValueKind::Queue}, {"hidden_global_offset_x", ValueKind::HiddenGlobalOffsetX}, {"hidden_global_offset_y", ValueKind::HiddenGlobalOffsetY}, {"hidden_global_offset_z", ValueKind::HiddenGlobalOffsetZ}, {"hidden_none", ValueKind::HiddenNone}, {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer}, {"hidden_default_queue", ValueKind::HiddenDefaultQueue}, {"hidden_completion_action", ValueKind::HiddenCompletionAction} }; static const std::map ArgValueTypeV3 = { {"struct", ValueType::Struct}, {"i8", ValueType::I8}, {"u8", ValueType::U8}, {"i16", ValueType::I16}, {"u16", ValueType::U16}, {"f16", ValueType::F16}, {"i32", ValueType::I32}, {"u32", ValueType::U32}, {"f32", ValueType::F32}, {"i64", ValueType::I64}, {"u64", ValueType::U64}, {"f64", ValueType::F64} }; static const std::map ArgAccQualV3 = { {"default", AccessQualifier::Default}, {"read_only", AccessQualifier::ReadOnly}, {"write_only", AccessQualifier::WriteOnly}, {"read_write", AccessQualifier::ReadWrite} }; static const std::map ArgAddrSpaceQualV3 = { {"private", AddressSpaceQualifier::Private}, {"global", AddressSpaceQualifier::Global}, {"constant", AddressSpaceQualifier::Constant}, {"local", AddressSpaceQualifier::Local}, {"generic", AddressSpaceQualifier::Generic}, {"region", AddressSpaceQualifier::Region} }; static const std::map KernelFieldMapV3 = { {".symbol", KernelField::SymbolName}, {".reqd_workgroup_size", KernelField::ReqdWorkGroupSize}, {".workgorup_size_hint", KernelField::WorkGroupSizeHint}, {".vec_type_hint", KernelField::VecTypeHint}, {".device_enqueue_symbol", KernelField::DeviceEnqueueSymbol}, {".kernarg_segment_size", KernelField::KernargSegmentSize}, {".group_segment_fixed_size", KernelField::GroupSegmentFixedSize}, {".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize}, {".kernarg_segment_align", KernelField::KernargSegmentAlign}, {".wavefront_size", KernelField::WavefrontSize}, {".sgpr_count", KernelField::NumSGPRs}, {".vgpr_count", KernelField::NumVGPRs}, {".max_flat_workgroup_size", KernelField::MaxFlatWorkGroupSize}, {".sgpr_spill_count", KernelField::NumSpilledSGPRs}, {".vgpr_spill_count", KernelField::NumSpilledVGPRs} }; #endif // defined(USE_COMGR_LIBRARY) #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) namespace amd { namespace hsa { namespace loader { class Symbol; } // loader namespace code { namespace Kernel { class Metadata; } // Kernel } // code } // hsa } // amd namespace amd { class Device; class KernelSignature; class NDRange; struct KernelParameterDescriptor { enum { Value = 0, HiddenNone = 1, HiddenGlobalOffsetX = 2, HiddenGlobalOffsetY = 3, HiddenGlobalOffsetZ = 4, HiddenPrintfBuffer = 5, HiddenDefaultQueue = 6, HiddenCompletionAction = 7, MemoryObject = 8, ReferenceObject = 9, ValueObject = 10, ImageObject = 11, SamplerObject = 12, QueueObject = 13 }; clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack size_t size_; //!< Its size in bytes union InfoData { struct { uint32_t oclObject_ : 4; //!< OCL object type uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA uint32_t defined_ : 1; //!< The argument was defined by the app uint32_t reserved_ : 1; //!< reserved uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment }; uint32_t allValues_; InfoData() : allValues_(0) {} } info_; cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier std::string name_; //!< The parameter's name in the source std::string typeName_; //!< Argument's type name }; } namespace device { class Program; //! Printf info structure struct PrintfInfo { std::string fmtString_; //!< formated string for printf std::vector arguments_; //!< passed arguments to the printf() call }; //! \class DeviceKernel, which will contain the common fields for any device class Kernel : public amd::HeapObject { public: typedef std::vector parameters_t; //! \struct The device kernel workgroup info structure struct WorkGroupInfo : public amd::EmbeddedObject { size_t size_; //!< kernel workgroup size size_t compileSize_[3]; //!< kernel compiled workgroup size cl_ulong localMemSize_; //!< amount of used local memory size_t preferredSizeMultiple_; //!< preferred multiple for launch cl_ulong privateMemSize_; //!< amount of used private memory size_t scratchRegs_; //!< amount of used scratch registers size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD size_t wavefrontSize_; //!< number of threads per wavefront size_t availableGPRs_; //!< GPRs available to the program size_t usedGPRs_; //!< GPRs used by the program size_t availableSGPRs_; //!< SGPRs available to the program size_t usedSGPRs_; //!< SGPRs used by the program size_t availableVGPRs_; //!< VGPRs available to the program size_t usedVGPRs_; //!< VGPRs used by the program size_t availableLDSSize_; //!< available LDS size size_t usedLDSSize_; //!< used LDS size size_t availableStackSize_; //!< available stack size size_t usedStackSize_; //!< used stack size size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint std::string compileVecTypeHint_; //!< kernel compiled vector type hint bool uniformWorkGroupSize_; //!< uniform work group size option size_t wavesPerSimdHint_; //!< waves per simd hit }; //! Default constructor Kernel(const amd::Device& dev, const std::string& name, const Program& prog); //! Default destructor virtual ~Kernel(); //! Returns the kernel info structure const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } //! Returns the kernel signature const amd::KernelSignature& signature() const { return *signature_; } //! Returns the kernel name const std::string& name() const { return name_; } //! Initializes the kernel parameters for the abstraction layer bool createSignature( const parameters_t& params, uint32_t numParameters, uint32_t version); void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; } bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; } void setReqdWorkGroupSize(size_t x, size_t y, size_t z) { workGroupInfo_.compileSize_[0] = x; workGroupInfo_.compileSize_[1] = y; workGroupInfo_.compileSize_[2] = z; } size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } void setWorkGroupSizeHint(size_t x, size_t y, size_t z) { workGroupInfo_.compileSizeHint_[0] = x; workGroupInfo_.compileSizeHint_[1] = y; workGroupInfo_.compileSizeHint_[2] = z; } size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } //! Get profiling callback object amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { return waveLimiter_.getProfilingCallback(vdev); }; //! Get waves per shader array to be used for kernel execution. uint getWavesPerSH(const device::VirtualDevice* vdev) const { return waveLimiter_.getWavesPerSH(vdev); }; //! Returns GPU device object, associated with this kernel const amd::Device& dev() const { return dev_; } void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; } //! Return the build log const std::string& buildLog() const { return buildLog_; } static std::string openclMangledName(const std::string& name); const std::unordered_map& patch() const { return patchReferences_; } //! Returns TRUE if kernel uses dynamic parallelism bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } //! set dynamic parallelism flag void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; } //! Returns TRUE if kernel is internal kernel bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } //! set internal kernel flag void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } //! Return TRUE if kernel uses images bool imageEnable() const { return (flags_.imageEna_) ? true : false; } //! Return TRUE if kernel wirtes images bool imageWrite() const { return (flags_.imageWriteEna_) ? true : false; } //! Returns TRUE if it's a HSA kernel bool hsa() const { return (flags_.hsa_) ? true : false; } //! Return printf info array const std::vector& printfInfo() const { return printf_; } //! Finds local workgroup size void FindLocalWorkSize( size_t workDim, //!< Work dimension const amd::NDRange& gblWorkSize, //!< Global work size amd::NDRange& lclWorkSize //!< Calculated local work size ) const; protected: //! Initializes the abstraction layer kernel parameters #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) void InitParameters(const amd_comgr_metadata_node_t kernelMD); //! Get ther kernel metadata bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD, const std::string& name, amd_comgr_metadata_node_t* kernelNode); //! Retrieve kernel attribute and code properties metadata bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode, KernelMD* kernelMD); //! Retrieve the available SGPRs and VGPRs bool SetAvailableSgprVgpr(const std::string& targetIdent); //! Retrieve the printf string metadata bool GetPrintfStr(const amd_comgr_metadata_node_t programMD, std::vector* printfStr); //! Returns the kernel symbol name const std::string& symbolName() const { return symbolName_; } //! Returns the kernel code object version const uint32_t codeObjectVer() const { return prog().codeObjectVer(); } #else void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize); #endif //! Initializes HSAIL Printf metadata and info for LC void InitPrintf(const std::vector& printfInfoStrings); #endif #if defined(WITH_COMPILER_LIB) void InitParameters( const aclArgData* aclArg, //!< List of ACL arguments uint32_t argBufferSize ); //! Initializes HSAIL Printf metadata and info void InitPrintf(const aclPrintfFmt* aclPrintf); #endif //! Returns program associated with this kernel const Program& prog() const { return prog_; } const amd::Device& dev_; //!< GPU device object std::string name_; //!< kernel name const Program& prog_; //!< Reference to the parent program std::string symbolName_; //!< kernel symbol name WorkGroupInfo workGroupInfo_; //!< device kernel info structure amd::KernelSignature* signature_; //!< kernel signature std::string buildLog_; //!< build log std::vector printf_; //!< Format strings for GPU printf support WaveLimiterManager waveLimiter_; //!< adaptively control number of waves union Flags { struct { uint imageEna_ : 1; //!< Kernel uses images uint imageWriteEna_ : 1; //!< Kernel uses image writes uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled uint internalKernel_ : 1; //!< True: internal kernel uint hsa_ : 1; //!< HSA kernel }; uint value_; Flags() : value_(0) {} } flags_; private: //! Disable default copy constructor Kernel(const Kernel&); //! Disable operator= Kernel& operator=(const Kernel&); std::unordered_map patchReferences_; //!< Patch table for references }; #if defined(USE_COMGR_LIBRARY) static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta, std::string* str) { size_t size = 0; amd_comgr_status_t status = amd::Comgr::get_metadata_string(meta, &size, NULL); if (status == AMD_COMGR_STATUS_SUCCESS) { str->resize(size-1); // minus one to discount the null character status = amd::Comgr::get_metadata_string(meta, &size, &((*str)[0])); } return status; } #endif // defined(USE_COMGR_LIBRARY) } // namespace device