diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp index a40cd8c94d..acb52b0142 100644 --- a/projects/clr/rocclr/runtime/device/device.cpp +++ b/projects/clr/rocclr/runtime/device/device.cpp @@ -585,86 +585,6 @@ Settings::Settings() { //!< concurrent Virtual GPUs for default } -bool Kernel::createSignature( - const parameters_t& params, uint32_t numParameters, - uint32_t version) { - std::stringstream attribs; - if (workGroupInfo_.compileSize_[0] != 0) { - attribs << "reqd_work_group_size("; - for (size_t i = 0; i < 3; ++i) { - if (i != 0) { - attribs << ","; - } - - attribs << workGroupInfo_.compileSize_[i]; - } - attribs << ")"; - } - if (workGroupInfo_.compileSizeHint_[0] != 0) { - attribs << " work_group_size_hint("; - for (size_t i = 0; i < 3; ++i) { - if (i != 0) { - attribs << ","; - } - - attribs << workGroupInfo_.compileSizeHint_[i]; - } - attribs << ")"; - } - - if (!workGroupInfo_.compileVecTypeHint_.empty()) { - attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")"; - } - - // Destroy old signature if it was allocated before - // (offline devices path) - delete signature_; - signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version); - if (NULL != signature_) { - return true; - } - return false; -} - -Kernel::~Kernel() { delete signature_; } - -std::string Kernel::openclMangledName(const std::string& name) { - const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); - assert(bifSym && "symbol not found"); - return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; -} - -void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, - const amd::Coord3D region, uint mapFlags, bool entire, - amd::Image* baseMip) { - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - WriteMapInfo info = {}; - WriteMapInfo* pInfo = &info; - auto it = writeMapInfo_.find(mapAddress); - if (it != writeMapInfo_.end()) { - LogWarning("Double map of the same or overlapped region!"); - pInfo = &it->second; - } - - if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { - pInfo->origin_ = origin; - pInfo->region_ = region; - pInfo->entire_ = entire; - pInfo->unmapWrite_ = true; - } - if (mapFlags & CL_MAP_READ) { - pInfo->unmapRead_ = true; - } - pInfo->baseMip_ = baseMip; - - // Insert into the map if it's the first region - if (++pInfo->count_ == 1) { - writeMapInfo_.insert({mapAddress, info}); - } -} - Program::Program(amd::Device& device) : device_(device), type_(TYPE_NONE), diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index 067b9ee9b7..5a30609600 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -15,6 +15,7 @@ #include "amdocl/cl_kernel.h" #include "elf/elf.hpp" #include "appprofile.hpp" +#include "devkernel.hpp" #if defined(WITH_LIGHTNING_COMPILER) #include "caching/cache.hpp" @@ -54,7 +55,6 @@ class PerfCounterCommand; class ReleaseObjectCommand; class StallQueueCommand; class Marker; -class KernelSignature; class ThreadTraceCommand; class ThreadTraceMemObjectsCommand; class SignalCommand; @@ -74,9 +74,6 @@ namespace option { class Options; } // option -struct ProfilingCallback : public amd::HeapObject { - virtual void callback(ulong duration, uint32_t waves) = 0; -}; } enum OclExtensions { @@ -176,6 +173,7 @@ static constexpr int AmdVendor = 0x1002; namespace device { class ClBinary; class BlitManager; +class Kernel; //! Physical device properties. struct Info : public amd::EmbeddedObject { @@ -776,143 +774,6 @@ class Sampler : public amd::HeapObject { Sampler(const Sampler&); }; -//! \class DeviceKernel, which will contain the common fields for any device -class Kernel : public amd::HeapObject { - public: - typedef std::vector parameters_t; - - //! \struct The device kernel workgroup info structure - struct WorkGroupInfo : public amd::EmbeddedObject { - size_t size_; //!< kernel workgroup size - size_t compileSize_[3]; //!< kernel compiled workgroup size - cl_ulong localMemSize_; //!< amount of used local memory - size_t preferredSizeMultiple_; //!< preferred multiple for launch - cl_ulong privateMemSize_; //!< amount of used private memory - size_t scratchRegs_; //!< amount of used scratch registers - size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD - size_t wavefrontSize_; //!< number of threads per wavefront - size_t availableGPRs_; //!< GPRs available to the program - size_t usedGPRs_; //!< GPRs used by the program - size_t availableSGPRs_; //!< SGPRs available to the program - size_t usedSGPRs_; //!< SGPRs used by the program - size_t availableVGPRs_; //!< VGPRs available to the program - size_t usedVGPRs_; //!< VGPRs used by the program - size_t availableLDSSize_; //!< available LDS size - size_t usedLDSSize_; //!< used LDS size - size_t availableStackSize_; //!< available stack size - size_t usedStackSize_; //!< used stack size - size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint - std::string compileVecTypeHint_; //!< kernel compiled vector type hint - bool uniformWorkGroupSize_; //!< uniform work group size option - size_t wavesPerSimdHint_; //!< waves per simd hit - }; - - //! Default constructor - Kernel(const std::string& name) : name_(name), signature_(NULL), hsa_(false) { - // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); - // Due to std::string not being able to be memset to 0 - workGroupInfo_.size_ = 0; - workGroupInfo_.compileSize_[0] = 0; - workGroupInfo_.compileSize_[1] = 0; - workGroupInfo_.compileSize_[2] = 0; - workGroupInfo_.localMemSize_ = 0; - workGroupInfo_.preferredSizeMultiple_ = 0; - workGroupInfo_.privateMemSize_ = 0; - workGroupInfo_.scratchRegs_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = 0; - workGroupInfo_.wavefrontSize_ = 0; - workGroupInfo_.availableGPRs_ = 0; - workGroupInfo_.usedGPRs_ = 0; - workGroupInfo_.availableSGPRs_ = 0; - workGroupInfo_.usedSGPRs_ = 0; - workGroupInfo_.availableVGPRs_ = 0; - workGroupInfo_.usedVGPRs_ = 0; - workGroupInfo_.availableLDSSize_ = 0; - workGroupInfo_.usedLDSSize_ = 0; - workGroupInfo_.availableStackSize_ = 0; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.compileSizeHint_[0] = 0; - workGroupInfo_.compileSizeHint_[1] = 0; - workGroupInfo_.compileSizeHint_[2] = 0; - workGroupInfo_.compileVecTypeHint_ = ""; - workGroupInfo_.uniformWorkGroupSize_ = false; - workGroupInfo_.wavesPerSimdHint_ = 0; - } - - //! Default destructor - virtual ~Kernel(); - - //! Returns the kernel info structure - const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } - - //! Returns the kernel signature - const amd::KernelSignature& signature() const { return *signature_; } - - //! Returns the kernel name - const std::string& name() const { return name_; } - - //! Initializes the kernel parameters for the abstraction layer - bool createSignature( - const parameters_t& params, uint32_t numParameters, - uint32_t version); - - //! Returns TRUE if it's a HSA kernel - bool hsa() const { return hsa_; } - - void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; } - - bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; } - - void setReqdWorkGroupSize(size_t x, size_t y, size_t z) { - workGroupInfo_.compileSize_[0] = x; - workGroupInfo_.compileSize_[1] = y; - workGroupInfo_.compileSize_[2] = z; - } - - size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } - - void setWorkGroupSizeHint(size_t x, size_t y, size_t z) { - workGroupInfo_.compileSizeHint_[0] = x; - workGroupInfo_.compileSizeHint_[1] = y; - workGroupInfo_.compileSizeHint_[2] = z; - } - - size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } - - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) { - return NULL; - } - - virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const { - return 0; - } - - void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } - - void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } - - void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; } - - //! Return the build log - const std::string& buildLog() const { return buildLog_; } - - static std::string openclMangledName(const std::string& name); - - protected: - std::string name_; //!< kernel name - WorkGroupInfo workGroupInfo_; //!< device kernel info structure - amd::KernelSignature* signature_; //!< kernel signature - bool hsa_; //!< True if HSA kernel on GPU - std::string buildLog_; //!< build log - private: - //! Disable default copy constructor - Kernel(const Kernel&); - - //! Disable operator= - Kernel& operator=(const Kernel&); -}; - //! A program object for a specific device. class Program : public amd::HeapObject { public: @@ -1615,47 +1476,6 @@ class Device : public RuntimeObject { std::map* vaCacheMap_; //!< VA cache map }; -struct KernelParameterDescriptor { - enum { - Value = 0, - HiddenNone = 1, - HiddenGlobalOffsetX = 2, - HiddenGlobalOffsetY = 3, - HiddenGlobalOffsetZ = 4, - HiddenPrintfBuffer = 5, - HiddenDefaultQueue = 6, - HiddenCompletionAction = 7, - MemoryObject = 8, - ReferenceObject = 9, - ValueObject = 10, - ImageObject = 11, - SamplerObject = 12, - QueueObject = 13 - }; - clk_value_type_t type_; //!< The parameter's type - size_t offset_; //!< Its offset in the parameter's stack - size_t size_; //!< Its size in bytes - union InfoData { - struct { - uint32_t oclObject_ : 4; //!< OCL object type - uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only - uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA - uint32_t defined_ : 1; //!< The argument was defined by the app - uint32_t reserved_ : 1; //!< reserved - uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment - }; - uint32_t allValues_; - InfoData() : allValues_(0) {} - } info_; - - cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier - cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier - cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier - - std::string name_; //!< The parameter's name in the source - std::string typeName_; //!< Argument's type name -}; - #if defined(WITH_LIGHTNING_COMPILER) //! Compilation process with cache support. class CacheCompilation : public amd::HeapObject { diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp new file mode 100644 index 0000000000..16892f2fd0 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -0,0 +1,772 @@ +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. +// +#include "platform/runtime.hpp" +#include "platform/program.hpp" +#include "devkernel.hpp" +#include "utils/macros.hpp" +#include "utils/options.hpp" +#include "utils/bif_section_labels.hpp" +#include "utils/libUtils.h" + +#include +#include + +#include "acl.h" + +#if defined(WITH_LIGHTNING_COMPILER) +#include "llvm/Support/AMDGPUMetadata.h" + +typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; +#endif // defined(WITH_LIGHTNING_COMPILER) + +namespace device { + +bool Kernel::createSignature( + const parameters_t& params, uint32_t numParameters, + uint32_t version) { + std::stringstream attribs; + if (workGroupInfo_.compileSize_[0] != 0) { + attribs << "reqd_work_group_size("; + for (size_t i = 0; i < 3; ++i) { + if (i != 0) { + attribs << ","; + } + + attribs << workGroupInfo_.compileSize_[i]; + } + attribs << ")"; + } + if (workGroupInfo_.compileSizeHint_[0] != 0) { + attribs << " work_group_size_hint("; + for (size_t i = 0; i < 3; ++i) { + if (i != 0) { + attribs << ","; + } + + attribs << workGroupInfo_.compileSizeHint_[i]; + } + attribs << ")"; + } + + if (!workGroupInfo_.compileVecTypeHint_.empty()) { + attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")"; + } + + // Destroy old signature if it was allocated before + // (offline devices path) + delete signature_; + signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version); + if (NULL != signature_) { + return true; + } + return false; +} + +Kernel::~Kernel() { delete signature_; } + +std::string Kernel::openclMangledName(const std::string& name) { + const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); + assert(bifSym && "symbol not found"); + return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; +} + +void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, + const amd::Coord3D region, uint mapFlags, bool entire, + amd::Image* baseMip) { + // Map/Unmap must be serialized. + amd::ScopedLock lock(owner()->lockMemoryOps()); + + WriteMapInfo info = {}; + WriteMapInfo* pInfo = &info; + auto it = writeMapInfo_.find(mapAddress); + if (it != writeMapInfo_.end()) { + LogWarning("Double map of the same or overlapped region!"); + pInfo = &it->second; + } + + if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { + pInfo->origin_ = origin; + pInfo->region_ = region; + pInfo->entire_ = entire; + pInfo->unmapWrite_ = true; + } + if (mapFlags & CL_MAP_READ) { + pInfo->unmapRead_ = true; + } + pInfo->baseMip_ = baseMip; + + // Insert into the map if it's the first region + if (++pInfo->count_ == 1) { + writeMapInfo_.insert({ mapAddress, info }); + } +} + +#if defined(WITH_LIGHTNING_COMPILER) +using llvm::AMDGPU::HSAMD::AccessQualifier; +using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; +using llvm::AMDGPU::HSAMD::ValueKind; +using llvm::AMDGPU::HSAMD::ValueType; + +static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) { + switch (lcArg.mValueKind) { + case ValueKind::GlobalBuffer: + case ValueKind::DynamicSharedPointer: + case ValueKind::Pipe: + return amd::KernelParameterDescriptor::MemoryObject; + case ValueKind::ByValue: + return amd::KernelParameterDescriptor::ValueObject; + case ValueKind::Image: + return amd::KernelParameterDescriptor::ImageObject; + case ValueKind::Sampler: + return amd::KernelParameterDescriptor::SamplerObject; + case ValueKind::HiddenGlobalOffsetX: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; + case ValueKind::HiddenGlobalOffsetY: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; + case ValueKind::HiddenGlobalOffsetZ: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; + case ValueKind::HiddenPrintfBuffer: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenPrintfBuffer; + case ValueKind::HiddenDefaultQueue: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenDefaultQueue; + case ValueKind::HiddenCompletionAction: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenCompletionAction; + case ValueKind::HiddenNone: + default: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenNone; + } +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) { + if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { + *isHidden = true; + if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { + return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; + } + else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { + return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; + } + else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { + return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; + } + else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { + return amd::KernelParameterDescriptor::HiddenPrintfBuffer; + } + else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { + return amd::KernelParameterDescriptor::HiddenDefaultQueue; + } + else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { + return amd::KernelParameterDescriptor::HiddenCompletionAction; + } + return amd::KernelParameterDescriptor::HiddenNone; + } + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return amd::KernelParameterDescriptor::MemoryObject; + case ARG_TYPE_QUEUE: + return amd::KernelParameterDescriptor::QueueObject; + case ARG_TYPE_VALUE: + return (argInfo->arg.value.data == DATATYPE_struct) ? + amd::KernelParameterDescriptor::ReferenceObject : + amd::KernelParameterDescriptor::ValueObject; + case ARG_TYPE_IMAGE: + return amd::KernelParameterDescriptor::ImageObject; + case ARG_TYPE_SAMPLER: + return amd::KernelParameterDescriptor::SamplerObject; + case ARG_TYPE_ERROR: + default: + return amd::KernelParameterDescriptor::HiddenNone; +} +} +#endif + +static const clk_value_type_t ClkValueMapType[6][6] = { + { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, + { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, + { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, + { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, + { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, + { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, +}; + +#if defined(WITH_LIGHTNING_COMPILER) +static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) { + uint sizeType; + uint numElements; + + if (lcArg.mValueKind != ValueKind::ByValue) { + switch (lcArg.mValueKind) { + case ValueKind::GlobalBuffer: + case ValueKind::DynamicSharedPointer: + case ValueKind::Pipe: + case ValueKind::Image: + return T_POINTER; + case ValueKind::Sampler: + return T_SAMPLER; + default: + return T_VOID; + } + } + else { + switch (lcArg.mValueType) { + case ValueType::I8: + case ValueType::U8: + sizeType = 0; + numElements = size; + break; + case ValueType::I16: + case ValueType::U16: + sizeType = 1; + numElements = size / 2; + break; + case ValueType::I32: + case ValueType::U32: + sizeType = 2; + numElements = size / 4; + break; + case ValueType::I64: + case ValueType::U64: + sizeType = 3; + numElements = size / 8; + break; + case ValueType::F16: + sizeType = 4; + numElements = size / 2; + break; + case ValueType::F32: + sizeType = 4; + numElements = size / 4; + break; + case ValueType::F64: + sizeType = 5; + numElements = size / 8; + break; + case ValueType::Struct: + default: + return T_VOID; + } + switch (numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } + } + return T_VOID; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) { + uint sizeType; + uint numElements; + if (argInfo->type == ARG_TYPE_QUEUE) { + return T_QUEUE; + } + else if (argInfo->type == ARG_TYPE_POINTER || argInfo->type == ARG_TYPE_IMAGE) { + return T_POINTER; + } + else if (argInfo->type == ARG_TYPE_VALUE) { + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + sizeType = 0; + numElements = size; + break; + case DATATYPE_i16: + case DATATYPE_u16: + sizeType = 1; + numElements = size / 2; + break; + case DATATYPE_i32: + case DATATYPE_u32: + sizeType = 2; + numElements = size / 4; + break; + case DATATYPE_i64: + case DATATYPE_u64: + sizeType = 3; + numElements = size / 8; + break; + case DATATYPE_f16: + sizeType = 4; + numElements = size / 2; + break; + case DATATYPE_f32: + sizeType = 4; + numElements = size / 4; + break; + case DATATYPE_f64: + sizeType = 5; + numElements = size / 8; + break; + case DATATYPE_struct: + case DATATYPE_opaque: + case DATATYPE_ERROR: + default: + return T_VOID; + } + + switch (numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } + } + else if (argInfo->type == ARG_TYPE_SAMPLER) { + return T_SAMPLER; + } + else { + return T_VOID; + } +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; } +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + return 1; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8; + case DATATYPE_struct: + return 128; + case DATATYPE_ERROR: + default: + return -1; + } + case ARG_TYPE_IMAGE: + return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: + return sizeof(cl_sampler); + default: + return -1; + } +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + uint32_t align = lcArg.mPointeeAlign; + if (align == 0) { + LogWarning("Missing DynamicSharedPointer alignment"); + align = 128; /* worst case alignment */ + } + return align; + } + return 1; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + return argInfo->arg.pointer.align; + } + return 1; +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) { + if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) { + switch (lcArg.mAccQual) { + case AccessQualifier::ReadOnly: + return true; + case AccessQualifier::WriteOnly: + case AccessQualifier::ReadWrite: + default: + return false; + } + } + return false; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false; + } + else if (argInfo->type == ARG_TYPE_IMAGE) { + return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false; + } + return false; +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; } +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +inline static int GetArgSizeOCL(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + case DATATYPE_struct: + return 1 * argInfo->arg.value.numElements; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * argInfo->arg.value.numElements; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * argInfo->arg.value.numElements; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * argInfo->arg.value.numElements; + case DATATYPE_ERROR: + default: + return -1; + } + case ARG_TYPE_IMAGE: + case ARG_TYPE_SAMPLER: + case ARG_TYPE_QUEUE: + return sizeof(void*); + default: + return -1; + } +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + return CL_KERNEL_ARG_ADDRESS_LOCAL; + } + else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { + if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global || + lcArg.mAddrSpaceQual == AddressSpaceQualifier::Generic) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + } + LogError("Unsupported address type"); + return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } + else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Pipe) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + // default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + case PTR_MT_CONSTANT: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + case PTR_MT_UAV: + case PTR_MT_GLOBAL: + case PTR_MT_SCRATCH_EMU: + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + case PTR_MT_ERROR: + default: + LogError("Unsupported address type"); + return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } + } + else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + + // default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::Image) { + switch (lcArg.mAccQual) { + case AccessQualifier::ReadOnly: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case AccessQualifier::WriteOnly: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + case AccessQualifier::ReadWrite: + default: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + } + } + return CL_KERNEL_ARG_ACCESS_NONE; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_IMAGE) { + switch (argInfo->arg.image.type) { + case ACCESS_TYPE_RO: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case ACCESS_TYPE_WO: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + default: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + } + } + return CL_KERNEL_ARG_ACCESS_NONE; +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (lcArg.mValueKind == ValueKind::GlobalBuffer || + lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + if (lcArg.mIsVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + if (lcArg.mIsRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (lcArg.mIsConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + } + else if (lcArg.mIsPipe) { + assert(lcArg.mValueKind == ValueKind::Pipe); + rv |= CL_KERNEL_ARG_TYPE_PIPE; + } + return rv; +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (argInfo->type == ARG_TYPE_POINTER) { + if (argInfo->arg.pointer.isVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + if (argInfo->arg.pointer.isRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (argInfo->arg.pointer.isPipe) { + rv |= CL_KERNEL_ARG_TYPE_PIPE; + } + if (argInfo->isConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + rv |= CL_KERNEL_ARG_TYPE_CONST; + break; + default: + break; + } + } + return rv; +} +#endif + +#if defined(WITH_LIGHTNING_COMPILER) +void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) { + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + size_t offsetStruct = argBufferSize; + + for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { + const KernelArgMD& lcArg = kernelMD.mArgs[i]; + + size_t size = GetArgSizeOCL(lcArg); + size_t alignment = GetArgAlignmentOCL(lcArg); + bool isHidden = false; + desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden); + + // Allocate the hidden arguments, but abstraction layer will skip them + if (isHidden) { + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + desc.size_ = size; + offset += size; + hiddenParams.push_back(desc); + continue; + } + + desc.name_ = lcArg.mName.c_str(); + desc.type_ = GetOclTypeOCL(lcArg, size); + desc.typeName_ = lcArg.mTypeName.c_str(); + + desc.addressQualifier_ = GetOclAddrQualOCL(lcArg); + desc.accessQualifier_ = GetOclAccessQualOCL(lcArg); + desc.typeQualifier_ = GetOclTypeQualOCL(lcArg); + desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg); + desc.size_ = size; + + // These objects have forced data size to uint64_t + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } + else { + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + offset += size; + } + + // Update read only flag + desc.info_.readOnly_ = GetReadOnlyOCL(lcArg); + + params.push_back(desc); + + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + flags_.imageWriteEna_ = true; + } + } + } + + // Save the number of OCL arguments + uint32_t numParams = params.size(); + // Append the hidden arguments to the OCL arguments + params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); + createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); +} +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) +void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) { + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + size_t offsetStruct = argBufferSize; + + for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { + size_t size = GetArgSizeOCL(aclArg); + size_t alignment = GetArgAlignmentOCL(aclArg); + bool isHidden = false; + desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden); + + // Allocate the hidden arguments, but abstraction layer will skip them + if (isHidden) { + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + desc.size_ = size; + offset += size; + hiddenParams.push_back(desc); + continue; + } + + desc.name_ = aclArg->argStr; + desc.typeName_ = aclArg->typeStr; + desc.type_ = GetOclTypeOCL(aclArg, size); + + desc.addressQualifier_ = GetOclAddrQualOCL(aclArg); + desc.accessQualifier_ = GetOclAccessQualOCL(aclArg); + desc.typeQualifier_ = GetOclTypeQualOCL(aclArg); + desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg); + desc.size_ = size; + + // Check if HSAIL expects data by reference and allocate it behind + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { + desc.offset_ = offsetStruct; + // Align the offset reference + offset = amd::alignUp(offset, sizeof(size_t)); + patchReferences_.insert({ desc.offset_, offset }); + offsetStruct += size; + // Adjust the offset of arguments + offset += sizeof(size_t); + } + else { + // These objects have forced data size to uint64_t + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } + else { + offset = amd::alignUp(offset, alignment); + desc.offset_ = offset; + offset += size; + } + } + // Update read only flag + desc.info_.readOnly_ = GetReadOnlyOCL(aclArg); + + params.push_back(desc); + + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + flags_.imageWriteEna_ = true; + } + } + } + // Save the number of OCL arguments + uint32_t numParams = params.size(); + // Append the hidden arguments to the OCL arguments + params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); + createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); +} +#endif + +} \ No newline at end of file diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp new file mode 100644 index 0000000000..08066cd15b --- /dev/null +++ b/projects/clr/rocclr/runtime/device/devkernel.hpp @@ -0,0 +1,269 @@ +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. +// +#pragma once + +#include "include/aclTypes.h" +#include "platform/context.hpp" +#include "platform/object.hpp" +#include "platform/memory.hpp" + +#if defined(WITH_LIGHTNING_COMPILER) +namespace llvm { + namespace AMDGPU { + namespace HSAMD { + namespace Kernel { + struct Metadata; +}}}} +typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; +#endif // defined(WITH_LIGHTNING_COMPILER) + +namespace amd { + namespace hsa { + namespace loader { + class Symbol; + } // loader + namespace code { + namespace Kernel { + class Metadata; + } // Kernel + } // code + } // hsa +} // amd + +namespace amd { + +class Device; +class KernelSignature; + +struct ProfilingCallback : public amd::HeapObject { + virtual void callback(ulong duration, uint32_t waves) = 0; +}; + +struct KernelParameterDescriptor { + enum { + Value = 0, + HiddenNone = 1, + HiddenGlobalOffsetX = 2, + HiddenGlobalOffsetY = 3, + HiddenGlobalOffsetZ = 4, + HiddenPrintfBuffer = 5, + HiddenDefaultQueue = 6, + HiddenCompletionAction = 7, + MemoryObject = 8, + ReferenceObject = 9, + ValueObject = 10, + ImageObject = 11, + SamplerObject = 12, + QueueObject = 13 + }; + clk_value_type_t type_; //!< The parameter's type + size_t offset_; //!< Its offset in the parameter's stack + size_t size_; //!< Its size in bytes + union InfoData { + struct { + uint32_t oclObject_ : 4; //!< OCL object type + uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only + uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA + uint32_t defined_ : 1; //!< The argument was defined by the app + uint32_t reserved_ : 1; //!< reserved + uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment + }; + uint32_t allValues_; + InfoData() : allValues_(0) {} + } info_; + + cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier + cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier + cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier + + std::string name_; //!< The parameter's name in the source + std::string typeName_; //!< Argument's type name +}; + +} + +namespace device { + +//! \class DeviceKernel, which will contain the common fields for any device +class Kernel : public amd::HeapObject { + public: + typedef std::vector parameters_t; + + //! \struct The device kernel workgroup info structure + struct WorkGroupInfo : public amd::EmbeddedObject { + size_t size_; //!< kernel workgroup size + size_t compileSize_[3]; //!< kernel compiled workgroup size + cl_ulong localMemSize_; //!< amount of used local memory + size_t preferredSizeMultiple_; //!< preferred multiple for launch + cl_ulong privateMemSize_; //!< amount of used private memory + size_t scratchRegs_; //!< amount of used scratch registers + size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD + size_t wavefrontSize_; //!< number of threads per wavefront + size_t availableGPRs_; //!< GPRs available to the program + size_t usedGPRs_; //!< GPRs used by the program + size_t availableSGPRs_; //!< SGPRs available to the program + size_t usedSGPRs_; //!< SGPRs used by the program + size_t availableVGPRs_; //!< VGPRs available to the program + size_t usedVGPRs_; //!< VGPRs used by the program + size_t availableLDSSize_; //!< available LDS size + size_t usedLDSSize_; //!< used LDS size + size_t availableStackSize_; //!< available stack size + size_t usedStackSize_; //!< used stack size + size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint + std::string compileVecTypeHint_; //!< kernel compiled vector type hint + bool uniformWorkGroupSize_; //!< uniform work group size option + size_t wavesPerSimdHint_; //!< waves per simd hit + }; + + //! Default constructor + Kernel(const std::string& name) : name_(name), signature_(NULL) { + // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); + // Due to std::string not being able to be memset to 0 + workGroupInfo_.size_ = 0; + workGroupInfo_.compileSize_[0] = 0; + workGroupInfo_.compileSize_[1] = 0; + workGroupInfo_.compileSize_[2] = 0; + workGroupInfo_.localMemSize_ = 0; + workGroupInfo_.preferredSizeMultiple_ = 0; + workGroupInfo_.privateMemSize_ = 0; + workGroupInfo_.scratchRegs_ = 0; + workGroupInfo_.wavefrontPerSIMD_ = 0; + workGroupInfo_.wavefrontSize_ = 0; + workGroupInfo_.availableGPRs_ = 0; + workGroupInfo_.usedGPRs_ = 0; + workGroupInfo_.availableSGPRs_ = 0; + workGroupInfo_.usedSGPRs_ = 0; + workGroupInfo_.availableVGPRs_ = 0; + workGroupInfo_.usedVGPRs_ = 0; + workGroupInfo_.availableLDSSize_ = 0; + workGroupInfo_.usedLDSSize_ = 0; + workGroupInfo_.availableStackSize_ = 0; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.compileSizeHint_[0] = 0; + workGroupInfo_.compileSizeHint_[1] = 0; + workGroupInfo_.compileSizeHint_[2] = 0; + workGroupInfo_.compileVecTypeHint_ = ""; + workGroupInfo_.uniformWorkGroupSize_ = false; + workGroupInfo_.wavesPerSimdHint_ = 0; + } + + //! Default destructor + virtual ~Kernel(); + + //! Returns the kernel info structure + const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } + + //! Returns the kernel signature + const amd::KernelSignature& signature() const { return *signature_; } + + //! Returns the kernel name + const std::string& name() const { return name_; } + + //! Initializes the kernel parameters for the abstraction layer + bool createSignature( + const parameters_t& params, uint32_t numParameters, + uint32_t version); + + void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; } + + bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; } + + void setReqdWorkGroupSize(size_t x, size_t y, size_t z) { + workGroupInfo_.compileSize_[0] = x; + workGroupInfo_.compileSize_[1] = y; + workGroupInfo_.compileSize_[2] = z; + } + + size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } + + void setWorkGroupSizeHint(size_t x, size_t y, size_t z) { + workGroupInfo_.compileSizeHint_[0] = x; + workGroupInfo_.compileSizeHint_[1] = y; + workGroupInfo_.compileSizeHint_[2] = z; + } + + size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } + + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) { + return NULL; + } + + virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const { + return 0; + } + + void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } + + void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } + + void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; } + + //! Return the build log + const std::string& buildLog() const { return buildLog_; } + + static std::string openclMangledName(const std::string& name); + + const std::unordered_map& patch() const { return patchReferences_; } + + //! Returns TRUE if kernel uses dynamic parallelism + bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } + + //! set dynamic parallelism flag + void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; } + + //! Returns TRUE if kernel is internal kernel + bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } + + //! set internal kernel flag + void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } + + //! Return TRUE if kernel uses images + bool imageEnable() const { return (flags_.imageEna_) ? true : false; } + + //! Return TRUE if kernel wirtes images + bool imageWrite() const { return (flags_.imageWriteEna_) ? true : false; } + + //! Returns TRUE if it's a HSA kernel + bool hsa() const { return (flags_.hsa_) ? true : false; } + + protected: + //! Initializes the abstraction layer kernel parameters +#if defined(WITH_LIGHTNING_COMPILER) + void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize); +#endif +#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER) + void InitParameters( + const aclArgData* aclArg, //!< List of ACL arguments + uint32_t argBufferSize + ); +#endif + std::string name_; //!< kernel name + WorkGroupInfo workGroupInfo_; //!< device kernel info structure + amd::KernelSignature* signature_; //!< kernel signature + std::string buildLog_; //!< build log + + union Flags { + struct { + uint imageEna_ : 1; //!< Kernel uses images + uint imageWriteEna_ : 1; //!< Kernel uses image writes + uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled + uint internalKernel_ : 1; //!< True: internal kernel + uint hsa_ : 1; //!< HSA kernel + }; + uint value_; + Flags() : value_(0) {} + } flags_; + + private: + //! Disable default copy constructor + Kernel(const Kernel&); + + //! Disable operator= + Kernel& operator=(const Kernel&); + + std::unordered_map patchReferences_; //!< Patch table for references +}; + +} // namespace device \ No newline at end of file diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index d0a2377b61..dd26970033 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -823,7 +823,6 @@ Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& pro workGroupInfo_.privateMemSize_ = hwPrivateSize_; // Default wavesPerSimdHint_ workGroupInfo_.wavesPerSimdHint_ = ~0U; - hsa_ = false; } Kernel::~Kernel() { @@ -3127,7 +3126,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi extraArgumentsNum_(extraArgsNum), waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) * dev().hwInfo()->simdPerCU_) { - hsa_ = true; + flags_.hsa_ = true; } HSAILKernel::~HSAILKernel() { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index 029ecb1946..4eae7a7ef5 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel { //! Returns spill reg size per workitem int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; } - //! Returns TRUE if kernel uses dynamic parallelism - bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } - - //! Returns TRUE if kernel is internal kernel - bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } - //! Finds local workgroup size void findLocalWorkSize(size_t workDim, //!< Work dimension const amd::NDRange& gblWorkSize, //!< Global work size @@ -895,17 +889,6 @@ class HSAILKernel : public device::Kernel { uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments - union Flags { - struct { - uint imageEna_ : 1; //!< Kernel uses images - uint imageWriteEna_ : 1; //!< Kernel uses image writes - uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled - uint internalKernel_ : 1; //!< True: internal kernel - }; - uint value_; - Flags() : value_(0) {} - } flags_; - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index 7d8f4b2382..5c8ab4e71d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -21,654 +21,6 @@ namespace pal { -#if defined(WITH_LIGHTNING_COMPILER) -using llvm::AMDGPU::HSAMD::AccessQualifier; -using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; -using llvm::AMDGPU::HSAMD::ValueKind; -using llvm::AMDGPU::HSAMD::ValueType; - -static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) { - switch (lcArg.mValueKind) { - case ValueKind::GlobalBuffer: - case ValueKind::DynamicSharedPointer: - case ValueKind::Pipe: - return amd::KernelParameterDescriptor::MemoryObject; - case ValueKind::ByValue: - return amd::KernelParameterDescriptor::ValueObject; - case ValueKind::Image: - return amd::KernelParameterDescriptor::ImageObject; - case ValueKind::Sampler: - return amd::KernelParameterDescriptor::SamplerObject; - case ValueKind::HiddenGlobalOffsetX: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; - case ValueKind::HiddenGlobalOffsetY: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; - case ValueKind::HiddenGlobalOffsetZ: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; - case ValueKind::HiddenPrintfBuffer: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenPrintfBuffer; - case ValueKind::HiddenDefaultQueue: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenDefaultQueue; - case ValueKind::HiddenCompletionAction: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenCompletionAction; - case ValueKind::HiddenNone: - default: - *isHidden = true; - return amd::KernelParameterDescriptor::HiddenNone; - } -} -#else -static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) { - if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { - *isHidden = true; - if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { - return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; - } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { - return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; - } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { - return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; - } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { - return amd::KernelParameterDescriptor::HiddenPrintfBuffer; - } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { - return amd::KernelParameterDescriptor::HiddenDefaultQueue; - } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { - return amd::KernelParameterDescriptor::HiddenCompletionAction; - } - return amd::KernelParameterDescriptor::HiddenNone; - } - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return amd::KernelParameterDescriptor::MemoryObject; - case ARG_TYPE_QUEUE: - return amd::KernelParameterDescriptor::QueueObject; - case ARG_TYPE_VALUE: - return (argInfo->arg.value.data == DATATYPE_struct) ? - amd::KernelParameterDescriptor::ReferenceObject : - amd::KernelParameterDescriptor::ValueObject; - case ARG_TYPE_IMAGE: - return amd::KernelParameterDescriptor::ImageObject; - case ARG_TYPE_SAMPLER: - return amd::KernelParameterDescriptor::SamplerObject; - case ARG_TYPE_ERROR: - default: - return amd::KernelParameterDescriptor::HiddenNone; - } -} -#endif - -static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, -}; - -#if defined(WITH_LIGHTNING_COMPILER) -static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) { - uint sizeType; - uint numElements; - - if (lcArg.mValueKind != ValueKind::ByValue) { - switch (lcArg.mValueKind) { - case ValueKind::GlobalBuffer: - case ValueKind::DynamicSharedPointer: - case ValueKind::Pipe: - case ValueKind::Image: - return T_POINTER; - case ValueKind::Sampler: - return T_SAMPLER; - default: - return T_VOID; - } - } else { - switch (lcArg.mValueType) { - case ValueType::I8: - case ValueType::U8: - sizeType = 0; - numElements = size; - break; - case ValueType::I16: - case ValueType::U16: - sizeType = 1; - numElements = size / 2; - break; - case ValueType::I32: - case ValueType::U32: - sizeType = 2; - numElements = size / 4; - break; - case ValueType::I64: - case ValueType::U64: - sizeType = 3; - numElements = size / 8; - break; - case ValueType::F16: - sizeType = 4; - numElements = size / 2; - break; - case ValueType::F32: - sizeType = 4; - numElements = size / 4; - break; - case ValueType::F64: - sizeType = 5; - numElements = size / 8; - break; - case ValueType::Struct: - default: - return T_VOID; - } - switch (numElements) { - case 1: - return ClkValueMapType[sizeType][0]; - case 2: - return ClkValueMapType[sizeType][1]; - case 3: - return ClkValueMapType[sizeType][2]; - case 4: - return ClkValueMapType[sizeType][3]; - case 8: - return ClkValueMapType[sizeType][4]; - case 16: - return ClkValueMapType[sizeType][5]; - default: - return T_VOID; - } - } - return T_VOID; -} -#else -static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) { - uint sizeType; - uint numElements; - if (argInfo->type == ARG_TYPE_QUEUE) { - return T_QUEUE; - } - else if (argInfo->type == ARG_TYPE_POINTER || argInfo->type == ARG_TYPE_IMAGE) { - return T_POINTER; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - sizeType = 0; - numElements = size; - break; - case DATATYPE_i16: - case DATATYPE_u16: - sizeType = 1; - numElements = size / 2; - break; - case DATATYPE_i32: - case DATATYPE_u32: - sizeType = 2; - numElements = size / 4; - break; - case DATATYPE_i64: - case DATATYPE_u64: - sizeType = 3; - numElements = size / 8; - break; - case DATATYPE_f16: - sizeType = 4; - numElements = size / 2; - break; - case DATATYPE_f32: - sizeType = 4; - numElements = size / 4; - break; - case DATATYPE_f64: - sizeType = 5; - numElements = size / 8; - break; - case DATATYPE_struct: - case DATATYPE_opaque: - case DATATYPE_ERROR: - default: - return T_VOID; - } - - switch (numElements) { - case 1: - return ClkValueMapType[sizeType][0]; - case 2: - return ClkValueMapType[sizeType][1]; - case 3: - return ClkValueMapType[sizeType][2]; - case 4: - return ClkValueMapType[sizeType][3]; - case 8: - return ClkValueMapType[sizeType][4]; - case 16: - return ClkValueMapType[sizeType][5]; - default: - return T_VOID; - } - } - else if (argInfo->type == ARG_TYPE_SAMPLER) { - return T_SAMPLER; - } - else { - return T_VOID; - } -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; } -#else -static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) { - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return sizeof(void*); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - return 1; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8; - case DATATYPE_struct: - return 128; - case DATATYPE_ERROR: - default: - return -1; - } - case ARG_TYPE_IMAGE: - return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: - return sizeof(cl_sampler); - default: - return -1; - } -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - uint32_t align = lcArg.mPointeeAlign; - if (align == 0) { - LogWarning("Missing DynamicSharedPointer alignment"); - align = 128; /* worst case alignment */ - } - return align; - } - return 1; -} -#else -static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_POINTER) { - return argInfo->arg.pointer.align; - } - return 1; -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) { - if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return true; - case AccessQualifier::WriteOnly: - case AccessQualifier::ReadWrite: - default: - return false; - } - } - return false; -} -#else -static inline bool GetReadOnlyOCL(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_POINTER) { - return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false; - } else if (argInfo->type == ARG_TYPE_IMAGE) { - return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false; - } - return false; -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; } -#else -inline static int GetArgSizeOCL(const aclArgData* argInfo) { - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return sizeof(void*); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: - return -1; - } - case ARG_TYPE_IMAGE: - case ARG_TYPE_SAMPLER: - case ARG_TYPE_QUEUE: - return sizeof(void*); - default: - return -1; - } -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - return CL_KERNEL_ARG_ADDRESS_LOCAL; - } - else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { - if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - } - LogError("Unsupported address type"); - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Pipe) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - // default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} -#else -static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - case PTR_MT_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - case PTR_MT_SCRATCH_EMU: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - - // default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::Image) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case AccessQualifier::WriteOnly: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case AccessQualifier::ReadWrite: - default: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} -#else -static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_IMAGE) { - switch (argInfo->arg.image.type) { - case ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - default: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) { - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - if (lcArg.mIsVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (lcArg.mIsRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (lcArg.mIsConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - } - else if (lcArg.mIsPipe) { - assert(lcArg.mValueKind == ValueKind::Pipe); - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - return rv; -} -#else -static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) { - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->arg.pointer.isPipe) { - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } - } - return rv; -} -#endif - -#if defined(WITH_LIGHTNING_COMPILER) -void LightningKernel::initArgList(const KernelMD& kernelMD) { - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - device::Kernel::parameters_t hiddenParams; - amd::KernelParameterDescriptor desc; - size_t offset = 0; - size_t offsetStruct = argsBufferSize(); - - for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { - const KernelArgMD& lcArg = kernelMD.mArgs[i]; - - size_t size = GetArgSizeOCL(lcArg); - size_t alignment = GetArgAlignmentOCL(lcArg); - bool isHidden = false; - desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden); - - // Allocate the hidden arguments, but abstraction layer will skip them - if (isHidden) { - offset = amd::alignUp(offset, alignment); - desc.offset_ = offset; - desc.size_ = size; - offset += size; - hiddenParams.push_back(desc); - continue; - } - - desc.name_ = lcArg.mName.c_str(); - desc.type_ = GetOclTypeOCL(lcArg, size); - desc.typeName_ = lcArg.mTypeName.c_str(); - - desc.addressQualifier_ = GetOclAddrQualOCL(lcArg); - desc.accessQualifier_ = GetOclAccessQualOCL(lcArg); - desc.typeQualifier_ = GetOclTypeQualOCL(lcArg); - desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg); - desc.size_ = size; - - // These objects have forced data size to uint64_t - if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { - offset = amd::alignUp(offset, sizeof(uint64_t)); - desc.offset_ = offset; - offset += sizeof(uint64_t); - } - else { - offset = amd::alignUp(offset, alignment); - desc.offset_ = offset; - offset += size; - } - - // Update read only flag - desc.info_.readOnly_ = GetReadOnlyOCL(lcArg); - - params.push_back(desc); - - if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { - flags_.imageEna_ = true; - if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { - flags_.imageWriteEna_ = true; - } - } - } - - // Save the number of OCL arguments - uint32_t numParams = params.size(); - // Append the hidden arguments to the OCL arguments - params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); - createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); -} -#else -void HSAILKernel::initArgList(const aclArgData* aclArg) { - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - device::Kernel::parameters_t hiddenParams; - amd::KernelParameterDescriptor desc; - size_t offset = 0; - size_t offsetStruct = argsBufferSize(); - - for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - size_t size = GetArgSizeOCL(aclArg); - size_t alignment = GetArgAlignmentOCL(aclArg); - bool isHidden = false; - desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden); - - // Allocate the hidden arguments, but abstraction layer will skip them - if (isHidden) { - offset = amd::alignUp(offset, alignment); - desc.offset_ = offset; - desc.size_ = size; - offset += size; - hiddenParams.push_back(desc); - continue; - } - - desc.name_ = aclArg->argStr; - desc.typeName_ = aclArg->typeStr; - desc.type_ = GetOclTypeOCL(aclArg, size); - - desc.addressQualifier_ = GetOclAddrQualOCL(aclArg); - desc.accessQualifier_ = GetOclAccessQualOCL(aclArg); - desc.typeQualifier_ = GetOclTypeQualOCL(aclArg); - desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg); - desc.size_ = size; - - // Check if HSAIL expects data by reference and allocate it behind - if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { - desc.offset_ = offsetStruct; - // Align the offset reference - offset = amd::alignUp(offset, sizeof(size_t)); - patchReferences_.insert({ desc.offset_, offset }); - offsetStruct += size; - // Adjust the offset of arguments - offset += sizeof(size_t); - } - else { - // These objects have forced data size to uint64_t - if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { - offset = amd::alignUp(offset, sizeof(uint64_t)); - desc.offset_ = offset; - offset += sizeof(uint64_t); - } - else { - offset = amd::alignUp(offset, alignment); - desc.offset_ = offset; - offset += size; - } - } - // Update read only flag - desc.info_.readOnly_ = GetReadOnlyOCL(aclArg); - - params.push_back(desc); - - if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { - flags_.imageEna_ = true; - if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { - flags_.imageWriteEna_ = true; - } - } - } - // Save the number of OCL arguments - uint32_t numParams = params.size(); - // Append the hidden arguments to the OCL arguments - params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); - createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); -} -#endif - bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { if (!sym) { return false; @@ -796,7 +148,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi (prog->isNull() ? 1 : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_) { - hsa_ = true; + flags_.hsa_ = true; } HSAILKernel::~HSAILKernel() { @@ -849,7 +201,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { return false; } // Set the argList - initArgList(reinterpret_cast(aclArgList)); + InitParameters(reinterpret_cast(aclArgList), argsBufferSize()); delete[] aclArgList; size_t sizeOfWorkGroupSize; @@ -1324,7 +676,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { } // Set the argList - initArgList(*kernelMD); + InitParameters(*kernelMD, argsBufferSize()); if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index a22ae187bb..d2c855fe48 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel { //! Returns spill reg size per workitem int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); } - //! Returns TRUE if kernel uses dynamic parallelism - bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } - - //! Returns TRUE if kernel is internal kernel - bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } - //! Finds local workgroup size void findLocalWorkSize(size_t workDim, //!< Work dimension const amd::NDRange& gblWorkSize, //!< Global work size @@ -124,8 +118,6 @@ class HSAILKernel : public device::Kernel { return waveLimiter_.getWavesPerSH(vdev); }; - const std::unordered_map& patch() const { return patchReferences_; } - private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -137,10 +129,6 @@ class HSAILKernel : public device::Kernel { //! Creates AQL kernel HW info bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym); - //! Initializes the abstraction layer kernel parameters - void initArgList(const aclArgData* aclArg //!< List of ACL arguments - ); - //! Initializes Hsail Printf metadata and info void initPrintf(const aclPrintfFmt* aclPrintf //!< List of ACL printfs ); @@ -151,22 +139,10 @@ class HSAILKernel : public device::Kernel { const HSAILProgram& prog_; //!< Reference to the parent program std::vector printf_; //!< Format strings for GPU printf support uint index_; //!< Kernel index in the program - std::unordered_map patchReferences_; //!< Patch table for references uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code - union Flags { - struct { - uint imageEna_ : 1; //!< Kernel uses images - uint imageWriteEna_ : 1; //!< Kernel uses image writes - uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled - uint internalKernel_ : 1; //!< True: internal kernel - }; - uint value_; - Flags() : value_(0) {} - } flags_; - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; @@ -182,9 +158,6 @@ class LightningKernel : public HSAILKernel { //! Initializes the metadata required for this kernel, bool init(amd::hsa::loader::Symbol* symbol); - //! Initializes Hsail Argument metadata and info for LC - void initArgList(const KernelMD& kernelMD); - //! Initializes HSAIL Printf metadata and info for LC void initPrintf(const std::vector& printfInfoStrings); }; diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index 47268bc612..587ff9e8b1 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -11,749 +11,6 @@ namespace roc { -#if defined(WITH_LIGHTNING_COMPILER) - -using llvm::AMDGPU::HSAMD::AccessQualifier; -using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; -using llvm::AMDGPU::HSAMD::ValueKind; -using llvm::AMDGPU::HSAMD::ValueType; - -static inline ROC_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) { - switch (lcArg.mValueKind) { - case ValueKind::GlobalBuffer: - case ValueKind::DynamicSharedPointer: - case ValueKind::Pipe: - return ROC_ARGTYPE_POINTER; - case ValueKind::ByValue: - return ROC_ARGTYPE_VALUE; - case ValueKind::Image: - return ROC_ARGTYPE_IMAGE; - case ValueKind::Sampler: - return ROC_ARGTYPE_SAMPLER; - case ValueKind::Queue: - return ROC_ARGTYPE_QUEUE; - case ValueKind::HiddenGlobalOffsetX: - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; - case ValueKind::HiddenGlobalOffsetY: - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; - case ValueKind::HiddenGlobalOffsetZ: - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; - case ValueKind::HiddenPrintfBuffer: - return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; - case ValueKind::HiddenDefaultQueue: - return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; - case ValueKind::HiddenCompletionAction: - return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; - case ValueKind::HiddenNone: - return ROC_ARGTYPE_HIDDEN_NONE; - default: - return ROC_ARGTYPE_ERROR; - } -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline ROC_ARG_TYPE GetKernelArgType(const aclArgData* argInfo) { - if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { - if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; - } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; - } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; - } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { - return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; - } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { - return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; - } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { - return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; - } - return ROC_ARGTYPE_HIDDEN_NONE; - } - - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return ROC_ARGTYPE_POINTER; - case ARG_TYPE_VALUE: - return (argInfo->arg.value.data == DATATYPE_struct) ? ROC_ARGTYPE_REFERENCE - : ROC_ARGTYPE_VALUE; - case ARG_TYPE_IMAGE: - return ROC_ARGTYPE_IMAGE; - case ARG_TYPE_SAMPLER: - return ROC_ARGTYPE_SAMPLER; - case ARG_TYPE_QUEUE: - return ROC_ARGTYPE_QUEUE; - case ARG_TYPE_ERROR: - default: - return ROC_ARGTYPE_ERROR; - } -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; } -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline size_t GetKernelArgAlignment(const aclArgData* argInfo) { - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return sizeof(void*); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - return 1; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8; - case DATATYPE_struct: - return 128; - case DATATYPE_ERROR: - default: - return -1; - } - case ARG_TYPE_IMAGE: - return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: - return sizeof(cl_sampler); - default: - return -1; - } -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - uint32_t align = lcArg.mPointeeAlign; - if (align == 0) { - LogWarning("Missing DynamicSharedPointer alignment"); - align = 128; /* worst case alignment */ - ; - } - return align; - } - return 1; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline size_t GetKernelArgPointeeAlignment(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_POINTER) { - return argInfo->arg.pointer.align; - } - return 1; -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return ROC_ACCESS_TYPE_RO; - case AccessQualifier::WriteOnly: - return ROC_ACCESS_TYPE_WO; - case AccessQualifier::ReadWrite: - default: - return ROC_ACCESS_TYPE_RW; - } - } - return ROC_ACCESS_TYPE_NONE; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const aclArgData* argInfo) { - aclAccessType accessType; - - if (argInfo->type == ARG_TYPE_POINTER) { - accessType = argInfo->arg.pointer.type; - } else if (argInfo->type == ARG_TYPE_IMAGE) { - accessType = argInfo->arg.image.type; - } else { - return ROC_ACCESS_TYPE_NONE; - } - if (accessType == ACCESS_TYPE_RO) { - return ROC_ACCESS_TYPE_RO; - } else if (accessType == ACCESS_TYPE_WO) { - return ROC_ACCESS_TYPE_WO; - } - - return ROC_ACCESS_TYPE_RW; -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) { - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - return ROC_ADDRESS_LOCAL; - } else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { - if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global || lcArg.mAddrSpaceQual == AddressSpaceQualifier::Generic) { - return ROC_ADDRESS_GLOBAL; - } else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { - return ROC_ADDRESS_CONSTANT; - } - LogError("Unsupported address type"); - return ROC_ADDRESS_ERROR; - } else if (lcArg.mValueKind == ValueKind::Image || - lcArg.mValueKind == ValueKind::Sampler || - lcArg.mValueKind == ValueKind::Pipe) { - return ROC_ADDRESS_GLOBAL; - } - return ROC_ADDRESS_ERROR; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo) { - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT_EMU: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT: - return ROC_ADDRESS_CONSTANT; - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return ROC_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return ROC_ADDRESS_LOCAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return ROC_ADDRESS_ERROR; - } - } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) { - return ROC_ADDRESS_GLOBAL; - } - return ROC_ADDRESS_ERROR; -} - -inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) { - switch (arg->type_){ - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: - return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: - return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: - return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; - case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: - return amd::KernelParameterDescriptor::HiddenPrintfBuffer; - case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: - return amd::KernelParameterDescriptor::HiddenDefaultQueue; - case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: - return amd::KernelParameterDescriptor::HiddenCompletionAction; - case ROC_ARGTYPE_POINTER: - return amd::KernelParameterDescriptor::MemoryObject; - case ROC_ARGTYPE_IMAGE: - return amd::KernelParameterDescriptor::ImageObject; - case ROC_ARGTYPE_REFERENCE: - return amd::KernelParameterDescriptor::ReferenceObject; - case ROC_ARGTYPE_VALUE: - return amd::KernelParameterDescriptor::ValueObject; - case ROC_ARGTYPE_SAMPLER: - return amd::KernelParameterDescriptor::SamplerObject; - case ROC_ARGTYPE_QUEUE: - return amd::KernelParameterDescriptor::QueueObject; - default: - return amd::KernelParameterDescriptor::HiddenNone; - } -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) { - aclArgDataType dataType; - - if (lcArg.mValueKind != ValueKind::ByValue) { - return ROC_DATATYPE_ERROR; - } - - switch (lcArg.mValueType) { - case ValueType::I8: - return ROC_DATATYPE_S8; - case ValueType::I16: - return ROC_DATATYPE_S16; - case ValueType::I32: - return ROC_DATATYPE_S32; - case ValueType::I64: - return ROC_DATATYPE_S64; - case ValueType::U8: - return ROC_DATATYPE_U8; - case ValueType::U16: - return ROC_DATATYPE_U16; - case ValueType::U32: - return ROC_DATATYPE_U32; - case ValueType::U64: - return ROC_DATATYPE_U64; - case ValueType::F16: - return ROC_DATATYPE_F16; - case ValueType::F32: - return ROC_DATATYPE_F32; - case ValueType::F64: - return ROC_DATATYPE_F64; - case ValueType::Struct: - return ROC_DATATYPE_STRUCT; - default: - return ROC_DATATYPE_ERROR; - } -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -/* f16 returns f32 - workaround due to comp lib */ -static inline ROC_DATA_TYPE GetKernelDataType(const aclArgData* argInfo) { - aclArgDataType dataType; - - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = argInfo->arg.pointer.data; - } else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } else { - return ROC_DATATYPE_ERROR; - } - switch (dataType) { - case DATATYPE_i1: - return ROC_DATATYPE_B1; - case DATATYPE_i8: - return ROC_DATATYPE_S8; - case DATATYPE_i16: - return ROC_DATATYPE_S16; - case DATATYPE_i32: - return ROC_DATATYPE_S32; - case DATATYPE_i64: - return ROC_DATATYPE_S64; - case DATATYPE_u8: - return ROC_DATATYPE_U8; - case DATATYPE_u16: - return ROC_DATATYPE_U16; - case DATATYPE_u32: - return ROC_DATATYPE_U32; - case DATATYPE_u64: - return ROC_DATATYPE_U64; - case DATATYPE_f16: - return ROC_DATATYPE_F32; - case DATATYPE_f32: - return ROC_DATATYPE_F32; - case DATATYPE_f64: - return ROC_DATATYPE_F64; - case DATATYPE_struct: - return ROC_DATATYPE_STRUCT; - case DATATYPE_opaque: - return ROC_DATATYPE_OPAQUE; - case DATATYPE_ERROR: - default: - return ROC_DATATYPE_ERROR; - } -} - -static inline int GetKernelArgSize(const aclArgData* argInfo) { - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return sizeof(void*); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: - return -1; - } - case ARG_TYPE_IMAGE: - return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: - return sizeof(cl_sampler); - default: - return -1; - } -} - -static inline clk_value_type_t GetOclType(const Kernel::Argument* arg) { - static const clk_value_type_t ClkValueMapType[6][6] = { - {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, - {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16}, - {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16}, - {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16}, - {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16}, - {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16}, - }; - - uint sizeType; - uint numElements; - if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) { - return T_POINTER; - } else if (arg->type_ == ROC_ARGTYPE_VALUE || arg->type_ == ROC_ARGTYPE_REFERENCE) { - switch (arg->dataType_) { - case ROC_DATATYPE_S8: - case ROC_DATATYPE_U8: - sizeType = 0; - numElements = arg->size_; - break; - case ROC_DATATYPE_S16: - case ROC_DATATYPE_U16: - sizeType = 1; - numElements = arg->size_ / 2; - break; - case ROC_DATATYPE_S32: - case ROC_DATATYPE_U32: - sizeType = 2; - numElements = arg->size_ / 4; - break; - case ROC_DATATYPE_S64: - case ROC_DATATYPE_U64: - sizeType = 3; - numElements = arg->size_ / 8; - break; - case ROC_DATATYPE_F16: - sizeType = 4; - numElements = arg->size_ / 2; - break; - case ROC_DATATYPE_F32: - sizeType = 4; - numElements = arg->size_ / 4; - break; - case ROC_DATATYPE_F64: - sizeType = 5; - numElements = arg->size_ / 8; - break; - default: - return T_VOID; - } - - switch (numElements) { - case 1: - return ClkValueMapType[sizeType][0]; - case 2: - return ClkValueMapType[sizeType][1]; - case 3: - return ClkValueMapType[sizeType][2]; - case 4: - return ClkValueMapType[sizeType][3]; - case 8: - return ClkValueMapType[sizeType][4]; - case 16: - return ClkValueMapType[sizeType][5]; - default: - return T_VOID; - } - } else if (arg->type_ == ROC_ARGTYPE_SAMPLER) { - return T_SAMPLER; - } else if (arg->type_ == ROC_ARGTYPE_QUEUE) { - return T_QUEUE; - } else { - return T_VOID; - } -} - -static inline cl_kernel_arg_address_qualifier GetOclAddrQual(const Kernel::Argument* arg) { - if (arg->type_ == ROC_ARGTYPE_POINTER) { - switch (arg->addrQual_) { - case ROC_ADDRESS_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case ROC_ADDRESS_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case ROC_ADDRESS_LOCAL: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } else if (arg->type_ == ROC_ARGTYPE_IMAGE) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - // default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} - -static inline cl_kernel_arg_access_qualifier GetOclAccessQual(const Kernel::Argument* arg) { - if (arg->type_ == ROC_ARGTYPE_IMAGE) { - switch (arg->access_) { - case ROC_ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ROC_ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ROC_ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} - -#if defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) { - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - if (lcArg.mIsVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (lcArg.mIsRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (lcArg.mIsConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - } - else if (lcArg.mIsPipe) { - assert(lcArg.mValueKind == ValueKind::Pipe); - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - return rv; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) { - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } - } - return rv; -} - -#if defined(WITH_COMPILER_LIB) -void HSAILKernel::initArguments(const aclArgData* aclArg) { - device::Kernel::parameters_t params; - device::Kernel::parameters_t hiddenParams; - size_t offsetStruct = KernargSegmentByteSize(); - - // Iterate through the arguments and insert into parameterList - for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) { - // Initialize HSAIL kernel argument - Kernel::Argument* arg = new Kernel::Argument; - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetKernelArgSize(aclArg); - arg->type_ = GetKernelArgType(aclArg); - arg->addrQual_ = GetKernelAddrQual(aclArg); - arg->dataType_ = GetKernelDataType(aclArg); - arg->alignment_ = GetKernelArgAlignment(aclArg); - arg->access_ = GetKernelArgAccessType(aclArg); - arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(aclArg); - - bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || - arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || - arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || - arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER || - arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE || - arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; - - arg->index_ = isHidden ? uint(-1) : params.size(); - hsailArgList_.push_back(arg); - - amd::KernelParameterDescriptor desc; - - // Allocate the hidden arguments, but abstraction layer will skip them - if (isHidden) { - offset = amd::alignUp(offset, arg->alignment_); - desc.offset_ = offset; - desc.size_ = arg->size_; - offset += arg->size_; - desc.info_.oclObject_ = GetOclArgumentType(arg); - hiddenParams.push_back(desc); - continue; - } - - desc.name_ = arg->name_.c_str(); - desc.type_ = GetOclType(arg); - desc.addressQualifier_ = GetOclAddrQual(arg); - desc.accessQualifier_ = GetOclAccessQual(arg); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = arg->typeName_.c_str(); - desc.info_.oclObject_ = GetOclArgumentType(arg); - desc.info_.arrayIndex_ = arg->pointeeAlignment_; - - // set image related flags - if (arg->type_ == ROC_ARGTYPE_IMAGE) { - flags_.imageEnable_ = true; - if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY || - desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) { - flags_.imageWrite_ = true; - } - } - desc.size_ = arg->size_; - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the parameters as they are allocated in memory - size_t size = desc.size_; - - // Check if HSAIL expects data by reference and allocate it behind - if (arg->type_ == ROC_ARGTYPE_REFERENCE) { - desc.offset_ = offsetStruct; - // Align the offset reference - offset = amd::alignUp(offset, sizeof(size_t)); - patchReferences_.insert({desc.offset_, offset}); - offsetStruct += size; - // Adjust the offset of arguments - offset += sizeof(size_t); - } - else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { - // These objects have forced data size to uint64_t - offset = amd::alignUp(offset, sizeof(uint64_t)); - desc.offset_ = offset; - offset += sizeof(uint64_t); - } else { - offset = amd::alignUp(offset, arg->alignment_); - desc.offset_ = offset; - offset += size; - } - - // Update read only flag - desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false; - - params.push_back(desc); - } - - // Save the number of OCL arguments - uint32_t numParams = params.size(); - // Append the hidden arguments to the OCL arguments - params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); - createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); -} -#endif // defined(WITH_COMPILER_LIB) - -#if defined(WITH_LIGHTNING_COMPILER) -void LightningKernel::initArguments(const KernelMD& kernelMD) { - device::Kernel::parameters_t params; - device::Kernel::parameters_t hiddenParams; - size_t offsetStruct = KernargSegmentByteSize(); - - size_t offset = 0; - - for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { - const KernelArgMD& lcArg = kernelMD.mArgs[i]; - - // Initialize HSAIL kernel argument - Kernel::Argument* arg = new Kernel::Argument; - arg->name_ = lcArg.mName; - arg->typeName_ = lcArg.mTypeName; - arg->size_ = lcArg.mSize; - arg->type_ = GetKernelArgType(lcArg); - arg->addrQual_ = GetKernelAddrQual(lcArg); - arg->dataType_ = GetKernelDataType(lcArg); - arg->alignment_ = GetKernelArgAlignment(lcArg); - arg->access_ = GetKernelArgAccessType(lcArg); - arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); - - bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || - arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || - arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || - arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER || - arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE || - arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; - - arg->index_ = isHidden ? uint(-1) : params.size(); - hsailArgList_.push_back(arg); - - // Initialize Device kernel parameters - amd::KernelParameterDescriptor desc; - - if (isHidden) { - offset = amd::alignUp(offset, arg->alignment_); - desc.offset_ = offset; - desc.size_ = arg->size_; - offset += arg->size_; - desc.info_.oclObject_ = GetOclArgumentType(arg); - hiddenParams.push_back(desc); - continue; - } - - desc.name_ = lcArg.mName.c_str(); - desc.type_ = GetOclType(arg); - desc.addressQualifier_ = GetOclAddrQual(arg); - desc.accessQualifier_ = GetOclAccessQual(arg); - desc.typeQualifier_ = GetOclTypeQual(lcArg); - desc.typeName_ = lcArg.mTypeName.c_str(); - desc.info_.oclObject_ = GetOclArgumentType(arg); - desc.info_.arrayIndex_ = arg->pointeeAlignment_; - - // set image related flags - if (arg->type_ == ROC_ARGTYPE_IMAGE) { - flags_.imageEnable_ = true; - if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY || - desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) { - flags_.imageWrite_ = true; - } - } - - desc.size_ = arg->size_; - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the parameters as they are allocated in memory - size_t size = desc.size_; - - // Check if HSAIL expects data by reference and allocate it behind - if (arg->type_ == ROC_ARGTYPE_REFERENCE) { - desc.offset_ = offsetStruct; - // Align the offset reference - offset = amd::alignUp(offset, sizeof(size_t)); - patchReferences_.insert({desc.offset_, offset}); - offsetStruct += size; - // Adjust the offset of arguments - offset += sizeof(size_t); - } - else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || - (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { - // These objects have forced data size to uint64_t - offset = amd::alignUp(offset, sizeof(uint64_t)); - desc.offset_ = offset; - offset += sizeof(uint64_t); - } else { - offset = amd::alignUp(offset, arg->alignment_); - desc.offset_ = offset; - offset += size; - } - - // Update read only flag - desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false; - - params.push_back(desc); - } - - // Save the number of OCL arguments - uint32_t numParams = params.size(); - // Append the hidden arguments to the OCL arguments - params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); - createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); -} -#endif // defined(WITH_LIGHTNING_COMPILER) - Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, const uint32_t workgroupGroupSegmentByteSize, const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, @@ -787,7 +44,7 @@ bool LightningKernel::init() { if (kernelMD == nullptr) { return false; } - initArguments(*kernelMD); + InitParameters(*kernelMD, KernargSegmentByteSize()); // Set the workgroup information for the kernel workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; @@ -907,7 +164,7 @@ bool HSAILKernel::init() { } // Set the argList - initArguments((const aclArgData*)argList.get()); + InitParameters((const aclArgData*)argList.get(), KernargSegmentByteSize()); // Set the workgroup information for the kernel memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); @@ -1151,11 +408,6 @@ void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) { #endif // defined(WITH_COMPILER_LIB) Kernel::~Kernel() { - while (!hsailArgList_.empty()) { - Argument* kernelArgPointer = hsailArgList_.back(); - delete kernelArgPointer; - hsailArgList_.pop_back(); - } } } // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp index 0c1c0f7e18..72b2b962d7 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp @@ -15,77 +15,8 @@ namespace roc { #define MAX_INFO_STRING_LEN 0x40 -enum ROC_ARG_TYPE { - ROC_ARGTYPE_ERROR = 0, - ROC_ARGTYPE_POINTER, - ROC_ARGTYPE_VALUE, - ROC_ARGTYPE_REFERENCE, - ROC_ARGTYPE_IMAGE, - ROC_ARGTYPE_SAMPLER, - ROC_ARGTYPE_QUEUE, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, - ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER, - ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE, - ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION, - ROC_ARGTYPE_HIDDEN_NONE, - ROC_ARGMAX_ARG_TYPES -}; - -enum ROC_ADDRESS_QUALIFIER { - ROC_ADDRESS_ERROR = 0, - ROC_ADDRESS_GLOBAL, - ROC_ADDRESS_CONSTANT, - ROC_ADDRESS_LOCAL, - ROC_MAX_ADDRESS_QUALIFIERS -}; - -enum ROC_DATA_TYPE { - ROC_DATATYPE_ERROR = 0, - ROC_DATATYPE_B1, - ROC_DATATYPE_B8, - ROC_DATATYPE_B16, - ROC_DATATYPE_B32, - ROC_DATATYPE_B64, - ROC_DATATYPE_S8, - ROC_DATATYPE_S16, - ROC_DATATYPE_S32, - ROC_DATATYPE_S64, - ROC_DATATYPE_U8, - ROC_DATATYPE_U16, - ROC_DATATYPE_U32, - ROC_DATATYPE_U64, - ROC_DATATYPE_F16, - ROC_DATATYPE_F32, - ROC_DATATYPE_F64, - ROC_DATATYPE_STRUCT, - ROC_DATATYPE_OPAQUE, - ROC_DATATYPE_MAX_TYPES -}; - -enum ROC_ACCESS_TYPE { - ROC_ACCESS_TYPE_NONE = 0, - ROC_ACCESS_TYPE_RO, - ROC_ACCESS_TYPE_WO, - ROC_ACCESS_TYPE_RW -}; - class Kernel : public device::Kernel { public: - struct Argument { - uint index_; //!< Argument's index in the OCL signature - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint alignment_; //!< Argument's alignment - uint pointeeAlignment_; //!< Alignment of the data pointed to - ROC_ARG_TYPE type_; //!< Type of the argument - ROC_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - ROC_DATA_TYPE dataType_; //!< The type of data - ROC_ACCESS_TYPE access_; //!< Access type for the argument - }; - Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, const uint32_t workgroupGroupSegmentByteSize, const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, @@ -97,7 +28,7 @@ class Kernel : public device::Kernel { const uint32_t workitemPrivateSegmentByteSize() const { return workitemPrivateSegmentByteSize_; } - const uint64_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; } + const uint32_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; } const uint8_t KernargSegmentAlignment() const { return kernargSegmentAlignment_; } @@ -108,63 +39,18 @@ class Kernel : public device::Kernel { const Program* program() const { return static_cast(program_); } - //! Returns the kernel argument list - const std::vector& hsailArgs() const { return hsailArgList_; } - - //! Returns a pointer to the hsail argument at the specified index - Argument* hsailArgAt(size_t index) const { - for (auto arg : hsailArgList_) - if (arg->index_ == index) return arg; - assert(!"Should not reach here"); - return nullptr; - } - //! Return printf info array const std::vector& printfInfo() const { return printf_; } - //! Returns TRUE if kernel uses dynamic parallelism - bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } - - //! set dynamic parallelism flag - void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; } - - //! Return TRUE if kernel is internal blit kernel - bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } - - //! set internal kernel flag - void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } - - //! Return TRUE if kernel uses images - bool imageEnable() const { return (flags_.imageEnable_) ? true : false; } - - //! Return TRUE if kernel wirtes images - bool imageWrite() const { return (flags_.imageWrite_) ? true : false; } - - const std::unordered_map& patch() const { return patchReferences_; } - protected: - union Flags { - struct { - uint internalKernel_ : 1; //!< Is a blit kernel? - uint imageEnable_ : 1; //!< Kernel uses images - uint imageWrite_ : 1; //!< Kernel writes images - uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled - }; - uint value_; - Flags() : value_(0) {} - } flags_; - - - Program* program_; //!< The roc::Program context - std::vector hsailArgList_; //!< Vector list of HSAIL Arguments - uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) + Program* program_; //!< The roc::Program context + uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) const uint32_t workgroupGroupSegmentByteSize_; const uint32_t workitemPrivateSegmentByteSize_; const uint32_t kernargSegmentByteSize_; const uint32_t kernargSegmentAlignment_; size_t kernelDirectiveOffset_; std::vector printf_; - std::unordered_map patchReferences_; //!< Patch table for references }; #if defined(WITH_COMPILER_LIB) @@ -183,9 +69,6 @@ class HSAILKernel : public roc::Kernel { virtual bool init() final; private: - //! Populates hsailArgList_ - void initArguments(const aclArgData* aclArg); - //! Initializes HSAIL Printf metadata and info void initPrintf(const aclPrintfFmt* aclPrintf); }; @@ -206,9 +89,6 @@ class LightningKernel : public roc::Kernel { virtual bool init() final; private: - //! Initializes Hsail Argument metadata and info for LC - void initArguments(const KernelMD& kernelMD); - //! Initializes HSAIL Printf metadata and info for LC void initPrintf(const std::vector& printfInfoStrings); };