From 0eb2d3633288f2fac391eccdcaeec63b81c095d4 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 28 Aug 2018 17:30:29 -0400
Subject: [PATCH] P4 to Git Change 1599157 by gandryey@gera-ocl-lc on
2018/08/28 17:11:04
SWDEV-79445 - OCL generic changes and code clean-up
- Add devkernel.cpp/hpp files for device::Kernel object
- Move generic code for the arguments setup from the device layer to the abstraction layer
- Update ROCr and PAL paths to utilize the generic logic for the arguments setup
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#226 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#313 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#328 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#130 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#51 edit
[ROCm/clr commit: 5cfeb120ed0f4415dde69c2fedcc70d1519276b8]
---
projects/clr/rocclr/runtime/device/device.cpp | 80 --
projects/clr/rocclr/runtime/device/device.hpp | 184 +----
.../clr/rocclr/runtime/device/devkernel.cpp | 772 ++++++++++++++++++
.../clr/rocclr/runtime/device/devkernel.hpp | 269 ++++++
.../rocclr/runtime/device/gpu/gpukernel.cpp | 3 +-
.../rocclr/runtime/device/gpu/gpukernel.hpp | 17 -
.../rocclr/runtime/device/pal/palkernel.cpp | 654 +--------------
.../rocclr/runtime/device/pal/palkernel.hpp | 27 -
.../rocclr/runtime/device/rocm/rockernel.cpp | 752 +----------------
.../rocclr/runtime/device/rocm/rockernel.hpp | 126 +--
10 files changed, 1052 insertions(+), 1832 deletions(-)
create mode 100644 projects/clr/rocclr/runtime/device/devkernel.cpp
create mode 100644 projects/clr/rocclr/runtime/device/devkernel.hpp
diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index a40cd8c94d..acb52b0142 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -585,86 +585,6 @@ Settings::Settings() {
//!< concurrent Virtual GPUs for default
}
-bool Kernel::createSignature(
- const parameters_t& params, uint32_t numParameters,
- uint32_t version) {
- std::stringstream attribs;
- if (workGroupInfo_.compileSize_[0] != 0) {
- attribs << "reqd_work_group_size(";
- for (size_t i = 0; i < 3; ++i) {
- if (i != 0) {
- attribs << ",";
- }
-
- attribs << workGroupInfo_.compileSize_[i];
- }
- attribs << ")";
- }
- if (workGroupInfo_.compileSizeHint_[0] != 0) {
- attribs << " work_group_size_hint(";
- for (size_t i = 0; i < 3; ++i) {
- if (i != 0) {
- attribs << ",";
- }
-
- attribs << workGroupInfo_.compileSizeHint_[i];
- }
- attribs << ")";
- }
-
- if (!workGroupInfo_.compileVecTypeHint_.empty()) {
- attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")";
- }
-
- // Destroy old signature if it was allocated before
- // (offline devices path)
- delete signature_;
- signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version);
- if (NULL != signature_) {
- return true;
- }
- return false;
-}
-
-Kernel::~Kernel() { delete signature_; }
-
-std::string Kernel::openclMangledName(const std::string& name) {
- const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
- assert(bifSym && "symbol not found");
- return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
-}
-
-void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
- const amd::Coord3D region, uint mapFlags, bool entire,
- amd::Image* baseMip) {
- // Map/Unmap must be serialized.
- amd::ScopedLock lock(owner()->lockMemoryOps());
-
- WriteMapInfo info = {};
- WriteMapInfo* pInfo = &info;
- auto it = writeMapInfo_.find(mapAddress);
- if (it != writeMapInfo_.end()) {
- LogWarning("Double map of the same or overlapped region!");
- pInfo = &it->second;
- }
-
- if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
- pInfo->origin_ = origin;
- pInfo->region_ = region;
- pInfo->entire_ = entire;
- pInfo->unmapWrite_ = true;
- }
- if (mapFlags & CL_MAP_READ) {
- pInfo->unmapRead_ = true;
- }
- pInfo->baseMip_ = baseMip;
-
- // Insert into the map if it's the first region
- if (++pInfo->count_ == 1) {
- writeMapInfo_.insert({mapAddress, info});
- }
-}
-
Program::Program(amd::Device& device)
: device_(device),
type_(TYPE_NONE),
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 067b9ee9b7..5a30609600 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -15,6 +15,7 @@
#include "amdocl/cl_kernel.h"
#include "elf/elf.hpp"
#include "appprofile.hpp"
+#include "devkernel.hpp"
#if defined(WITH_LIGHTNING_COMPILER)
#include "caching/cache.hpp"
@@ -54,7 +55,6 @@ class PerfCounterCommand;
class ReleaseObjectCommand;
class StallQueueCommand;
class Marker;
-class KernelSignature;
class ThreadTraceCommand;
class ThreadTraceMemObjectsCommand;
class SignalCommand;
@@ -74,9 +74,6 @@ namespace option {
class Options;
} // option
-struct ProfilingCallback : public amd::HeapObject {
- virtual void callback(ulong duration, uint32_t waves) = 0;
-};
}
enum OclExtensions {
@@ -176,6 +173,7 @@ static constexpr int AmdVendor = 0x1002;
namespace device {
class ClBinary;
class BlitManager;
+class Kernel;
//! Physical device properties.
struct Info : public amd::EmbeddedObject {
@@ -776,143 +774,6 @@ class Sampler : public amd::HeapObject {
Sampler(const Sampler&);
};
-//! \class DeviceKernel, which will contain the common fields for any device
-class Kernel : public amd::HeapObject {
- public:
- typedef std::vector parameters_t;
-
- //! \struct The device kernel workgroup info structure
- struct WorkGroupInfo : public amd::EmbeddedObject {
- size_t size_; //!< kernel workgroup size
- size_t compileSize_[3]; //!< kernel compiled workgroup size
- cl_ulong localMemSize_; //!< amount of used local memory
- size_t preferredSizeMultiple_; //!< preferred multiple for launch
- cl_ulong privateMemSize_; //!< amount of used private memory
- size_t scratchRegs_; //!< amount of used scratch registers
- size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD
- size_t wavefrontSize_; //!< number of threads per wavefront
- size_t availableGPRs_; //!< GPRs available to the program
- size_t usedGPRs_; //!< GPRs used by the program
- size_t availableSGPRs_; //!< SGPRs available to the program
- size_t usedSGPRs_; //!< SGPRs used by the program
- size_t availableVGPRs_; //!< VGPRs available to the program
- size_t usedVGPRs_; //!< VGPRs used by the program
- size_t availableLDSSize_; //!< available LDS size
- size_t usedLDSSize_; //!< used LDS size
- size_t availableStackSize_; //!< available stack size
- size_t usedStackSize_; //!< used stack size
- size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint
- std::string compileVecTypeHint_; //!< kernel compiled vector type hint
- bool uniformWorkGroupSize_; //!< uniform work group size option
- size_t wavesPerSimdHint_; //!< waves per simd hit
- };
-
- //! Default constructor
- Kernel(const std::string& name) : name_(name), signature_(NULL), hsa_(false) {
- // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
- // Due to std::string not being able to be memset to 0
- workGroupInfo_.size_ = 0;
- workGroupInfo_.compileSize_[0] = 0;
- workGroupInfo_.compileSize_[1] = 0;
- workGroupInfo_.compileSize_[2] = 0;
- workGroupInfo_.localMemSize_ = 0;
- workGroupInfo_.preferredSizeMultiple_ = 0;
- workGroupInfo_.privateMemSize_ = 0;
- workGroupInfo_.scratchRegs_ = 0;
- workGroupInfo_.wavefrontPerSIMD_ = 0;
- workGroupInfo_.wavefrontSize_ = 0;
- workGroupInfo_.availableGPRs_ = 0;
- workGroupInfo_.usedGPRs_ = 0;
- workGroupInfo_.availableSGPRs_ = 0;
- workGroupInfo_.usedSGPRs_ = 0;
- workGroupInfo_.availableVGPRs_ = 0;
- workGroupInfo_.usedVGPRs_ = 0;
- workGroupInfo_.availableLDSSize_ = 0;
- workGroupInfo_.usedLDSSize_ = 0;
- workGroupInfo_.availableStackSize_ = 0;
- workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.compileSizeHint_[0] = 0;
- workGroupInfo_.compileSizeHint_[1] = 0;
- workGroupInfo_.compileSizeHint_[2] = 0;
- workGroupInfo_.compileVecTypeHint_ = "";
- workGroupInfo_.uniformWorkGroupSize_ = false;
- workGroupInfo_.wavesPerSimdHint_ = 0;
- }
-
- //! Default destructor
- virtual ~Kernel();
-
- //! Returns the kernel info structure
- const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; }
-
- //! Returns the kernel signature
- const amd::KernelSignature& signature() const { return *signature_; }
-
- //! Returns the kernel name
- const std::string& name() const { return name_; }
-
- //! Initializes the kernel parameters for the abstraction layer
- bool createSignature(
- const parameters_t& params, uint32_t numParameters,
- uint32_t version);
-
- //! Returns TRUE if it's a HSA kernel
- bool hsa() const { return hsa_; }
-
- void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; }
-
- bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; }
-
- void setReqdWorkGroupSize(size_t x, size_t y, size_t z) {
- workGroupInfo_.compileSize_[0] = x;
- workGroupInfo_.compileSize_[1] = y;
- workGroupInfo_.compileSize_[2] = z;
- }
-
- size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; }
-
- void setWorkGroupSizeHint(size_t x, size_t y, size_t z) {
- workGroupInfo_.compileSizeHint_[0] = x;
- workGroupInfo_.compileSizeHint_[1] = y;
- workGroupInfo_.compileSizeHint_[2] = z;
- }
-
- size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
-
- //! Get profiling callback object
- virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
- return NULL;
- }
-
- virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
- return 0;
- }
-
- void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
-
- void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
-
- void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; }
-
- //! Return the build log
- const std::string& buildLog() const { return buildLog_; }
-
- static std::string openclMangledName(const std::string& name);
-
- protected:
- std::string name_; //!< kernel name
- WorkGroupInfo workGroupInfo_; //!< device kernel info structure
- amd::KernelSignature* signature_; //!< kernel signature
- bool hsa_; //!< True if HSA kernel on GPU
- std::string buildLog_; //!< build log
- private:
- //! Disable default copy constructor
- Kernel(const Kernel&);
-
- //! Disable operator=
- Kernel& operator=(const Kernel&);
-};
-
//! A program object for a specific device.
class Program : public amd::HeapObject {
public:
@@ -1615,47 +1476,6 @@ class Device : public RuntimeObject {
std::map* vaCacheMap_; //!< VA cache map
};
-struct KernelParameterDescriptor {
- enum {
- Value = 0,
- HiddenNone = 1,
- HiddenGlobalOffsetX = 2,
- HiddenGlobalOffsetY = 3,
- HiddenGlobalOffsetZ = 4,
- HiddenPrintfBuffer = 5,
- HiddenDefaultQueue = 6,
- HiddenCompletionAction = 7,
- MemoryObject = 8,
- ReferenceObject = 9,
- ValueObject = 10,
- ImageObject = 11,
- SamplerObject = 12,
- QueueObject = 13
- };
- clk_value_type_t type_; //!< The parameter's type
- size_t offset_; //!< Its offset in the parameter's stack
- size_t size_; //!< Its size in bytes
- union InfoData {
- struct {
- uint32_t oclObject_ : 4; //!< OCL object type
- uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
- uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
- uint32_t defined_ : 1; //!< The argument was defined by the app
- uint32_t reserved_ : 1; //!< reserved
- uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
- };
- uint32_t allValues_;
- InfoData() : allValues_(0) {}
- } info_;
-
- cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier
- cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier
- cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier
-
- std::string name_; //!< The parameter's name in the source
- std::string typeName_; //!< Argument's type name
-};
-
#if defined(WITH_LIGHTNING_COMPILER)
//! Compilation process with cache support.
class CacheCompilation : public amd::HeapObject {
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
new file mode 100644
index 0000000000..16892f2fd0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -0,0 +1,772 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/runtime.hpp"
+#include "platform/program.hpp"
+#include "devkernel.hpp"
+#include "utils/macros.hpp"
+#include "utils/options.hpp"
+#include "utils/bif_section_labels.hpp"
+#include "utils/libUtils.h"
+
+#include
+#include
+
+#include "acl.h"
+
+#if defined(WITH_LIGHTNING_COMPILER)
+#include "llvm/Support/AMDGPUMetadata.h"
+
+typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
+namespace device {
+
+bool Kernel::createSignature(  // builds the attribute text and (re)creates signature_
+ const parameters_t& params, uint32_t numParameters,
+ uint32_t version) {
+ std::stringstream attribs;  // attribute string passed to the KernelSignature constructor
+ if (workGroupInfo_.compileSize_[0] != 0) {  // emitted only when compileSize_[0] is non-zero
+ attribs << "reqd_work_group_size(";
+ for (size_t i = 0; i < 3; ++i) {
+ if (i != 0) {
+ attribs << ",";  // comma between the three dimensions
+ }
+
+ attribs << workGroupInfo_.compileSize_[i];
+ }
+ attribs << ")";
+ }
+ if (workGroupInfo_.compileSizeHint_[0] != 0) {  // emitted only when compileSizeHint_[0] is non-zero
+ attribs << " work_group_size_hint(";
+ for (size_t i = 0; i < 3; ++i) {
+ if (i != 0) {
+ attribs << ",";
+ }
+
+ attribs << workGroupInfo_.compileSizeHint_[i];
+ }
+ attribs << ")";
+ }
+
+ if (!workGroupInfo_.compileVecTypeHint_.empty()) {
+ attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")";
+ }
+
+ // Release any signature created by an earlier call before replacing it
+ // (offline devices path)
+ delete signature_;
+ signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version);
+ if (NULL != signature_) {  // NOTE(review): only reachable as false if this operator new can return NULL - confirm
+ return true;
+ }
+ return false;
+}
+
+Kernel::~Kernel() { delete signature_; }  // frees the signature allocated by createSignature()
+
+std::string Kernel::openclMangledName(const std::string& name) {  // returns "&" + PRE + name + POST
+ const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);  // entry supplying the PRE/POST strings
+ assert(bifSym && "symbol not found");  // checked only when asserts are enabled
+ return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
+}
+
+void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,  // records map state keyed by mapAddress
+ const amd::Coord3D region, uint mapFlags, bool entire,
+ amd::Image* baseMip) {
+ // Map/Unmap must be serialized.
+ amd::ScopedLock lock(owner()->lockMemoryOps());
+
+ WriteMapInfo info = {};  // fresh record, used only when mapAddress is not tracked yet
+ WriteMapInfo* pInfo = &info;
+ auto it = writeMapInfo_.find(mapAddress);
+ if (it != writeMapInfo_.end()) {
+ LogWarning("Double map of the same or overlapped region!");
+ pInfo = &it->second;  // update the existing record in place
+ }
+
+ if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
+ pInfo->origin_ = origin;
+ pInfo->region_ = region;
+ pInfo->entire_ = entire;
+ pInfo->unmapWrite_ = true;
+ }
+ if (mapFlags & CL_MAP_READ) {
+ pInfo->unmapRead_ = true;
+ }
+ pInfo->baseMip_ = baseMip;  // stored regardless of the map flags
+
+ // Bump the map count; a count of 1 means the record must be inserted into the map
+ if (++pInfo->count_ == 1) {
+ writeMapInfo_.insert({ mapAddress, info });
+ }
+}
+
+#if defined(WITH_LIGHTNING_COMPILER)
+using llvm::AMDGPU::HSAMD::AccessQualifier;
+using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
+using llvm::AMDGPU::HSAMD::ValueKind;
+using llvm::AMDGPU::HSAMD::ValueType;
+
+static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) {  // maps mValueKind to a KernelParameterDescriptor object type
+ switch (lcArg.mValueKind) {
+ case ValueKind::GlobalBuffer:
+ case ValueKind::DynamicSharedPointer:
+ case ValueKind::Pipe:
+ return amd::KernelParameterDescriptor::MemoryObject;  // all three pointer-like kinds share one object type
+ case ValueKind::ByValue:
+ return amd::KernelParameterDescriptor::ValueObject;
+ case ValueKind::Image:
+ return amd::KernelParameterDescriptor::ImageObject;
+ case ValueKind::Sampler:
+ return amd::KernelParameterDescriptor::SamplerObject;
+ case ValueKind::HiddenGlobalOffsetX:
+ *isHidden = true;  // *isHidden is only ever set to true here; the caller must pre-initialize it to false
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+ case ValueKind::HiddenGlobalOffsetY:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+ case ValueKind::HiddenGlobalOffsetZ:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+ case ValueKind::HiddenPrintfBuffer:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+ case ValueKind::HiddenDefaultQueue:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+ case ValueKind::HiddenCompletionAction:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenCompletionAction;
+ case ValueKind::HiddenNone:
+ default:  // any kind not listed above is treated as a hidden argument
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenNone;
+ }
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {  // maps an ACL argument to a KernelParameterDescriptor object type
+ if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {  // names starting with "_." are treated as hidden arguments
+ *isHidden = true;  // never reset to false in this function; the caller must pre-initialize it
+ if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {  // compare the text after the "_." prefix
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+ }
+ else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+ }
+ else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+ }
+ else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
+ return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+ }
+ else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
+ return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+ }
+ else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
+ return amd::KernelParameterDescriptor::HiddenCompletionAction;
+ }
+ return amd::KernelParameterDescriptor::HiddenNone;  // hidden name that matches none of the strings above
+ }
+ switch (argInfo->type) {
+ case ARG_TYPE_POINTER:
+ return amd::KernelParameterDescriptor::MemoryObject;
+ case ARG_TYPE_QUEUE:
+ return amd::KernelParameterDescriptor::QueueObject;
+ case ARG_TYPE_VALUE:
+ return (argInfo->arg.value.data == DATATYPE_struct) ?  // struct values map to ReferenceObject, others to ValueObject
+ amd::KernelParameterDescriptor::ReferenceObject :
+ amd::KernelParameterDescriptor::ValueObject;
+ case ARG_TYPE_IMAGE:
+ return amd::KernelParameterDescriptor::ImageObject;
+ case ARG_TYPE_SAMPLER:
+ return amd::KernelParameterDescriptor::SamplerObject;
+ case ARG_TYPE_ERROR:
+ default:
+ return amd::KernelParameterDescriptor::HiddenNone;  // note: *isHidden stays unchanged on this path
+}
+}
+#endif
+
+static const clk_value_type_t ClkValueMapType[6][6] = {  // [sizeType row][vector-width column: 1,2,3,4,8,16], indexed by GetOclTypeOCL
+ { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },  // row 0
+ { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },  // row 1
+ { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },  // row 2
+ { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },  // row 3
+ { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },  // row 4
+ { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },  // row 5
+};
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {  // maps LC metadata to a clk_value_type_t; size is the argument size in bytes
+ uint sizeType;  // row index into ClkValueMapType
+ uint numElements;  // element count derived from size / element width
+
+ if (lcArg.mValueKind != ValueKind::ByValue) {
+ switch (lcArg.mValueKind) {
+ case ValueKind::GlobalBuffer:
+ case ValueKind::DynamicSharedPointer:
+ case ValueKind::Pipe:
+ case ValueKind::Image:
+ return T_POINTER;
+ case ValueKind::Sampler:
+ return T_SAMPLER;
+ default:
+ return T_VOID;  // every other non-ByValue kind
+ }
+ }
+ else {
+ switch (lcArg.mValueType) {
+ case ValueType::I8:
+ case ValueType::U8:
+ sizeType = 0;
+ numElements = size;
+ break;
+ case ValueType::I16:
+ case ValueType::U16:
+ sizeType = 1;
+ numElements = size / 2;
+ break;
+ case ValueType::I32:
+ case ValueType::U32:
+ sizeType = 2;
+ numElements = size / 4;
+ break;
+ case ValueType::I64:
+ case ValueType::U64:
+ sizeType = 3;
+ numElements = size / 8;
+ break;
+ case ValueType::F16:
+ sizeType = 4;  // F16 uses the same table row as F32; the table has no separate half row
+ numElements = size / 2;
+ break;
+ case ValueType::F32:
+ sizeType = 4;
+ numElements = size / 4;
+ break;
+ case ValueType::F64:
+ sizeType = 5;
+ numElements = size / 8;
+ break;
+ case ValueType::Struct:
+ default:
+ return T_VOID;
+ }
+ switch (numElements) {  // translate the element count into a table column
+ case 1:
+ return ClkValueMapType[sizeType][0];
+ case 2:
+ return ClkValueMapType[sizeType][1];
+ case 3:
+ return ClkValueMapType[sizeType][2];
+ case 4:
+ return ClkValueMapType[sizeType][3];
+ case 8:
+ return ClkValueMapType[sizeType][4];
+ case 16:
+ return ClkValueMapType[sizeType][5];
+ default:
+ return T_VOID;  // element counts other than 1,2,3,4,8,16 (including 0 from the default size)
+ }
+ }
+ return T_VOID;  // not reached: both branches above return
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {  // maps an ACL argument to a clk_value_type_t; size is the argument size in bytes
+ uint sizeType;  // row index into ClkValueMapType
+ uint numElements;  // element count derived from size / element width
+ if (argInfo->type == ARG_TYPE_QUEUE) {
+ return T_QUEUE;
+ }
+ else if (argInfo->type == ARG_TYPE_POINTER || argInfo->type == ARG_TYPE_IMAGE) {
+ return T_POINTER;
+ }
+ else if (argInfo->type == ARG_TYPE_VALUE) {
+ switch (argInfo->arg.value.data) {
+ case DATATYPE_i8:
+ case DATATYPE_u8:
+ sizeType = 0;
+ numElements = size;
+ break;
+ case DATATYPE_i16:
+ case DATATYPE_u16:
+ sizeType = 1;
+ numElements = size / 2;
+ break;
+ case DATATYPE_i32:
+ case DATATYPE_u32:
+ sizeType = 2;
+ numElements = size / 4;
+ break;
+ case DATATYPE_i64:
+ case DATATYPE_u64:
+ sizeType = 3;
+ numElements = size / 8;
+ break;
+ case DATATYPE_f16:
+ sizeType = 4;  // f16 uses the same table row as f32; the table has no separate half row
+ numElements = size / 2;
+ break;
+ case DATATYPE_f32:
+ sizeType = 4;
+ numElements = size / 4;
+ break;
+ case DATATYPE_f64:
+ sizeType = 5;
+ numElements = size / 8;
+ break;
+ case DATATYPE_struct:
+ case DATATYPE_opaque:
+ case DATATYPE_ERROR:
+ default:
+ return T_VOID;
+ }
+
+ switch (numElements) {  // translate the element count into a table column
+ case 1:
+ return ClkValueMapType[sizeType][0];
+ case 2:
+ return ClkValueMapType[sizeType][1];
+ case 3:
+ return ClkValueMapType[sizeType][2];
+ case 4:
+ return ClkValueMapType[sizeType][3];
+ case 8:
+ return ClkValueMapType[sizeType][4];
+ case 16:
+ return ClkValueMapType[sizeType][5];
+ default:
+ return T_VOID;  // element counts other than 1,2,3,4,8,16 (including 0 from the default size)
+ }
+ }
+ else if (argInfo->type == ARG_TYPE_SAMPLER) {
+ return T_SAMPLER;
+ }
+ else {
+ return T_VOID;
+ }
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }  // alignment is taken directly from the metadata
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {  // alignment in bytes for an ACL argument
+ switch (argInfo->type) {
+ case ARG_TYPE_POINTER:
+ return sizeof(void*);
+ case ARG_TYPE_VALUE:
+ switch (argInfo->arg.value.data) {
+ case DATATYPE_i8:
+ case DATATYPE_u8:
+ return 1;
+ case DATATYPE_u16:
+ case DATATYPE_i16:
+ case DATATYPE_f16:
+ return 2;
+ case DATATYPE_u32:
+ case DATATYPE_i32:
+ case DATATYPE_f32:
+ return 4;
+ case DATATYPE_i64:
+ case DATATYPE_u64:
+ case DATATYPE_f64:
+ return 8;
+ case DATATYPE_struct:
+ return 128;  // fixed alignment used for every struct argument
+ case DATATYPE_ERROR:
+ default:
+ return -1;  // converts to the maximum size_t value because the return type is unsigned
+ }
+ case ARG_TYPE_IMAGE:
+ return sizeof(cl_mem);
+ case ARG_TYPE_SAMPLER:
+ return sizeof(cl_sampler);
+ default:  // ARG_TYPE_QUEUE is not listed above and also lands here
+ return -1;  // converts to the maximum size_t value because the return type is unsigned
+ }
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {  // pointee alignment; only DynamicSharedPointer arguments carry one
+ if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
+ uint32_t align = lcArg.mPointeeAlign;
+ if (align == 0) {  // metadata did not supply an alignment
+ LogWarning("Missing DynamicSharedPointer alignment");
+ align = 128; /* worst case alignment */
+ }
+ return align;
+ }
+ return 1;  // every other kind reports an alignment of 1
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {  // pointee alignment for an ACL argument
+ if (argInfo->type == ARG_TYPE_POINTER) {
+ return argInfo->arg.pointer.align;  // returned as-is for any pointer argument
+ }
+ return 1;  // non-pointer arguments report an alignment of 1
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {  // true only for GlobalBuffer/Image arguments qualified ReadOnly
+ if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
+ switch (lcArg.mAccQual) {
+ case AccessQualifier::ReadOnly:
+ return true;
+ case AccessQualifier::WriteOnly:
+ case AccessQualifier::ReadWrite:
+ default:
+ return false;
+ }
+ }
+ return false;  // other value kinds are never reported as read-only
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {  // true only for pointer/image arguments with ACCESS_TYPE_RO
+ if (argInfo->type == ARG_TYPE_POINTER) {
+ return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false;
+ }
+ else if (argInfo->type == ARG_TYPE_IMAGE) {
+ return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false;
+ }
+ return false;  // other argument types are never reported as read-only
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; }  // size is taken directly from the metadata
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+inline static int GetArgSizeOCL(const aclArgData* argInfo) {  // size in bytes of an ACL argument, or -1 if the type is not recognized
+ switch (argInfo->type) {
+ case ARG_TYPE_POINTER:
+ return sizeof(void*);
+ case ARG_TYPE_VALUE:
+ switch (argInfo->arg.value.data) {
+ case DATATYPE_i8:
+ case DATATYPE_u8:
+ case DATATYPE_struct:  // struct size is numElements * 1, i.e. numElements is used as a byte count
+ return 1 * argInfo->arg.value.numElements;
+ case DATATYPE_u16:
+ case DATATYPE_i16:
+ case DATATYPE_f16:
+ return 2 * argInfo->arg.value.numElements;
+ case DATATYPE_u32:
+ case DATATYPE_i32:
+ case DATATYPE_f32:
+ return 4 * argInfo->arg.value.numElements;
+ case DATATYPE_i64:
+ case DATATYPE_u64:
+ case DATATYPE_f64:
+ return 8 * argInfo->arg.value.numElements;
+ case DATATYPE_ERROR:
+ default:
+ return -1;  // InitParameters stores the result in a size_t, where -1 becomes a very large value
+ }
+ case ARG_TYPE_IMAGE:
+ case ARG_TYPE_SAMPLER:
+ case ARG_TYPE_QUEUE:
+ return sizeof(void*);
+ default:
+ return -1;  // InitParameters stores the result in a size_t, where -1 becomes a very large value
+ }
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {  // derives the CL address qualifier from LC metadata
+ if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
+ return CL_KERNEL_ARG_ADDRESS_LOCAL;
+ }
+ else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
+ if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global ||
+ lcArg.mAddrSpaceQual == AddressSpaceQualifier::Generic) {  // Generic is reported as global
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ }
+ else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
+ return CL_KERNEL_ARG_ADDRESS_CONSTANT;
+ }
+ LogError("Unsupported address type");
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;  // fallback after logging the error
+ }
+ else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Pipe) {
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ }
+ // all remaining value kinds are reported as private
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {  // derives the CL address qualifier from ACL argument data
+ if (argInfo->type == ARG_TYPE_POINTER) {
+ switch (argInfo->arg.pointer.memory) {
+ case PTR_MT_UAV_CONSTANT:
+ case PTR_MT_CONSTANT_EMU:
+ case PTR_MT_CONSTANT:
+ return CL_KERNEL_ARG_ADDRESS_CONSTANT;
+ case PTR_MT_UAV:
+ case PTR_MT_GLOBAL:
+ case PTR_MT_SCRATCH_EMU:
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ case PTR_MT_LDS_EMU:
+ case PTR_MT_LDS:
+ return CL_KERNEL_ARG_ADDRESS_LOCAL;
+ case PTR_MT_ERROR:
+ default:
+ LogError("Unsupported address type");
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;  // fallback after logging the error
+ }
+ }
+ else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) {
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ }
+
+ // all remaining argument types are reported as private
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {  // access qualifier; only Image arguments get one
+ if (lcArg.mValueKind == ValueKind::Image) {
+ switch (lcArg.mAccQual) {
+ case AccessQualifier::ReadOnly:
+ return CL_KERNEL_ARG_ACCESS_READ_ONLY;
+ case AccessQualifier::WriteOnly:
+ return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+ case AccessQualifier::ReadWrite:
+ default:  // any other qualifier value is reported as read-write
+ return CL_KERNEL_ARG_ACCESS_READ_WRITE;
+ }
+ }
+ return CL_KERNEL_ARG_ACCESS_NONE;  // non-image arguments
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {  // access qualifier; only image arguments get one
+ if (argInfo->type == ARG_TYPE_IMAGE) {
+ switch (argInfo->arg.image.type) {
+ case ACCESS_TYPE_RO:
+ return CL_KERNEL_ARG_ACCESS_READ_ONLY;
+ case ACCESS_TYPE_WO:
+ return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+ default:  // any other access type is reported as read-write
+ return CL_KERNEL_ARG_ACCESS_READ_WRITE;
+ }
+ }
+ return CL_KERNEL_ARG_ACCESS_NONE;  // non-image arguments
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {  // builds the CL type-qualifier bit mask from LC metadata
+ cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
+ if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
+ lcArg.mValueKind == ValueKind::DynamicSharedPointer) {  // volatile/restrict/const are considered for these two kinds only
+ if (lcArg.mIsVolatile) {
+ rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
+ }
+ if (lcArg.mIsRestrict) {
+ rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
+ }
+ if (lcArg.mIsConst) {
+ rv |= CL_KERNEL_ARG_TYPE_CONST;
+ }
+ }
+ else if (lcArg.mIsPipe) {  // checked only when the kind is neither of the two above
+ assert(lcArg.mValueKind == ValueKind::Pipe);  // mIsPipe is expected to coincide with the Pipe kind
+ rv |= CL_KERNEL_ARG_TYPE_PIPE;
+ }
+ return rv;
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {  // builds the CL type-qualifier bit mask from ACL argument data
+ cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
+ if (argInfo->type == ARG_TYPE_POINTER) {  // only pointer arguments receive qualifiers
+ if (argInfo->arg.pointer.isVolatile) {
+ rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
+ }
+ if (argInfo->arg.pointer.isRestrict) {
+ rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
+ }
+ if (argInfo->arg.pointer.isPipe) {
+ rv |= CL_KERNEL_ARG_TYPE_PIPE;
+ }
+ if (argInfo->isConst) {
+ rv |= CL_KERNEL_ARG_TYPE_CONST;
+ }
+ switch (argInfo->arg.pointer.memory) {  // constant memory types also set the const bit
+ case PTR_MT_CONSTANT:
+ case PTR_MT_UAV_CONSTANT:
+ case PTR_MT_CONSTANT_EMU:
+ rv |= CL_KERNEL_ARG_TYPE_CONST;
+ break;
+ default:
+ break;
+ }
+ }
+ return rv;
+}
+#endif
+
+#if defined(WITH_LIGHTNING_COMPILER)
+void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {  // builds parameter descriptors from LC metadata and creates the signature
+ // Walk the metadata arguments, collecting visible and hidden descriptors separately
+ device::Kernel::parameters_t params;  // arguments visible to the application
+ device::Kernel::parameters_t hiddenParams;  // hidden arguments, appended after the visible ones
+ amd::KernelParameterDescriptor desc;  // reused each iteration; fields not assigned below keep earlier values
+ size_t offset = 0;  // running offset shared by visible and hidden arguments
+ size_t offsetStruct = argBufferSize;  // NOTE(review): not read anywhere in this overload
+
+ for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
+ const KernelArgMD& lcArg = kernelMD.mArgs[i];
+
+ size_t size = GetArgSizeOCL(lcArg);
+ size_t alignment = GetArgAlignmentOCL(lcArg);
+ bool isHidden = false;  // the helper below only ever sets this to true
+ desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
+
+ // Hidden arguments still consume space in the layout, but only offset and size are recorded
+ if (isHidden) {
+ offset = amd::alignUp(offset, alignment);
+ desc.offset_ = offset;
+ desc.size_ = size;
+ offset += size;
+ hiddenParams.push_back(desc);
+ continue;
+ }
+
+ desc.name_ = lcArg.mName.c_str();
+ desc.type_ = GetOclTypeOCL(lcArg, size);
+ desc.typeName_ = lcArg.mTypeName.c_str();
+
+ desc.addressQualifier_ = GetOclAddrQualOCL(lcArg);
+ desc.accessQualifier_ = GetOclAccessQualOCL(lcArg);
+ desc.typeQualifier_ = GetOclTypeQualOCL(lcArg);
+ desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg);  // the pointee alignment is stored in arrayIndex_
+ desc.size_ = size;
+
+ // Image, sampler and queue objects always occupy an aligned uint64_t slot
+ if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);  // desc.size_ keeps the metadata size; only the layout uses 8 bytes
+ }
+ else {
+ offset = amd::alignUp(offset, alignment);
+ desc.offset_ = offset;
+ offset += size;
+ }
+
+ // Record whether the argument is read-only
+ desc.info_.readOnly_ = GetReadOnlyOCL(lcArg);
+
+ params.push_back(desc);
+
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+ flags_.imageEna_ = true;  // the kernel takes at least one image argument
+ if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
+ flags_.imageWriteEna_ = true;  // at least one image argument is not read-only
+ }
+ }
+ }
+
+ // Count of visible arguments, taken before the hidden ones are appended
+ uint32_t numParams = params.size();
+ // Hidden descriptors follow the visible ones in the combined list
+ params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+ createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);  // return value is not checked here
+}
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {  // builds parameter descriptors from an ACL argument array and creates the signature
+ // Walk the ACL arguments, collecting visible and hidden descriptors separately
+ device::Kernel::parameters_t params;  // arguments visible to the application
+ device::Kernel::parameters_t hiddenParams;  // hidden arguments, appended after the visible ones
+ amd::KernelParameterDescriptor desc;  // reused each iteration; fields not assigned below keep earlier values
+ size_t offset = 0;  // running offset shared by visible and hidden arguments
+ size_t offsetStruct = argBufferSize;  // by-reference struct data is placed starting at argBufferSize
+
+ for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {  // the array ends at an entry whose struct_size is 0
+ size_t size = GetArgSizeOCL(aclArg);
+ size_t alignment = GetArgAlignmentOCL(aclArg);
+ bool isHidden = false;  // the helper below only ever sets this to true
+ desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden);
+
+ // Hidden arguments still consume space in the layout, but only offset and size are recorded
+ if (isHidden) {
+ offset = amd::alignUp(offset, alignment);
+ desc.offset_ = offset;
+ desc.size_ = size;
+ offset += size;
+ hiddenParams.push_back(desc);
+ continue;
+ }
+
+ desc.name_ = aclArg->argStr;
+ desc.typeName_ = aclArg->typeStr;
+ desc.type_ = GetOclTypeOCL(aclArg, size);
+
+ desc.addressQualifier_ = GetOclAddrQualOCL(aclArg);
+ desc.accessQualifier_ = GetOclAccessQualOCL(aclArg);
+ desc.typeQualifier_ = GetOclTypeQualOCL(aclArg);
+ desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg);  // the pointee alignment is stored in arrayIndex_
+ desc.size_ = size;
+
+ // Check if HSAIL expects data by reference and allocate it behind
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
+ desc.offset_ = offsetStruct;  // the data lives in the region starting at argBufferSize
+ // Align the slot that holds the reference
+ offset = amd::alignUp(offset, sizeof(size_t));
+ patchReferences_.insert({ desc.offset_, offset });  // maps the data offset to the offset of its reference slot
+ offsetStruct += size;
+ // Reserve a size_t-sized slot for the reference in the argument layout
+ offset += sizeof(size_t);
+ }
+ else {
+ // Image, sampler and queue objects always occupy an aligned uint64_t slot
+ if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);
+ }
+ else {
+ offset = amd::alignUp(offset, alignment);
+ desc.offset_ = offset;
+ offset += size;
+ }
+ }
+ // Record whether the argument is read-only
+ desc.info_.readOnly_ = GetReadOnlyOCL(aclArg);
+
+ params.push_back(desc);
+
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+ flags_.imageEna_ = true;  // the kernel takes at least one image argument
+ if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
+ flags_.imageWriteEna_ = true;  // at least one image argument is not read-only
+ }
+ }
+ }
+ // Count of visible arguments, taken before the hidden ones are appended
+ uint32_t numParams = params.size();
+ // Hidden descriptors follow the visible ones in the combined list
+ params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+ createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);  // return value is not checked here
+}
+#endif
+
+}
\ No newline at end of file
diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp
new file mode 100644
index 0000000000..08066cd15b
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/devkernel.hpp
@@ -0,0 +1,269 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+#pragma once
+
+#include "include/aclTypes.h"
+#include "platform/context.hpp"
+#include "platform/object.hpp"
+#include "platform/memory.hpp"
+
+#if defined(WITH_LIGHTNING_COMPILER)
+namespace llvm {
+ namespace AMDGPU {
+ namespace HSAMD {
+ namespace Kernel {
+ struct Metadata; // forward declaration; this header only refers to the type by reference
+}}}} // namespace llvm::AMDGPU::HSAMD::Kernel
+typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; // alias used by device::Kernel::InitParameters()
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
+namespace amd {
+ namespace hsa {
+ namespace loader {
+ class Symbol; // forward declaration only
+ } // namespace loader
+ namespace code {
+ namespace Kernel {
+ class Metadata; // forward declaration only
+ } // namespace Kernel
+ } // namespace code
+ } // namespace hsa
+} // namespace amd
+
+namespace amd {
+
+class Device;
+class KernelSignature;
+
+struct ProfilingCallback : public amd::HeapObject { // Abstract interface: implementers override callback()
+ virtual void callback(ulong duration, uint32_t waves) = 0; //!< Receives a duration and a wave count (units not visible here - TODO confirm)
+};
+
+struct KernelParameterDescriptor { // Describes one kernel argument: type, placement, qualifiers and names
+ enum { // Argument kinds stored in info_.oclObject_ (a 4-bit field, so values must stay below 16)
+ Value = 0,
+ HiddenNone = 1, //!< Hidden argument without a specific role
+ HiddenGlobalOffsetX = 2, //!< Hidden global offset, dimension X
+ HiddenGlobalOffsetY = 3, //!< Hidden global offset, dimension Y
+ HiddenGlobalOffsetZ = 4, //!< Hidden global offset, dimension Z
+ HiddenPrintfBuffer = 5, //!< Hidden printf buffer
+ HiddenDefaultQueue = 6, //!< Hidden default queue
+ HiddenCompletionAction = 7, //!< Hidden completion action
+ MemoryObject = 8, //!< Pointer argument (global buffer, dynamic shared pointer or pipe)
+ ReferenceObject = 9, //!< Struct passed by reference (HSAIL path); its data is placed behind the arguments
+ ValueObject = 10, //!< Argument passed by value
+ ImageObject = 11, //!< Image argument
+ SamplerObject = 12, //!< Sampler argument
+ QueueObject = 13 //!< Queue argument
+ };
+ clk_value_type_t type_; //!< The parameter's type
+ size_t offset_; //!< Its offset in the parameter's stack
+ size_t size_; //!< Its size in bytes
+ union InfoData { // Packed per-argument flags; allValues_ aliases all of the bitfields below
+ struct {
+ uint32_t oclObject_ : 4; //!< OCL object type
+ uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
+ uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
+ uint32_t defined_ : 1; //!< The argument was defined by the app
+ uint32_t reserved_ : 1; //!< reserved
+ uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
+ };
+ uint32_t allValues_; //!< All bitfields viewed as a single 32-bit value
+ InfoData() : allValues_(0) {} // Clears every flag at construction
+ } info_;
+
+ cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier
+ cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier
+ cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier
+
+ std::string name_; //!< The parameter's name in the source
+ std::string typeName_; //!< Argument's type name
+};
+
+}
+
+namespace device {
+
+//! \class Kernel Base device kernel, which contains the common fields for any device
+class Kernel : public amd::HeapObject {
+ public:
+  typedef std::vector<amd::KernelParameterDescriptor> parameters_t;
+
+  //! \struct WorkGroupInfo The device kernel workgroup info structure
+  struct WorkGroupInfo : public amd::EmbeddedObject {
+    size_t size_;                     //!< kernel workgroup size
+    size_t compileSize_[3];           //!< kernel compiled workgroup size
+    cl_ulong localMemSize_;           //!< amount of used local memory
+    size_t preferredSizeMultiple_;    //!< preferred multiple for launch
+    cl_ulong privateMemSize_;         //!< amount of used private memory
+    size_t scratchRegs_;              //!< amount of used scratch registers
+    size_t wavefrontPerSIMD_;         //!< number of wavefronts per SIMD
+    size_t wavefrontSize_;            //!< number of threads per wavefront
+    size_t availableGPRs_;            //!< GPRs available to the program
+    size_t usedGPRs_;                 //!< GPRs used by the program
+    size_t availableSGPRs_;           //!< SGPRs available to the program
+    size_t usedSGPRs_;                //!< SGPRs used by the program
+    size_t availableVGPRs_;           //!< VGPRs available to the program
+    size_t usedVGPRs_;                //!< VGPRs used by the program
+    size_t availableLDSSize_;         //!< available LDS size
+    size_t usedLDSSize_;              //!< used LDS size
+    size_t availableStackSize_;       //!< available stack size
+    size_t usedStackSize_;            //!< used stack size
+    size_t compileSizeHint_[3];       //!< kernel compiled workgroup size hint
+    std::string compileVecTypeHint_;  //!< kernel compiled vector type hint
+    bool uniformWorkGroupSize_;       //!< uniform work group size option
+    size_t wavesPerSimdHint_;         //!< waves per simd hint
+  };
+
+  //! Constructs a kernel with the given name; the signature is null and all info is zeroed
+  Kernel(const std::string& name) : name_(name), signature_(nullptr) {
+    // Fields are cleared one by one instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)),
+    // because WorkGroupInfo holds a std::string, which must not be memset to 0
+    workGroupInfo_.size_ = 0;
+    workGroupInfo_.compileSize_[0] = 0;
+    workGroupInfo_.compileSize_[1] = 0;
+    workGroupInfo_.compileSize_[2] = 0;
+    workGroupInfo_.localMemSize_ = 0;
+    workGroupInfo_.preferredSizeMultiple_ = 0;
+    workGroupInfo_.privateMemSize_ = 0;
+    workGroupInfo_.scratchRegs_ = 0;
+    workGroupInfo_.wavefrontPerSIMD_ = 0;
+    workGroupInfo_.wavefrontSize_ = 0;
+    workGroupInfo_.availableGPRs_ = 0;
+    workGroupInfo_.usedGPRs_ = 0;
+    workGroupInfo_.availableSGPRs_ = 0;
+    workGroupInfo_.usedSGPRs_ = 0;
+    workGroupInfo_.availableVGPRs_ = 0;
+    workGroupInfo_.usedVGPRs_ = 0;
+    workGroupInfo_.availableLDSSize_ = 0;
+    workGroupInfo_.usedLDSSize_ = 0;
+    workGroupInfo_.availableStackSize_ = 0;
+    workGroupInfo_.usedStackSize_ = 0;
+    workGroupInfo_.compileSizeHint_[0] = 0;
+    workGroupInfo_.compileSizeHint_[1] = 0;
+    workGroupInfo_.compileSizeHint_[2] = 0;
+    workGroupInfo_.compileVecTypeHint_ = "";
+    workGroupInfo_.uniformWorkGroupSize_ = false;
+    workGroupInfo_.wavesPerSimdHint_ = 0;
+  }
+
+  //! Default destructor
+  virtual ~Kernel();
+
+  //! Returns the kernel info structure
+  const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; }
+
+  //! Returns the kernel signature; signature_ is null after construction, so it must be set first
+  const amd::KernelSignature& signature() const { return *signature_; }
+
+  //! Returns the kernel name
+  const std::string& name() const { return name_; }
+
+  //! Initializes the kernel parameters for the abstraction layer
+  bool createSignature(
+      const parameters_t& params, uint32_t numParameters,
+      uint32_t version);
+  //! Sets the uniform work group size option
+  void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; }
+  //! Returns the uniform work group size option
+  bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; }
+  //! Sets the kernel compiled workgroup size for the three dimensions
+  void setReqdWorkGroupSize(size_t x, size_t y, size_t z) {
+    workGroupInfo_.compileSize_[0] = x;
+    workGroupInfo_.compileSize_[1] = y;
+    workGroupInfo_.compileSize_[2] = z;
+  }
+  //! Returns the kernel compiled workgroup size for dimension \a dim (0..2, not range checked)
+  size_t getReqdWorkGroupSize(int dim) const { return workGroupInfo_.compileSize_[dim]; }
+  //! Sets the kernel compiled workgroup size hint for the three dimensions
+  void setWorkGroupSizeHint(size_t x, size_t y, size_t z) {
+    workGroupInfo_.compileSizeHint_[0] = x;
+    workGroupInfo_.compileSizeHint_[1] = y;
+    workGroupInfo_.compileSizeHint_[2] = z;
+  }
+  //! Returns the kernel compiled workgroup size hint for dimension \a dim (0..2, not range checked)
+  size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
+
+  //! Get profiling callback object; this base implementation has none and returns null
+  virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
+    return nullptr;
+  }
+  //! Returns the waves per SH value for a virtual device; this base implementation returns 0
+  virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
+    return 0;
+  }
+  //! Sets the kernel compiled vector type hint
+  void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
+  //! Sets the amount of used local memory
+  void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
+  //! Sets the preferred multiple for launch
+  void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; }
+
+  //! Return the build log
+  const std::string& buildLog() const { return buildLog_; }
+  //! Presumably builds the OpenCL mangled symbol for \a name (defined out of line - confirm there)
+  static std::string openclMangledName(const std::string& name);
+  //! Returns the patch table: offset of by-reference data -> offset of its size_t slot in the args
+  const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
+
+  //! Returns TRUE if kernel uses dynamic parallelism
+  bool dynamicParallelism() const { return flags_.dynamicParallelism_ != 0; }
+
+  //! set dynamic parallelism flag
+  void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; }
+
+  //! Returns TRUE if kernel is internal kernel
+  bool isInternalKernel() const { return flags_.internalKernel_ != 0; }
+
+  //! set internal kernel flag
+  void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
+
+  //! Return TRUE if kernel uses images
+  bool imageEnable() const { return flags_.imageEna_ != 0; }
+
+  //! Return TRUE if kernel writes images
+  bool imageWrite() const { return flags_.imageWriteEna_ != 0; }
+
+  //! Returns TRUE if it's a HSA kernel
+  bool hsa() const { return flags_.hsa_ != 0; }
+
+ protected:
+  //! Initializes the abstraction layer kernel parameters
+#if defined(WITH_LIGHTNING_COMPILER)
+  void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
+#endif
+#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
+  void InitParameters(
+      const aclArgData* aclArg,  //!< List of ACL arguments
+      uint32_t argBufferSize
+  );
+#endif
+  std::string name_;                 //!< kernel name
+  WorkGroupInfo workGroupInfo_;      //!< device kernel info structure
+  amd::KernelSignature* signature_;  //!< kernel signature
+  std::string buildLog_;             //!< build log
+
+  union Flags {
+    struct {
+      uint imageEna_ : 1;            //!< Kernel uses images
+      uint imageWriteEna_ : 1;       //!< Kernel uses image writes
+      uint dynamicParallelism_ : 1;  //!< Dynamic parallelism enabled
+      uint internalKernel_ : 1;      //!< True: internal kernel
+      uint hsa_ : 1;                 //!< HSA kernel
+    };
+    uint value_;
+    Flags() : value_(0) {}
+  } flags_;
+
+ private:
+  //! Disable default copy constructor
+  Kernel(const Kernel&) = delete;
+
+  //! Disable operator=
+  Kernel& operator=(const Kernel&) = delete;
+
+  std::unordered_map<size_t, size_t> patchReferences_;  //!< Patch table for references
+};
+
+} // namespace device
\ No newline at end of file
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index d0a2377b61..dd26970033 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -823,7 +823,6 @@ Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& pro
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
// Default wavesPerSimdHint_
workGroupInfo_.wavesPerSimdHint_ = ~0U;
- hsa_ = false;
}
Kernel::~Kernel() {
@@ -3127,7 +3126,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
extraArgumentsNum_(extraArgsNum),
waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) *
dev().hwInfo()->simdPerCU_) {
- hsa_ = true;
+ flags_.hsa_ = true;
}
HSAILKernel::~HSAILKernel() {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index 029ecb1946..4eae7a7ef5 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel {
//! Returns spill reg size per workitem
int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; }
- //! Returns TRUE if kernel uses dynamic parallelism
- bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
-
- //! Returns TRUE if kernel is internal kernel
- bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
-
//! Finds local workgroup size
void findLocalWorkSize(size_t workDim, //!< Work dimension
const amd::NDRange& gblWorkSize, //!< Global work size
@@ -895,17 +889,6 @@ class HSAILKernel : public device::Kernel {
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
- union Flags {
- struct {
- uint imageEna_ : 1; //!< Kernel uses images
- uint imageWriteEna_ : 1; //!< Kernel uses image writes
- uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
- uint internalKernel_ : 1; //!< True: internal kernel
- };
- uint value_;
- Flags() : value_(0) {}
- } flags_;
-
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index 7d8f4b2382..5c8ab4e71d 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -21,654 +21,6 @@
namespace pal {
-#if defined(WITH_LIGHTNING_COMPILER)
-using llvm::AMDGPU::HSAMD::AccessQualifier;
-using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
-using llvm::AMDGPU::HSAMD::ValueKind;
-using llvm::AMDGPU::HSAMD::ValueType;
-
-static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) {
- switch (lcArg.mValueKind) {
- case ValueKind::GlobalBuffer:
- case ValueKind::DynamicSharedPointer:
- case ValueKind::Pipe:
- return amd::KernelParameterDescriptor::MemoryObject;
- case ValueKind::ByValue:
- return amd::KernelParameterDescriptor::ValueObject;
- case ValueKind::Image:
- return amd::KernelParameterDescriptor::ImageObject;
- case ValueKind::Sampler:
- return amd::KernelParameterDescriptor::SamplerObject;
- case ValueKind::HiddenGlobalOffsetX:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
- case ValueKind::HiddenGlobalOffsetY:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
- case ValueKind::HiddenGlobalOffsetZ:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
- case ValueKind::HiddenPrintfBuffer:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
- case ValueKind::HiddenDefaultQueue:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenDefaultQueue;
- case ValueKind::HiddenCompletionAction:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenCompletionAction;
- case ValueKind::HiddenNone:
- default:
- *isHidden = true;
- return amd::KernelParameterDescriptor::HiddenNone;
- }
-}
-#else
-static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
- if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
- *isHidden = true;
- if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
- } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
- } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
- } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
- return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
- } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
- return amd::KernelParameterDescriptor::HiddenDefaultQueue;
- } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
- return amd::KernelParameterDescriptor::HiddenCompletionAction;
- }
- return amd::KernelParameterDescriptor::HiddenNone;
- }
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return amd::KernelParameterDescriptor::MemoryObject;
- case ARG_TYPE_QUEUE:
- return amd::KernelParameterDescriptor::QueueObject;
- case ARG_TYPE_VALUE:
- return (argInfo->arg.value.data == DATATYPE_struct) ?
- amd::KernelParameterDescriptor::ReferenceObject :
- amd::KernelParameterDescriptor::ValueObject;
- case ARG_TYPE_IMAGE:
- return amd::KernelParameterDescriptor::ImageObject;
- case ARG_TYPE_SAMPLER:
- return amd::KernelParameterDescriptor::SamplerObject;
- case ARG_TYPE_ERROR:
- default:
- return amd::KernelParameterDescriptor::HiddenNone;
- }
-}
-#endif
-
-static const clk_value_type_t ClkValueMapType[6][6] = {
- { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
- { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
- { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
- { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
- { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
- { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
-};
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
- uint sizeType;
- uint numElements;
-
- if (lcArg.mValueKind != ValueKind::ByValue) {
- switch (lcArg.mValueKind) {
- case ValueKind::GlobalBuffer:
- case ValueKind::DynamicSharedPointer:
- case ValueKind::Pipe:
- case ValueKind::Image:
- return T_POINTER;
- case ValueKind::Sampler:
- return T_SAMPLER;
- default:
- return T_VOID;
- }
- } else {
- switch (lcArg.mValueType) {
- case ValueType::I8:
- case ValueType::U8:
- sizeType = 0;
- numElements = size;
- break;
- case ValueType::I16:
- case ValueType::U16:
- sizeType = 1;
- numElements = size / 2;
- break;
- case ValueType::I32:
- case ValueType::U32:
- sizeType = 2;
- numElements = size / 4;
- break;
- case ValueType::I64:
- case ValueType::U64:
- sizeType = 3;
- numElements = size / 8;
- break;
- case ValueType::F16:
- sizeType = 4;
- numElements = size / 2;
- break;
- case ValueType::F32:
- sizeType = 4;
- numElements = size / 4;
- break;
- case ValueType::F64:
- sizeType = 5;
- numElements = size / 8;
- break;
- case ValueType::Struct:
- default:
- return T_VOID;
- }
- switch (numElements) {
- case 1:
- return ClkValueMapType[sizeType][0];
- case 2:
- return ClkValueMapType[sizeType][1];
- case 3:
- return ClkValueMapType[sizeType][2];
- case 4:
- return ClkValueMapType[sizeType][3];
- case 8:
- return ClkValueMapType[sizeType][4];
- case 16:
- return ClkValueMapType[sizeType][5];
- default:
- return T_VOID;
- }
- }
- return T_VOID;
-}
-#else
-static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
- uint sizeType;
- uint numElements;
- if (argInfo->type == ARG_TYPE_QUEUE) {
- return T_QUEUE;
- }
- else if (argInfo->type == ARG_TYPE_POINTER || argInfo->type == ARG_TYPE_IMAGE) {
- return T_POINTER;
- }
- else if (argInfo->type == ARG_TYPE_VALUE) {
- switch (argInfo->arg.value.data) {
- case DATATYPE_i8:
- case DATATYPE_u8:
- sizeType = 0;
- numElements = size;
- break;
- case DATATYPE_i16:
- case DATATYPE_u16:
- sizeType = 1;
- numElements = size / 2;
- break;
- case DATATYPE_i32:
- case DATATYPE_u32:
- sizeType = 2;
- numElements = size / 4;
- break;
- case DATATYPE_i64:
- case DATATYPE_u64:
- sizeType = 3;
- numElements = size / 8;
- break;
- case DATATYPE_f16:
- sizeType = 4;
- numElements = size / 2;
- break;
- case DATATYPE_f32:
- sizeType = 4;
- numElements = size / 4;
- break;
- case DATATYPE_f64:
- sizeType = 5;
- numElements = size / 8;
- break;
- case DATATYPE_struct:
- case DATATYPE_opaque:
- case DATATYPE_ERROR:
- default:
- return T_VOID;
- }
-
- switch (numElements) {
- case 1:
- return ClkValueMapType[sizeType][0];
- case 2:
- return ClkValueMapType[sizeType][1];
- case 3:
- return ClkValueMapType[sizeType][2];
- case 4:
- return ClkValueMapType[sizeType][3];
- case 8:
- return ClkValueMapType[sizeType][4];
- case 16:
- return ClkValueMapType[sizeType][5];
- default:
- return T_VOID;
- }
- }
- else if (argInfo->type == ARG_TYPE_SAMPLER) {
- return T_SAMPLER;
- }
- else {
- return T_VOID;
- }
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
-#else
-static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return sizeof(void*);
- case ARG_TYPE_VALUE:
- switch (argInfo->arg.value.data) {
- case DATATYPE_i8:
- case DATATYPE_u8:
- return 1;
- case DATATYPE_u16:
- case DATATYPE_i16:
- case DATATYPE_f16:
- return 2;
- case DATATYPE_u32:
- case DATATYPE_i32:
- case DATATYPE_f32:
- return 4;
- case DATATYPE_i64:
- case DATATYPE_u64:
- case DATATYPE_f64:
- return 8;
- case DATATYPE_struct:
- return 128;
- case DATATYPE_ERROR:
- default:
- return -1;
- }
- case ARG_TYPE_IMAGE:
- return sizeof(cl_mem);
- case ARG_TYPE_SAMPLER:
- return sizeof(cl_sampler);
- default:
- return -1;
- }
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- uint32_t align = lcArg.mPointeeAlign;
- if (align == 0) {
- LogWarning("Missing DynamicSharedPointer alignment");
- align = 128; /* worst case alignment */
- }
- return align;
- }
- return 1;
-}
-#else
-static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_POINTER) {
- return argInfo->arg.pointer.align;
- }
- return 1;
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
- if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
- switch (lcArg.mAccQual) {
- case AccessQualifier::ReadOnly:
- return true;
- case AccessQualifier::WriteOnly:
- case AccessQualifier::ReadWrite:
- default:
- return false;
- }
- }
- return false;
-}
-#else
-static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_POINTER) {
- return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false;
- } else if (argInfo->type == ARG_TYPE_IMAGE) {
- return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false;
- }
- return false;
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline int GetArgSizeOCL(const KernelArgMD& lcArg) { return lcArg.mSize; }
-#else
-inline static int GetArgSizeOCL(const aclArgData* argInfo) {
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return sizeof(void*);
- case ARG_TYPE_VALUE:
- switch (argInfo->arg.value.data) {
- case DATATYPE_i8:
- case DATATYPE_u8:
- case DATATYPE_struct:
- return 1 * argInfo->arg.value.numElements;
- case DATATYPE_u16:
- case DATATYPE_i16:
- case DATATYPE_f16:
- return 2 * argInfo->arg.value.numElements;
- case DATATYPE_u32:
- case DATATYPE_i32:
- case DATATYPE_f32:
- return 4 * argInfo->arg.value.numElements;
- case DATATYPE_i64:
- case DATATYPE_u64:
- case DATATYPE_f64:
- return 8 * argInfo->arg.value.numElements;
- case DATATYPE_ERROR:
- default:
- return -1;
- }
- case ARG_TYPE_IMAGE:
- case ARG_TYPE_SAMPLER:
- case ARG_TYPE_QUEUE:
- return sizeof(void*);
- default:
- return -1;
- }
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- return CL_KERNEL_ARG_ADDRESS_LOCAL;
- }
- else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
- if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) {
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- }
- else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
- return CL_KERNEL_ARG_ADDRESS_CONSTANT;
- }
- LogError("Unsupported address type");
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
- }
- else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Pipe) {
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- }
- // default for all other cases
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
-}
-#else
-static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_POINTER) {
- switch (argInfo->arg.pointer.memory) {
- case PTR_MT_UAV_CONSTANT:
- case PTR_MT_CONSTANT_EMU:
- case PTR_MT_CONSTANT:
- return CL_KERNEL_ARG_ADDRESS_CONSTANT;
- case PTR_MT_UAV:
- case PTR_MT_GLOBAL:
- case PTR_MT_SCRATCH_EMU:
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- case PTR_MT_LDS_EMU:
- case PTR_MT_LDS:
- return CL_KERNEL_ARG_ADDRESS_LOCAL;
- case PTR_MT_ERROR:
- default:
- LogError("Unsupported address type");
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
- }
- } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) {
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- }
-
- // default for all other cases
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::Image) {
- switch (lcArg.mAccQual) {
- case AccessQualifier::ReadOnly:
- return CL_KERNEL_ARG_ACCESS_READ_ONLY;
- case AccessQualifier::WriteOnly:
- return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
- case AccessQualifier::ReadWrite:
- default:
- return CL_KERNEL_ARG_ACCESS_READ_WRITE;
- }
- }
- return CL_KERNEL_ARG_ACCESS_NONE;
-}
-#else
-static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_IMAGE) {
- switch (argInfo->arg.image.type) {
- case ACCESS_TYPE_RO:
- return CL_KERNEL_ARG_ACCESS_READ_ONLY;
- case ACCESS_TYPE_WO:
- return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
- default:
- return CL_KERNEL_ARG_ACCESS_READ_WRITE;
- }
- }
- return CL_KERNEL_ARG_ACCESS_NONE;
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
- cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
- if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
- lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- if (lcArg.mIsVolatile) {
- rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
- }
- if (lcArg.mIsRestrict) {
- rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
- }
- if (lcArg.mIsConst) {
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- }
- }
- else if (lcArg.mIsPipe) {
- assert(lcArg.mValueKind == ValueKind::Pipe);
- rv |= CL_KERNEL_ARG_TYPE_PIPE;
- }
- return rv;
-}
-#else
-static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
- cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
- if (argInfo->type == ARG_TYPE_POINTER) {
- if (argInfo->arg.pointer.isVolatile) {
- rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
- }
- if (argInfo->arg.pointer.isRestrict) {
- rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
- }
- if (argInfo->arg.pointer.isPipe) {
- rv |= CL_KERNEL_ARG_TYPE_PIPE;
- }
- if (argInfo->isConst) {
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- }
- switch (argInfo->arg.pointer.memory) {
- case PTR_MT_CONSTANT:
- case PTR_MT_UAV_CONSTANT:
- case PTR_MT_CONSTANT_EMU:
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- break;
- default:
- break;
- }
- }
- return rv;
-}
-#endif
-
-#if defined(WITH_LIGHTNING_COMPILER)
-void LightningKernel::initArgList(const KernelMD& kernelMD) {
- // Iterate through the arguments and insert into parameterList
- device::Kernel::parameters_t params;
- device::Kernel::parameters_t hiddenParams;
- amd::KernelParameterDescriptor desc;
- size_t offset = 0;
- size_t offsetStruct = argsBufferSize();
-
- for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
- const KernelArgMD& lcArg = kernelMD.mArgs[i];
-
- size_t size = GetArgSizeOCL(lcArg);
- size_t alignment = GetArgAlignmentOCL(lcArg);
- bool isHidden = false;
- desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
-
- // Allocate the hidden arguments, but abstraction layer will skip them
- if (isHidden) {
- offset = amd::alignUp(offset, alignment);
- desc.offset_ = offset;
- desc.size_ = size;
- offset += size;
- hiddenParams.push_back(desc);
- continue;
- }
-
- desc.name_ = lcArg.mName.c_str();
- desc.type_ = GetOclTypeOCL(lcArg, size);
- desc.typeName_ = lcArg.mTypeName.c_str();
-
- desc.addressQualifier_ = GetOclAddrQualOCL(lcArg);
- desc.accessQualifier_ = GetOclAccessQualOCL(lcArg);
- desc.typeQualifier_ = GetOclTypeQualOCL(lcArg);
- desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg);
- desc.size_ = size;
-
- // These objects have forced data size to uint64_t
- if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
- offset = amd::alignUp(offset, sizeof(uint64_t));
- desc.offset_ = offset;
- offset += sizeof(uint64_t);
- }
- else {
- offset = amd::alignUp(offset, alignment);
- desc.offset_ = offset;
- offset += size;
- }
-
- // Update read only flag
- desc.info_.readOnly_ = GetReadOnlyOCL(lcArg);
-
- params.push_back(desc);
-
- if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
- flags_.imageEna_ = true;
- if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
- flags_.imageWriteEna_ = true;
- }
- }
- }
-
- // Save the number of OCL arguments
- uint32_t numParams = params.size();
- // Append the hidden arguments to the OCL arguments
- params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
- createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
-}
-#else
-void HSAILKernel::initArgList(const aclArgData* aclArg) {
- // Iterate through the arguments and insert into parameterList
- device::Kernel::parameters_t params;
- device::Kernel::parameters_t hiddenParams;
- amd::KernelParameterDescriptor desc;
- size_t offset = 0;
- size_t offsetStruct = argsBufferSize();
-
- for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
- size_t size = GetArgSizeOCL(aclArg);
- size_t alignment = GetArgAlignmentOCL(aclArg);
- bool isHidden = false;
- desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden);
-
- // Allocate the hidden arguments, but abstraction layer will skip them
- if (isHidden) {
- offset = amd::alignUp(offset, alignment);
- desc.offset_ = offset;
- desc.size_ = size;
- offset += size;
- hiddenParams.push_back(desc);
- continue;
- }
-
- desc.name_ = aclArg->argStr;
- desc.typeName_ = aclArg->typeStr;
- desc.type_ = GetOclTypeOCL(aclArg, size);
-
- desc.addressQualifier_ = GetOclAddrQualOCL(aclArg);
- desc.accessQualifier_ = GetOclAccessQualOCL(aclArg);
- desc.typeQualifier_ = GetOclTypeQualOCL(aclArg);
- desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg);
- desc.size_ = size;
-
- // Check if HSAIL expects data by reference and allocate it behind
- if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
- desc.offset_ = offsetStruct;
- // Align the offset reference
- offset = amd::alignUp(offset, sizeof(size_t));
- patchReferences_.insert({ desc.offset_, offset });
- offsetStruct += size;
- // Adjust the offset of arguments
- offset += sizeof(size_t);
- }
- else {
- // These objects have forced data size to uint64_t
- if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
- offset = amd::alignUp(offset, sizeof(uint64_t));
- desc.offset_ = offset;
- offset += sizeof(uint64_t);
- }
- else {
- offset = amd::alignUp(offset, alignment);
- desc.offset_ = offset;
- offset += size;
- }
- }
- // Update read only flag
- desc.info_.readOnly_ = GetReadOnlyOCL(aclArg);
-
- params.push_back(desc);
-
- if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
- flags_.imageEna_ = true;
- if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
- flags_.imageWriteEna_ = true;
- }
- }
- }
- // Save the number of OCL arguments
- uint32_t numParams = params.size();
- // Append the hidden arguments to the OCL arguments
- params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
- createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
-}
-#endif
-
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
if (!sym) {
return false;
@@ -796,7 +148,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
(prog->isNull() ? 1
: dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
dev().hwInfo()->simdPerCU_) {
- hsa_ = true;
+ flags_.hsa_ = true;
}
HSAILKernel::~HSAILKernel() {
@@ -849,7 +201,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
return false;
}
// Set the argList
- initArgList(reinterpret_cast(aclArgList));
+ InitParameters(reinterpret_cast<const aclArgData*>(aclArgList), argsBufferSize());
delete[] aclArgList;
size_t sizeOfWorkGroupSize;
@@ -1324,7 +676,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
}
// Set the argList
- initArgList(*kernelMD);
+ InitParameters(*kernelMD, argsBufferSize());
if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index a22ae187bb..d2c855fe48 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel {
//! Returns spill reg size per workitem
int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); }
- //! Returns TRUE if kernel uses dynamic parallelism
- bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
-
- //! Returns TRUE if kernel is internal kernel
- bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
-
//! Finds local workgroup size
void findLocalWorkSize(size_t workDim, //!< Work dimension
const amd::NDRange& gblWorkSize, //!< Global work size
@@ -124,8 +118,6 @@ class HSAILKernel : public device::Kernel {
return waveLimiter_.getWavesPerSH(vdev);
};
- const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
-
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -137,10 +129,6 @@ class HSAILKernel : public device::Kernel {
//! Creates AQL kernel HW info
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
- //! Initializes the abstraction layer kernel parameters
- void initArgList(const aclArgData* aclArg //!< List of ACL arguments
- );
-
//! Initializes Hsail Printf metadata and info
void initPrintf(const aclPrintfFmt* aclPrintf //!< List of ACL printfs
);
@@ -151,22 +139,10 @@ class HSAILKernel : public device::Kernel {
const HSAILProgram& prog_; //!< Reference to the parent program
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
uint index_; //!< Kernel index in the program
- std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
- union Flags {
- struct {
- uint imageEna_ : 1; //!< Kernel uses images
- uint imageWriteEna_ : 1; //!< Kernel uses image writes
- uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
- uint internalKernel_ : 1; //!< True: internal kernel
- };
- uint value_;
- Flags() : value_(0) {}
- } flags_;
-
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
@@ -182,9 +158,6 @@ class LightningKernel : public HSAILKernel {
//! Initializes the metadata required for this kernel,
bool init(amd::hsa::loader::Symbol* symbol);
- //! Initializes Hsail Argument metadata and info for LC
- void initArgList(const KernelMD& kernelMD);
-
//! Initializes HSAIL Printf metadata and info for LC
void initPrintf(const std::vector<std::string>& printfInfoStrings);
};
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index 47268bc612..587ff9e8b1 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -11,749 +11,6 @@
namespace roc {
-#if defined(WITH_LIGHTNING_COMPILER)
-
-using llvm::AMDGPU::HSAMD::AccessQualifier;
-using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
-using llvm::AMDGPU::HSAMD::ValueKind;
-using llvm::AMDGPU::HSAMD::ValueType;
-
-static inline ROC_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) {
- switch (lcArg.mValueKind) {
- case ValueKind::GlobalBuffer:
- case ValueKind::DynamicSharedPointer:
- case ValueKind::Pipe:
- return ROC_ARGTYPE_POINTER;
- case ValueKind::ByValue:
- return ROC_ARGTYPE_VALUE;
- case ValueKind::Image:
- return ROC_ARGTYPE_IMAGE;
- case ValueKind::Sampler:
- return ROC_ARGTYPE_SAMPLER;
- case ValueKind::Queue:
- return ROC_ARGTYPE_QUEUE;
- case ValueKind::HiddenGlobalOffsetX:
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
- case ValueKind::HiddenGlobalOffsetY:
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
- case ValueKind::HiddenGlobalOffsetZ:
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
- case ValueKind::HiddenPrintfBuffer:
- return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER;
- case ValueKind::HiddenDefaultQueue:
- return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
- case ValueKind::HiddenCompletionAction:
- return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION;
- case ValueKind::HiddenNone:
- return ROC_ARGTYPE_HIDDEN_NONE;
- default:
- return ROC_ARGTYPE_ERROR;
- }
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline ROC_ARG_TYPE GetKernelArgType(const aclArgData* argInfo) {
- if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
- if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
- } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
- } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
- return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
- } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
- return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER;
- } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
- return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
- } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
- return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION;
- }
- return ROC_ARGTYPE_HIDDEN_NONE;
- }
-
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return ROC_ARGTYPE_POINTER;
- case ARG_TYPE_VALUE:
- return (argInfo->arg.value.data == DATATYPE_struct) ? ROC_ARGTYPE_REFERENCE
- : ROC_ARGTYPE_VALUE;
- case ARG_TYPE_IMAGE:
- return ROC_ARGTYPE_IMAGE;
- case ARG_TYPE_SAMPLER:
- return ROC_ARGTYPE_SAMPLER;
- case ARG_TYPE_QUEUE:
- return ROC_ARGTYPE_QUEUE;
- case ARG_TYPE_ERROR:
- default:
- return ROC_ARGTYPE_ERROR;
- }
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; }
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline size_t GetKernelArgAlignment(const aclArgData* argInfo) {
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return sizeof(void*);
- case ARG_TYPE_VALUE:
- switch (argInfo->arg.value.data) {
- case DATATYPE_i8:
- case DATATYPE_u8:
- return 1;
- case DATATYPE_u16:
- case DATATYPE_i16:
- case DATATYPE_f16:
- return 2;
- case DATATYPE_u32:
- case DATATYPE_i32:
- case DATATYPE_f32:
- return 4;
- case DATATYPE_i64:
- case DATATYPE_u64:
- case DATATYPE_f64:
- return 8;
- case DATATYPE_struct:
- return 128;
- case DATATYPE_ERROR:
- default:
- return -1;
- }
- case ARG_TYPE_IMAGE:
- return sizeof(cl_mem);
- case ARG_TYPE_SAMPLER:
- return sizeof(cl_sampler);
- default:
- return -1;
- }
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- uint32_t align = lcArg.mPointeeAlign;
- if (align == 0) {
- LogWarning("Missing DynamicSharedPointer alignment");
- align = 128; /* worst case alignment */
- ;
- }
- return align;
- }
- return 1;
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline size_t GetKernelArgPointeeAlignment(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_POINTER) {
- return argInfo->arg.pointer.align;
- }
- return 1;
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) {
- switch (lcArg.mAccQual) {
- case AccessQualifier::ReadOnly:
- return ROC_ACCESS_TYPE_RO;
- case AccessQualifier::WriteOnly:
- return ROC_ACCESS_TYPE_WO;
- case AccessQualifier::ReadWrite:
- default:
- return ROC_ACCESS_TYPE_RW;
- }
- }
- return ROC_ACCESS_TYPE_NONE;
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const aclArgData* argInfo) {
- aclAccessType accessType;
-
- if (argInfo->type == ARG_TYPE_POINTER) {
- accessType = argInfo->arg.pointer.type;
- } else if (argInfo->type == ARG_TYPE_IMAGE) {
- accessType = argInfo->arg.image.type;
- } else {
- return ROC_ACCESS_TYPE_NONE;
- }
- if (accessType == ACCESS_TYPE_RO) {
- return ROC_ACCESS_TYPE_RO;
- } else if (accessType == ACCESS_TYPE_WO) {
- return ROC_ACCESS_TYPE_WO;
- }
-
- return ROC_ACCESS_TYPE_RW;
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) {
- if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- return ROC_ADDRESS_LOCAL;
- } else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
- if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global || lcArg.mAddrSpaceQual == AddressSpaceQualifier::Generic) {
- return ROC_ADDRESS_GLOBAL;
- } else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
- return ROC_ADDRESS_CONSTANT;
- }
- LogError("Unsupported address type");
- return ROC_ADDRESS_ERROR;
- } else if (lcArg.mValueKind == ValueKind::Image ||
- lcArg.mValueKind == ValueKind::Sampler ||
- lcArg.mValueKind == ValueKind::Pipe) {
- return ROC_ADDRESS_GLOBAL;
- }
- return ROC_ADDRESS_ERROR;
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo) {
- if (argInfo->type == ARG_TYPE_POINTER) {
- switch (argInfo->arg.pointer.memory) {
- case PTR_MT_CONSTANT_EMU:
- case PTR_MT_UAV_CONSTANT:
- case PTR_MT_CONSTANT:
- return ROC_ADDRESS_CONSTANT;
- case PTR_MT_UAV:
- case PTR_MT_GLOBAL:
- return ROC_ADDRESS_GLOBAL;
- case PTR_MT_LDS_EMU:
- case PTR_MT_LDS:
- return ROC_ADDRESS_LOCAL;
- case PTR_MT_ERROR:
- default:
- LogError("Unsupported address type");
- return ROC_ADDRESS_ERROR;
- }
- } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
- return ROC_ADDRESS_GLOBAL;
- }
- return ROC_ADDRESS_ERROR;
-}
-
-inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
- switch (arg->type_){
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
- return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
- case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
- return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
- case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
- return amd::KernelParameterDescriptor::HiddenDefaultQueue;
- case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
- return amd::KernelParameterDescriptor::HiddenCompletionAction;
- case ROC_ARGTYPE_POINTER:
- return amd::KernelParameterDescriptor::MemoryObject;
- case ROC_ARGTYPE_IMAGE:
- return amd::KernelParameterDescriptor::ImageObject;
- case ROC_ARGTYPE_REFERENCE:
- return amd::KernelParameterDescriptor::ReferenceObject;
- case ROC_ARGTYPE_VALUE:
- return amd::KernelParameterDescriptor::ValueObject;
- case ROC_ARGTYPE_SAMPLER:
- return amd::KernelParameterDescriptor::SamplerObject;
- case ROC_ARGTYPE_QUEUE:
- return amd::KernelParameterDescriptor::QueueObject;
- default:
- return amd::KernelParameterDescriptor::HiddenNone;
- }
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
- aclArgDataType dataType;
-
- if (lcArg.mValueKind != ValueKind::ByValue) {
- return ROC_DATATYPE_ERROR;
- }
-
- switch (lcArg.mValueType) {
- case ValueType::I8:
- return ROC_DATATYPE_S8;
- case ValueType::I16:
- return ROC_DATATYPE_S16;
- case ValueType::I32:
- return ROC_DATATYPE_S32;
- case ValueType::I64:
- return ROC_DATATYPE_S64;
- case ValueType::U8:
- return ROC_DATATYPE_U8;
- case ValueType::U16:
- return ROC_DATATYPE_U16;
- case ValueType::U32:
- return ROC_DATATYPE_U32;
- case ValueType::U64:
- return ROC_DATATYPE_U64;
- case ValueType::F16:
- return ROC_DATATYPE_F16;
- case ValueType::F32:
- return ROC_DATATYPE_F32;
- case ValueType::F64:
- return ROC_DATATYPE_F64;
- case ValueType::Struct:
- return ROC_DATATYPE_STRUCT;
- default:
- return ROC_DATATYPE_ERROR;
- }
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-/* f16 returns f32 - workaround due to comp lib */
-static inline ROC_DATA_TYPE GetKernelDataType(const aclArgData* argInfo) {
- aclArgDataType dataType;
-
- if (argInfo->type == ARG_TYPE_POINTER) {
- dataType = argInfo->arg.pointer.data;
- } else if (argInfo->type == ARG_TYPE_VALUE) {
- dataType = argInfo->arg.value.data;
- } else {
- return ROC_DATATYPE_ERROR;
- }
- switch (dataType) {
- case DATATYPE_i1:
- return ROC_DATATYPE_B1;
- case DATATYPE_i8:
- return ROC_DATATYPE_S8;
- case DATATYPE_i16:
- return ROC_DATATYPE_S16;
- case DATATYPE_i32:
- return ROC_DATATYPE_S32;
- case DATATYPE_i64:
- return ROC_DATATYPE_S64;
- case DATATYPE_u8:
- return ROC_DATATYPE_U8;
- case DATATYPE_u16:
- return ROC_DATATYPE_U16;
- case DATATYPE_u32:
- return ROC_DATATYPE_U32;
- case DATATYPE_u64:
- return ROC_DATATYPE_U64;
- case DATATYPE_f16:
- return ROC_DATATYPE_F32;
- case DATATYPE_f32:
- return ROC_DATATYPE_F32;
- case DATATYPE_f64:
- return ROC_DATATYPE_F64;
- case DATATYPE_struct:
- return ROC_DATATYPE_STRUCT;
- case DATATYPE_opaque:
- return ROC_DATATYPE_OPAQUE;
- case DATATYPE_ERROR:
- default:
- return ROC_DATATYPE_ERROR;
- }
-}
-
-static inline int GetKernelArgSize(const aclArgData* argInfo) {
- switch (argInfo->type) {
- case ARG_TYPE_POINTER:
- return sizeof(void*);
- case ARG_TYPE_VALUE:
- switch (argInfo->arg.value.data) {
- case DATATYPE_i8:
- case DATATYPE_u8:
- case DATATYPE_struct:
- return 1 * argInfo->arg.value.numElements;
- case DATATYPE_u16:
- case DATATYPE_i16:
- case DATATYPE_f16:
- return 2 * argInfo->arg.value.numElements;
- case DATATYPE_u32:
- case DATATYPE_i32:
- case DATATYPE_f32:
- return 4 * argInfo->arg.value.numElements;
- case DATATYPE_i64:
- case DATATYPE_u64:
- case DATATYPE_f64:
- return 8 * argInfo->arg.value.numElements;
- case DATATYPE_ERROR:
- default:
- return -1;
- }
- case ARG_TYPE_IMAGE:
- return sizeof(cl_mem);
- case ARG_TYPE_SAMPLER:
- return sizeof(cl_sampler);
- default:
- return -1;
- }
-}
-
-static inline clk_value_type_t GetOclType(const Kernel::Argument* arg) {
- static const clk_value_type_t ClkValueMapType[6][6] = {
- {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
- {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
- {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
- {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
- {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
- {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
- };
-
- uint sizeType;
- uint numElements;
- if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) {
- return T_POINTER;
- } else if (arg->type_ == ROC_ARGTYPE_VALUE || arg->type_ == ROC_ARGTYPE_REFERENCE) {
- switch (arg->dataType_) {
- case ROC_DATATYPE_S8:
- case ROC_DATATYPE_U8:
- sizeType = 0;
- numElements = arg->size_;
- break;
- case ROC_DATATYPE_S16:
- case ROC_DATATYPE_U16:
- sizeType = 1;
- numElements = arg->size_ / 2;
- break;
- case ROC_DATATYPE_S32:
- case ROC_DATATYPE_U32:
- sizeType = 2;
- numElements = arg->size_ / 4;
- break;
- case ROC_DATATYPE_S64:
- case ROC_DATATYPE_U64:
- sizeType = 3;
- numElements = arg->size_ / 8;
- break;
- case ROC_DATATYPE_F16:
- sizeType = 4;
- numElements = arg->size_ / 2;
- break;
- case ROC_DATATYPE_F32:
- sizeType = 4;
- numElements = arg->size_ / 4;
- break;
- case ROC_DATATYPE_F64:
- sizeType = 5;
- numElements = arg->size_ / 8;
- break;
- default:
- return T_VOID;
- }
-
- switch (numElements) {
- case 1:
- return ClkValueMapType[sizeType][0];
- case 2:
- return ClkValueMapType[sizeType][1];
- case 3:
- return ClkValueMapType[sizeType][2];
- case 4:
- return ClkValueMapType[sizeType][3];
- case 8:
- return ClkValueMapType[sizeType][4];
- case 16:
- return ClkValueMapType[sizeType][5];
- default:
- return T_VOID;
- }
- } else if (arg->type_ == ROC_ARGTYPE_SAMPLER) {
- return T_SAMPLER;
- } else if (arg->type_ == ROC_ARGTYPE_QUEUE) {
- return T_QUEUE;
- } else {
- return T_VOID;
- }
-}
-
-static inline cl_kernel_arg_address_qualifier GetOclAddrQual(const Kernel::Argument* arg) {
- if (arg->type_ == ROC_ARGTYPE_POINTER) {
- switch (arg->addrQual_) {
- case ROC_ADDRESS_GLOBAL:
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- case ROC_ADDRESS_CONSTANT:
- return CL_KERNEL_ARG_ADDRESS_CONSTANT;
- case ROC_ADDRESS_LOCAL:
- return CL_KERNEL_ARG_ADDRESS_LOCAL;
- default:
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
- }
- } else if (arg->type_ == ROC_ARGTYPE_IMAGE) {
- return CL_KERNEL_ARG_ADDRESS_GLOBAL;
- }
- // default for all other cases
- return CL_KERNEL_ARG_ADDRESS_PRIVATE;
-}
-
-static inline cl_kernel_arg_access_qualifier GetOclAccessQual(const Kernel::Argument* arg) {
- if (arg->type_ == ROC_ARGTYPE_IMAGE) {
- switch (arg->access_) {
- case ROC_ACCESS_TYPE_RO:
- return CL_KERNEL_ARG_ACCESS_READ_ONLY;
- case ROC_ACCESS_TYPE_WO:
- return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
- case ROC_ACCESS_TYPE_RW:
- return CL_KERNEL_ARG_ACCESS_READ_WRITE;
- default:
- return CL_KERNEL_ARG_ACCESS_NONE;
- }
- }
- return CL_KERNEL_ARG_ACCESS_NONE;
-}
-
-#if defined(WITH_LIGHTNING_COMPILER)
-static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) {
- cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
- if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
- lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
- if (lcArg.mIsVolatile) {
- rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
- }
- if (lcArg.mIsRestrict) {
- rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
- }
- if (lcArg.mIsConst) {
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- }
- }
- else if (lcArg.mIsPipe) {
- assert(lcArg.mValueKind == ValueKind::Pipe);
- rv |= CL_KERNEL_ARG_TYPE_PIPE;
- }
- return rv;
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
-static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
- cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
- if (argInfo->type == ARG_TYPE_POINTER) {
- if (argInfo->arg.pointer.isVolatile) {
- rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
- }
- if (argInfo->arg.pointer.isRestrict) {
- rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
- }
- if (argInfo->isConst) {
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- }
- switch (argInfo->arg.pointer.memory) {
- case PTR_MT_CONSTANT:
- case PTR_MT_UAV_CONSTANT:
- case PTR_MT_CONSTANT_EMU:
- rv |= CL_KERNEL_ARG_TYPE_CONST;
- break;
- default:
- break;
- }
- }
- return rv;
-}
-
-#if defined(WITH_COMPILER_LIB)
-void HSAILKernel::initArguments(const aclArgData* aclArg) {
- device::Kernel::parameters_t params;
- device::Kernel::parameters_t hiddenParams;
- size_t offsetStruct = KernargSegmentByteSize();
-
- // Iterate through the arguments and insert into parameterList
- for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
- // Initialize HSAIL kernel argument
- Kernel::Argument* arg = new Kernel::Argument;
- arg->name_ = aclArg->argStr;
- arg->typeName_ = aclArg->typeStr;
- arg->size_ = GetKernelArgSize(aclArg);
- arg->type_ = GetKernelArgType(aclArg);
- arg->addrQual_ = GetKernelAddrQual(aclArg);
- arg->dataType_ = GetKernelDataType(aclArg);
- arg->alignment_ = GetKernelArgAlignment(aclArg);
- arg->access_ = GetKernelArgAccessType(aclArg);
- arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(aclArg);
-
- bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE;
-
- arg->index_ = isHidden ? uint(-1) : params.size();
- hsailArgList_.push_back(arg);
-
- amd::KernelParameterDescriptor desc;
-
- // Allocate the hidden arguments, but abstraction layer will skip them
- if (isHidden) {
- offset = amd::alignUp(offset, arg->alignment_);
- desc.offset_ = offset;
- desc.size_ = arg->size_;
- offset += arg->size_;
- desc.info_.oclObject_ = GetOclArgumentType(arg);
- hiddenParams.push_back(desc);
- continue;
- }
-
- desc.name_ = arg->name_.c_str();
- desc.type_ = GetOclType(arg);
- desc.addressQualifier_ = GetOclAddrQual(arg);
- desc.accessQualifier_ = GetOclAccessQual(arg);
- desc.typeQualifier_ = GetOclTypeQual(aclArg);
- desc.typeName_ = arg->typeName_.c_str();
- desc.info_.oclObject_ = GetOclArgumentType(arg);
- desc.info_.arrayIndex_ = arg->pointeeAlignment_;
-
- // set image related flags
- if (arg->type_ == ROC_ARGTYPE_IMAGE) {
- flags_.imageEnable_ = true;
- if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
- desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
- flags_.imageWrite_ = true;
- }
- }
- desc.size_ = arg->size_;
-
- // Make offset alignment to match CPU metadata, since
- // in multidevice config abstraction layer has a single signature
- // and CPU sends the parameters as they are allocated in memory
- size_t size = desc.size_;
-
- // Check if HSAIL expects data by reference and allocate it behind
- if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
- desc.offset_ = offsetStruct;
- // Align the offset reference
- offset = amd::alignUp(offset, sizeof(size_t));
- patchReferences_.insert({desc.offset_, offset});
- offsetStruct += size;
- // Adjust the offset of arguments
- offset += sizeof(size_t);
- }
- else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
- // These objects have forced data size to uint64_t
- offset = amd::alignUp(offset, sizeof(uint64_t));
- desc.offset_ = offset;
- offset += sizeof(uint64_t);
- } else {
- offset = amd::alignUp(offset, arg->alignment_);
- desc.offset_ = offset;
- offset += size;
- }
-
- // Update read only flag
- desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
-
- params.push_back(desc);
- }
-
- // Save the number of OCL arguments
- uint32_t numParams = params.size();
- // Append the hidden arguments to the OCL arguments
- params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
- createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
-}
-#endif // defined(WITH_COMPILER_LIB)
-
-#if defined(WITH_LIGHTNING_COMPILER)
-void LightningKernel::initArguments(const KernelMD& kernelMD) {
- device::Kernel::parameters_t params;
- device::Kernel::parameters_t hiddenParams;
- size_t offsetStruct = KernargSegmentByteSize();
-
- size_t offset = 0;
-
- for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
- const KernelArgMD& lcArg = kernelMD.mArgs[i];
-
- // Initialize HSAIL kernel argument
- Kernel::Argument* arg = new Kernel::Argument;
- arg->name_ = lcArg.mName;
- arg->typeName_ = lcArg.mTypeName;
- arg->size_ = lcArg.mSize;
- arg->type_ = GetKernelArgType(lcArg);
- arg->addrQual_ = GetKernelAddrQual(lcArg);
- arg->dataType_ = GetKernelDataType(lcArg);
- arg->alignment_ = GetKernelArgAlignment(lcArg);
- arg->access_ = GetKernelArgAccessType(lcArg);
- arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
-
- bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
- arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE;
-
- arg->index_ = isHidden ? uint(-1) : params.size();
- hsailArgList_.push_back(arg);
-
- // Initialize Device kernel parameters
- amd::KernelParameterDescriptor desc;
-
- if (isHidden) {
- offset = amd::alignUp(offset, arg->alignment_);
- desc.offset_ = offset;
- desc.size_ = arg->size_;
- offset += arg->size_;
- desc.info_.oclObject_ = GetOclArgumentType(arg);
- hiddenParams.push_back(desc);
- continue;
- }
-
- desc.name_ = lcArg.mName.c_str();
- desc.type_ = GetOclType(arg);
- desc.addressQualifier_ = GetOclAddrQual(arg);
- desc.accessQualifier_ = GetOclAccessQual(arg);
- desc.typeQualifier_ = GetOclTypeQual(lcArg);
- desc.typeName_ = lcArg.mTypeName.c_str();
- desc.info_.oclObject_ = GetOclArgumentType(arg);
- desc.info_.arrayIndex_ = arg->pointeeAlignment_;
-
- // set image related flags
- if (arg->type_ == ROC_ARGTYPE_IMAGE) {
- flags_.imageEnable_ = true;
- if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
- desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
- flags_.imageWrite_ = true;
- }
- }
-
- desc.size_ = arg->size_;
-
- // Make offset alignment to match CPU metadata, since
- // in multidevice config abstraction layer has a single signature
- // and CPU sends the parameters as they are allocated in memory
- size_t size = desc.size_;
-
- // Check if HSAIL expects data by reference and allocate it behind
- if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
- desc.offset_ = offsetStruct;
- // Align the offset reference
- offset = amd::alignUp(offset, sizeof(size_t));
- patchReferences_.insert({desc.offset_, offset});
- offsetStruct += size;
- // Adjust the offset of arguments
- offset += sizeof(size_t);
- }
- else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
- (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
- // These objects have forced data size to uint64_t
- offset = amd::alignUp(offset, sizeof(uint64_t));
- desc.offset_ = offset;
- offset += sizeof(uint64_t);
- } else {
- offset = amd::alignUp(offset, arg->alignment_);
- desc.offset_ = offset;
- offset += size;
- }
-
- // Update read only flag
- desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
-
- params.push_back(desc);
- }
-
- // Save the number of OCL arguments
- uint32_t numParams = params.size();
- // Append the hidden arguments to the OCL arguments
- params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
- createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
-}
-#endif // defined(WITH_LIGHTNING_COMPILER)
-
Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
@@ -787,7 +44,7 @@ bool LightningKernel::init() {
if (kernelMD == nullptr) {
return false;
}
- initArguments(*kernelMD);
+ InitParameters(*kernelMD, KernargSegmentByteSize());
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
@@ -907,7 +164,7 @@ bool HSAILKernel::init() {
}
// Set the argList
- initArguments((const aclArgData*)argList.get());
+ InitParameters((const aclArgData*)argList.get(), KernargSegmentByteSize());
// Set the workgroup information for the kernel
memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
@@ -1151,11 +408,6 @@ void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) {
#endif // defined(WITH_COMPILER_LIB)
Kernel::~Kernel() {
- while (!hsailArgList_.empty()) {
- Argument* kernelArgPointer = hsailArgList_.back();
- delete kernelArgPointer;
- hsailArgList_.pop_back();
- }
}
} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
index 0c1c0f7e18..72b2b962d7 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
@@ -15,77 +15,8 @@ namespace roc {
#define MAX_INFO_STRING_LEN 0x40
-enum ROC_ARG_TYPE {
- ROC_ARGTYPE_ERROR = 0,
- ROC_ARGTYPE_POINTER,
- ROC_ARGTYPE_VALUE,
- ROC_ARGTYPE_REFERENCE,
- ROC_ARGTYPE_IMAGE,
- ROC_ARGTYPE_SAMPLER,
- ROC_ARGTYPE_QUEUE,
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X,
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y,
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z,
- ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER,
- ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE,
- ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION,
- ROC_ARGTYPE_HIDDEN_NONE,
- ROC_ARGMAX_ARG_TYPES
-};
-
-enum ROC_ADDRESS_QUALIFIER {
- ROC_ADDRESS_ERROR = 0,
- ROC_ADDRESS_GLOBAL,
- ROC_ADDRESS_CONSTANT,
- ROC_ADDRESS_LOCAL,
- ROC_MAX_ADDRESS_QUALIFIERS
-};
-
-enum ROC_DATA_TYPE {
- ROC_DATATYPE_ERROR = 0,
- ROC_DATATYPE_B1,
- ROC_DATATYPE_B8,
- ROC_DATATYPE_B16,
- ROC_DATATYPE_B32,
- ROC_DATATYPE_B64,
- ROC_DATATYPE_S8,
- ROC_DATATYPE_S16,
- ROC_DATATYPE_S32,
- ROC_DATATYPE_S64,
- ROC_DATATYPE_U8,
- ROC_DATATYPE_U16,
- ROC_DATATYPE_U32,
- ROC_DATATYPE_U64,
- ROC_DATATYPE_F16,
- ROC_DATATYPE_F32,
- ROC_DATATYPE_F64,
- ROC_DATATYPE_STRUCT,
- ROC_DATATYPE_OPAQUE,
- ROC_DATATYPE_MAX_TYPES
-};
-
-enum ROC_ACCESS_TYPE {
- ROC_ACCESS_TYPE_NONE = 0,
- ROC_ACCESS_TYPE_RO,
- ROC_ACCESS_TYPE_WO,
- ROC_ACCESS_TYPE_RW
-};
-
class Kernel : public device::Kernel {
public:
- struct Argument {
- uint index_; //!< Argument's index in the OCL signature
- std::string name_; //!< Argument's name
- std::string typeName_; //!< Argument's type name
- uint size_; //!< Size in bytes
- uint alignment_; //!< Argument's alignment
- uint pointeeAlignment_; //!< Alignment of the data pointed to
- ROC_ARG_TYPE type_; //!< Type of the argument
- ROC_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
- ROC_DATA_TYPE dataType_; //!< The type of data
- ROC_ACCESS_TYPE access_; //!< Access type for the argument
- };
-
Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
@@ -97,7 +28,7 @@ class Kernel : public device::Kernel {
const uint32_t workitemPrivateSegmentByteSize() const { return workitemPrivateSegmentByteSize_; }
- const uint64_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; }
+ const uint32_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; }
const uint8_t KernargSegmentAlignment() const { return kernargSegmentAlignment_; }
@@ -108,63 +39,18 @@ class Kernel : public device::Kernel {
const Program* program() const { return static_cast<const Program*>(program_); }
- //! Returns the kernel argument list
- const std::vector<Argument*>& hsailArgs() const { return hsailArgList_; }
-
- //! Returns a pointer to the hsail argument at the specified index
- Argument* hsailArgAt(size_t index) const {
- for (auto arg : hsailArgList_)
- if (arg->index_ == index) return arg;
- assert(!"Should not reach here");
- return nullptr;
- }
-
//! Return printf info array
const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
- //! Returns TRUE if kernel uses dynamic parallelism
- bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
-
- //! set dynamic parallelism flag
- void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; }
-
- //! Return TRUE if kernel is internal blit kernel
- bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
-
- //! set internal kernel flag
- void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
-
- //! Return TRUE if kernel uses images
- bool imageEnable() const { return (flags_.imageEnable_) ? true : false; }
-
- //! Return TRUE if kernel wirtes images
- bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
-
- const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
-
protected:
- union Flags {
- struct {
- uint internalKernel_ : 1; //!< Is a blit kernel?
- uint imageEnable_ : 1; //!< Kernel uses images
- uint imageWrite_ : 1; //!< Kernel writes images
- uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
- };
- uint value_;
- Flags() : value_(0) {}
- } flags_;
-
-
- Program* program_; //!< The roc::Program context
- std::vector<Argument*> hsailArgList_; //!< Vector list of HSAIL Arguments
- uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
+ Program* program_; //!< The roc::Program context
+ uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
const uint32_t workgroupGroupSegmentByteSize_;
const uint32_t workitemPrivateSegmentByteSize_;
const uint32_t kernargSegmentByteSize_;
const uint32_t kernargSegmentAlignment_;
size_t kernelDirectiveOffset_;
std::vector<PrintfInfo> printf_;
- std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
};
#if defined(WITH_COMPILER_LIB)
@@ -183,9 +69,6 @@ class HSAILKernel : public roc::Kernel {
virtual bool init() final;
private:
- //! Populates hsailArgList_
- void initArguments(const aclArgData* aclArg);
-
//! Initializes HSAIL Printf metadata and info
void initPrintf(const aclPrintfFmt* aclPrintf);
};
@@ -206,9 +89,6 @@ class LightningKernel : public roc::Kernel {
virtual bool init() final;
private:
- //! Initializes Hsail Argument metadata and info for LC
- void initArguments(const KernelMD& kernelMD);
-
//! Initializes HSAIL Printf metadata and info for LC
void initPrintf(const std::vector<std::string>& printfInfoStrings);
};