P4 to Git Change 1599157 by gandryey@gera-ocl-lc on 2018/08/28 17:11:04
SWDEV-79445 - OCL generic changes and code clean-up
- Add devkernel.cpp/hpp files for device::Kernel object
- Move generic code for the arguments setup from the device layer to the abstraction layer
- Update ROCr and PAL paths to utilize the generic logic for the arguments setup
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#226 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#313 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#328 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#130 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#51 edit
[ROCm/clr commit: 5cfeb120ed]
This commit is contained in:
@@ -585,86 +585,6 @@ Settings::Settings() {
|
||||
//!< concurrent Virtual GPUs for default
|
||||
}
|
||||
|
||||
bool Kernel::createSignature(
|
||||
const parameters_t& params, uint32_t numParameters,
|
||||
uint32_t version) {
|
||||
std::stringstream attribs;
|
||||
if (workGroupInfo_.compileSize_[0] != 0) {
|
||||
attribs << "reqd_work_group_size(";
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
if (i != 0) {
|
||||
attribs << ",";
|
||||
}
|
||||
|
||||
attribs << workGroupInfo_.compileSize_[i];
|
||||
}
|
||||
attribs << ")";
|
||||
}
|
||||
if (workGroupInfo_.compileSizeHint_[0] != 0) {
|
||||
attribs << " work_group_size_hint(";
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
if (i != 0) {
|
||||
attribs << ",";
|
||||
}
|
||||
|
||||
attribs << workGroupInfo_.compileSizeHint_[i];
|
||||
}
|
||||
attribs << ")";
|
||||
}
|
||||
|
||||
if (!workGroupInfo_.compileVecTypeHint_.empty()) {
|
||||
attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")";
|
||||
}
|
||||
|
||||
// Destroy old signature if it was allocated before
|
||||
// (offline devices path)
|
||||
delete signature_;
|
||||
signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version);
|
||||
if (NULL != signature_) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//! Releases the signature owned by this kernel (deleting a null pointer is a no-op)
Kernel::~Kernel() {
  delete signature_;
}
|
||||
|
||||
std::string Kernel::openclMangledName(const std::string& name) {
|
||||
const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
|
||||
assert(bifSym && "symbol not found");
|
||||
return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
|
||||
}
|
||||
|
||||
//! Records the map request for \p mapAddress so it can be looked up later.
//! An existing entry for the same address is updated in place and its
//! count is bumped; otherwise a new entry is inserted.
void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
                         const amd::Coord3D region, uint mapFlags, bool entire,
                         amd::Image* baseMip) {
  // Map/Unmap must be serialized.
  amd::ScopedLock lock(owner()->lockMemoryOps());

  WriteMapInfo info = {};
  auto existing = writeMapInfo_.find(mapAddress);
  const bool alreadyMapped = (existing != writeMapInfo_.end());
  if (alreadyMapped) {
    LogWarning("Double map of the same or overlapped region!");
  }
  // Work on the stored entry when there is one, on the fresh local otherwise
  WriteMapInfo* pInfo = alreadyMapped ? &existing->second : &info;

  const bool writeMap = (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) != 0;
  if (writeMap) {
    pInfo->origin_ = origin;
    pInfo->region_ = region;
    pInfo->entire_ = entire;
    pInfo->unmapWrite_ = true;
  }
  if ((mapFlags & CL_MAP_READ) != 0) {
    pInfo->unmapRead_ = true;
  }
  pInfo->baseMip_ = baseMip;

  // Insert into the map if it's the first region
  ++pInfo->count_;
  if (pInfo->count_ == 1) {
    writeMapInfo_.insert({mapAddress, info});
  }
}
|
||||
|
||||
Program::Program(amd::Device& device)
|
||||
: device_(device),
|
||||
type_(TYPE_NONE),
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "amdocl/cl_kernel.h"
|
||||
#include "elf/elf.hpp"
|
||||
#include "appprofile.hpp"
|
||||
#include "devkernel.hpp"
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
#include "caching/cache.hpp"
|
||||
@@ -54,7 +55,6 @@ class PerfCounterCommand;
|
||||
class ReleaseObjectCommand;
|
||||
class StallQueueCommand;
|
||||
class Marker;
|
||||
class KernelSignature;
|
||||
class ThreadTraceCommand;
|
||||
class ThreadTraceMemObjectsCommand;
|
||||
class SignalCommand;
|
||||
@@ -74,9 +74,6 @@ namespace option {
|
||||
class Options;
|
||||
} // option
|
||||
|
||||
//! Pure-virtual callback interface; implementers receive a duration and a
//! wave count through callback().
struct ProfilingCallback : public amd::HeapObject {
|
||||
//! NOTE(review): units of \p duration are not visible here - confirm with the caller
virtual void callback(ulong duration, uint32_t waves) = 0;
|
||||
};
|
||||
}
|
||||
|
||||
enum OclExtensions {
|
||||
@@ -176,6 +173,7 @@ static constexpr int AmdVendor = 0x1002;
|
||||
namespace device {
|
||||
class ClBinary;
|
||||
class BlitManager;
|
||||
class Kernel;
|
||||
|
||||
//! Physical device properties.
|
||||
struct Info : public amd::EmbeddedObject {
|
||||
@@ -776,143 +774,6 @@ class Sampler : public amd::HeapObject {
|
||||
Sampler(const Sampler&);
|
||||
};
|
||||
|
||||
//! \class DeviceKernel, which will contain the common fields for any device
|
||||
class Kernel : public amd::HeapObject {
|
||||
public:
|
||||
typedef std::vector<amd::KernelParameterDescriptor> parameters_t;
|
||||
|
||||
//! \struct The device kernel workgroup info structure
|
||||
struct WorkGroupInfo : public amd::EmbeddedObject {
|
||||
size_t size_; //!< kernel workgroup size
|
||||
size_t compileSize_[3]; //!< kernel compiled workgroup size
|
||||
cl_ulong localMemSize_; //!< amount of used local memory
|
||||
size_t preferredSizeMultiple_; //!< preferred multiple for launch
|
||||
cl_ulong privateMemSize_; //!< amount of used private memory
|
||||
size_t scratchRegs_; //!< amount of used scratch registers
|
||||
size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD
|
||||
size_t wavefrontSize_; //!< number of threads per wavefront
|
||||
size_t availableGPRs_; //!< GPRs available to the program
|
||||
size_t usedGPRs_; //!< GPRs used by the program
|
||||
size_t availableSGPRs_; //!< SGPRs available to the program
|
||||
size_t usedSGPRs_; //!< SGPRs used by the program
|
||||
size_t availableVGPRs_; //!< VGPRs available to the program
|
||||
size_t usedVGPRs_; //!< VGPRs used by the program
|
||||
size_t availableLDSSize_; //!< available LDS size
|
||||
size_t usedLDSSize_; //!< used LDS size
|
||||
size_t availableStackSize_; //!< available stack size
|
||||
size_t usedStackSize_; //!< used stack size
|
||||
size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint
|
||||
std::string compileVecTypeHint_; //!< kernel compiled vector type hint
|
||||
bool uniformWorkGroupSize_; //!< uniform work group size option
|
||||
size_t wavesPerSimdHint_; //!< waves per simd hit
|
||||
};
|
||||
|
||||
//! Default constructor
|
||||
Kernel(const std::string& name) : name_(name), signature_(NULL), hsa_(false) {
|
||||
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
|
||||
// Due to std::string not being able to be memset to 0
|
||||
workGroupInfo_.size_ = 0;
|
||||
workGroupInfo_.compileSize_[0] = 0;
|
||||
workGroupInfo_.compileSize_[1] = 0;
|
||||
workGroupInfo_.compileSize_[2] = 0;
|
||||
workGroupInfo_.localMemSize_ = 0;
|
||||
workGroupInfo_.preferredSizeMultiple_ = 0;
|
||||
workGroupInfo_.privateMemSize_ = 0;
|
||||
workGroupInfo_.scratchRegs_ = 0;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = 0;
|
||||
workGroupInfo_.wavefrontSize_ = 0;
|
||||
workGroupInfo_.availableGPRs_ = 0;
|
||||
workGroupInfo_.usedGPRs_ = 0;
|
||||
workGroupInfo_.availableSGPRs_ = 0;
|
||||
workGroupInfo_.usedSGPRs_ = 0;
|
||||
workGroupInfo_.availableVGPRs_ = 0;
|
||||
workGroupInfo_.usedVGPRs_ = 0;
|
||||
workGroupInfo_.availableLDSSize_ = 0;
|
||||
workGroupInfo_.usedLDSSize_ = 0;
|
||||
workGroupInfo_.availableStackSize_ = 0;
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.compileSizeHint_[0] = 0;
|
||||
workGroupInfo_.compileSizeHint_[1] = 0;
|
||||
workGroupInfo_.compileSizeHint_[2] = 0;
|
||||
workGroupInfo_.compileVecTypeHint_ = "";
|
||||
workGroupInfo_.uniformWorkGroupSize_ = false;
|
||||
workGroupInfo_.wavesPerSimdHint_ = 0;
|
||||
}
|
||||
|
||||
//! Default destructor
|
||||
virtual ~Kernel();
|
||||
|
||||
//! Returns the kernel info structure
|
||||
const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; }
|
||||
|
||||
//! Returns the kernel signature
|
||||
const amd::KernelSignature& signature() const { return *signature_; }
|
||||
|
||||
//! Returns the kernel name
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
//! Initializes the kernel parameters for the abstraction layer
|
||||
bool createSignature(
|
||||
const parameters_t& params, uint32_t numParameters,
|
||||
uint32_t version);
|
||||
|
||||
//! Returns TRUE if it's a HSA kernel
|
||||
bool hsa() const { return hsa_; }
|
||||
|
||||
void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; }
|
||||
|
||||
bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; }
|
||||
|
||||
void setReqdWorkGroupSize(size_t x, size_t y, size_t z) {
|
||||
workGroupInfo_.compileSize_[0] = x;
|
||||
workGroupInfo_.compileSize_[1] = y;
|
||||
workGroupInfo_.compileSize_[2] = z;
|
||||
}
|
||||
|
||||
size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; }
|
||||
|
||||
void setWorkGroupSizeHint(size_t x, size_t y, size_t z) {
|
||||
workGroupInfo_.compileSizeHint_[0] = x;
|
||||
workGroupInfo_.compileSizeHint_[1] = y;
|
||||
workGroupInfo_.compileSizeHint_[2] = z;
|
||||
}
|
||||
|
||||
size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
|
||||
|
||||
//! Get profiling callback object
|
||||
virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
|
||||
|
||||
void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
|
||||
|
||||
void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; }
|
||||
|
||||
//! Return the build log
|
||||
const std::string& buildLog() const { return buildLog_; }
|
||||
|
||||
static std::string openclMangledName(const std::string& name);
|
||||
|
||||
protected:
|
||||
std::string name_; //!< kernel name
|
||||
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
|
||||
amd::KernelSignature* signature_; //!< kernel signature
|
||||
bool hsa_; //!< True if HSA kernel on GPU
|
||||
std::string buildLog_; //!< build log
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
Kernel(const Kernel&);
|
||||
|
||||
//! Disable operator=
|
||||
Kernel& operator=(const Kernel&);
|
||||
};
|
||||
|
||||
//! A program object for a specific device.
|
||||
class Program : public amd::HeapObject {
|
||||
public:
|
||||
@@ -1615,47 +1476,6 @@ class Device : public RuntimeObject {
|
||||
std::map<uintptr_t, device::Memory*>* vaCacheMap_; //!< VA cache map
|
||||
};
|
||||
|
||||
//! Describes a single kernel parameter: its type, its placement in the
//! parameter stack, packed per-argument flags, OpenCL qualifiers and names.
struct KernelParameterDescriptor {
|
||||
//! Object kinds; stored in InfoData::oclObject_ (a 4-bit field, so values must stay below 16)
enum {
|
||||
Value = 0,
|
||||
HiddenNone = 1,
|
||||
HiddenGlobalOffsetX = 2,
|
||||
HiddenGlobalOffsetY = 3,
|
||||
HiddenGlobalOffsetZ = 4,
|
||||
HiddenPrintfBuffer = 5,
|
||||
HiddenDefaultQueue = 6,
|
||||
HiddenCompletionAction = 7,
|
||||
MemoryObject = 8,
|
||||
ReferenceObject = 9,
|
||||
ValueObject = 10,
|
||||
ImageObject = 11,
|
||||
SamplerObject = 12,
|
||||
QueueObject = 13
|
||||
};
|
||||
clk_value_type_t type_; //!< The parameter's type
|
||||
size_t offset_; //!< Its offset in the parameter's stack
|
||||
size_t size_; //!< Its size in bytes
|
||||
//! Per-argument flags packed into 32 bits; allValues_ aliases all bitfields at once
union InfoData {
|
||||
struct {
|
||||
uint32_t oclObject_ : 4; //!< OCL object type
|
||||
uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
|
||||
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
|
||||
uint32_t defined_ : 1; //!< The argument was defined by the app
|
||||
uint32_t reserved_ : 1; //!< reserved
|
||||
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
|
||||
};
|
||||
uint32_t allValues_;
|
||||
//! Clears every flag by zeroing the aliased 32-bit word
InfoData() : allValues_(0) {}
|
||||
} info_;
|
||||
|
||||
cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier
|
||||
cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier
|
||||
cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier
|
||||
|
||||
std::string name_; //!< The parameter's name in the source
|
||||
std::string typeName_; //!< Argument's type name
|
||||
};
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Compilation process with cache support.
|
||||
class CacheCompilation : public amd::HeapObject {
|
||||
|
||||
@@ -0,0 +1,772 @@
|
||||
//
|
||||
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "platform/runtime.hpp"
|
||||
#include "platform/program.hpp"
|
||||
#include "devkernel.hpp"
|
||||
#include "utils/macros.hpp"
|
||||
#include "utils/options.hpp"
|
||||
#include "utils/bif_section_labels.hpp"
|
||||
#include "utils/libUtils.h"
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "acl.h"
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
#include "llvm/Support/AMDGPUMetadata.h"
|
||||
|
||||
typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
namespace device {
|
||||
|
||||
bool Kernel::createSignature(
|
||||
const parameters_t& params, uint32_t numParameters,
|
||||
uint32_t version) {
|
||||
std::stringstream attribs;
|
||||
if (workGroupInfo_.compileSize_[0] != 0) {
|
||||
attribs << "reqd_work_group_size(";
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
if (i != 0) {
|
||||
attribs << ",";
|
||||
}
|
||||
|
||||
attribs << workGroupInfo_.compileSize_[i];
|
||||
}
|
||||
attribs << ")";
|
||||
}
|
||||
if (workGroupInfo_.compileSizeHint_[0] != 0) {
|
||||
attribs << " work_group_size_hint(";
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
if (i != 0) {
|
||||
attribs << ",";
|
||||
}
|
||||
|
||||
attribs << workGroupInfo_.compileSizeHint_[i];
|
||||
}
|
||||
attribs << ")";
|
||||
}
|
||||
|
||||
if (!workGroupInfo_.compileVecTypeHint_.empty()) {
|
||||
attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")";
|
||||
}
|
||||
|
||||
// Destroy old signature if it was allocated before
|
||||
// (offline devices path)
|
||||
delete signature_;
|
||||
signature_ = new amd::KernelSignature(params, attribs.str(), numParameters, version);
|
||||
if (NULL != signature_) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//! Frees the kernel signature, if one was created
Kernel::~Kernel() {
  delete signature_;
}
|
||||
|
||||
std::string Kernel::openclMangledName(const std::string& name) {
|
||||
const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
|
||||
assert(bifSym && "symbol not found");
|
||||
return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST];
|
||||
}
|
||||
|
||||
//! Stores the parameters of a map request keyed by \p mapAddress.
//! If the address is already tracked, the stored entry is updated and its
//! count incremented; otherwise a freshly filled entry is inserted.
void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin,
                         const amd::Coord3D region, uint mapFlags, bool entire,
                         amd::Image* baseMip) {
  // Map/Unmap must be serialized.
  amd::ScopedLock lock(owner()->lockMemoryOps());

  WriteMapInfo info = {};
  auto existing = writeMapInfo_.find(mapAddress);
  const bool alreadyMapped = (existing != writeMapInfo_.end());
  if (alreadyMapped) {
    LogWarning("Double map of the same or overlapped region!");
  }
  // Update the tracked entry when present, the local one otherwise
  WriteMapInfo* pInfo = alreadyMapped ? &existing->second : &info;

  const bool writeMap = (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) != 0;
  if (writeMap) {
    pInfo->origin_ = origin;
    pInfo->region_ = region;
    pInfo->entire_ = entire;
    pInfo->unmapWrite_ = true;
  }
  if ((mapFlags & CL_MAP_READ) != 0) {
    pInfo->unmapRead_ = true;
  }
  pInfo->baseMip_ = baseMip;

  // Insert into the map if it's the first region
  ++pInfo->count_;
  if (pInfo->count_ == 1) {
    writeMapInfo_.insert({ mapAddress, info });
  }
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
using llvm::AMDGPU::HSAMD::AccessQualifier;
|
||||
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
||||
using llvm::AMDGPU::HSAMD::ValueKind;
|
||||
using llvm::AMDGPU::HSAMD::ValueType;
|
||||
|
||||
static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) {
|
||||
switch (lcArg.mValueKind) {
|
||||
case ValueKind::GlobalBuffer:
|
||||
case ValueKind::DynamicSharedPointer:
|
||||
case ValueKind::Pipe:
|
||||
return amd::KernelParameterDescriptor::MemoryObject;
|
||||
case ValueKind::ByValue:
|
||||
return amd::KernelParameterDescriptor::ValueObject;
|
||||
case ValueKind::Image:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case ValueKind::Sampler:
|
||||
return amd::KernelParameterDescriptor::SamplerObject;
|
||||
case ValueKind::HiddenGlobalOffsetX:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
|
||||
case ValueKind::HiddenGlobalOffsetY:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
|
||||
case ValueKind::HiddenGlobalOffsetZ:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
|
||||
case ValueKind::HiddenPrintfBuffer:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
|
||||
case ValueKind::HiddenDefaultQueue:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
|
||||
case ValueKind::HiddenCompletionAction:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
case ValueKind::HiddenNone:
|
||||
default:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
|
||||
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
|
||||
*isHidden = true;
|
||||
if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
|
||||
}
|
||||
else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
|
||||
}
|
||||
else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
|
||||
}
|
||||
else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
|
||||
}
|
||||
else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
|
||||
}
|
||||
else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
}
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return amd::KernelParameterDescriptor::MemoryObject;
|
||||
case ARG_TYPE_QUEUE:
|
||||
return amd::KernelParameterDescriptor::QueueObject;
|
||||
case ARG_TYPE_VALUE:
|
||||
return (argInfo->arg.value.data == DATATYPE_struct) ?
|
||||
amd::KernelParameterDescriptor::ReferenceObject :
|
||||
amd::KernelParameterDescriptor::ValueObject;
|
||||
case ARG_TYPE_IMAGE:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return amd::KernelParameterDescriptor::SamplerObject;
|
||||
case ARG_TYPE_ERROR:
|
||||
default:
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//! Lookup table of value types: one row per element type (char, short, int,
//! long, float, double), one column per vector width (1, 2, 3, 4, 8, 16).
static const clk_value_type_t ClkValueMapType[6][6] = {
|
||||
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
|
||||
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
|
||||
{ T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
|
||||
{ T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
|
||||
{ T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
|
||||
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
|
||||
};
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Derives the value type of a Lightning Compiler argument.
//! Buffers, pipes and images map to T_POINTER, samplers to T_SAMPLER.
//! By-value arguments are looked up in ClkValueMapType from their element
//! type and from \p size divided by the element size; anything that does
//! not fit (structs, unsupported widths, other kinds) yields T_VOID.
static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
  // Everything except by-value arguments is classified by kind alone
  switch (lcArg.mValueKind) {
    case ValueKind::ByValue:
      break;
    case ValueKind::GlobalBuffer:
    case ValueKind::DynamicSharedPointer:
    case ValueKind::Pipe:
    case ValueKind::Image:
      return T_POINTER;
    case ValueKind::Sampler:
      return T_SAMPLER;
    default:
      return T_VOID;
  }

  // Row of ClkValueMapType and size in bytes of a single element
  uint row;
  size_t elemBytes;
  switch (lcArg.mValueType) {
    case ValueType::I8:
    case ValueType::U8:
      row = 0;
      elemBytes = 1;
      break;
    case ValueType::I16:
    case ValueType::U16:
      row = 1;
      elemBytes = 2;
      break;
    case ValueType::I32:
    case ValueType::U32:
      row = 2;
      elemBytes = 4;
      break;
    case ValueType::I64:
    case ValueType::U64:
      row = 3;
      elemBytes = 8;
      break;
    case ValueType::F16:
      // Half values use the float row of the table
      row = 4;
      elemBytes = 2;
      break;
    case ValueType::F32:
      row = 4;
      elemBytes = 4;
      break;
    case ValueType::F64:
      row = 5;
      elemBytes = 8;
      break;
    case ValueType::Struct:
    default:
      return T_VOID;
  }

  // Column of ClkValueMapType selected by the vector width
  const uint numElements = size / elemBytes;
  uint column;
  switch (numElements) {
    case 1:
      column = 0;
      break;
    case 2:
      column = 1;
      break;
    case 3:
      column = 2;
      break;
    case 4:
      column = 3;
      break;
    case 8:
      column = 4;
      break;
    case 16:
      column = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[row][column];
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Derives the value type of a compiler-library argument.
//! Queues map to T_QUEUE, pointers and images to T_POINTER, samplers to
//! T_SAMPLER. Value arguments are looked up in ClkValueMapType from their
//! element type and from \p size divided by the element size; anything
//! that does not fit yields T_VOID.
static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
  switch (argInfo->type) {
    case ARG_TYPE_QUEUE:
      return T_QUEUE;
    case ARG_TYPE_POINTER:
    case ARG_TYPE_IMAGE:
      return T_POINTER;
    case ARG_TYPE_SAMPLER:
      return T_SAMPLER;
    case ARG_TYPE_VALUE:
      break;
    default:
      return T_VOID;
  }

  // Row of ClkValueMapType and size in bytes of a single element
  uint row;
  size_t elemBytes;
  switch (argInfo->arg.value.data) {
    case DATATYPE_i8:
    case DATATYPE_u8:
      row = 0;
      elemBytes = 1;
      break;
    case DATATYPE_i16:
    case DATATYPE_u16:
      row = 1;
      elemBytes = 2;
      break;
    case DATATYPE_i32:
    case DATATYPE_u32:
      row = 2;
      elemBytes = 4;
      break;
    case DATATYPE_i64:
    case DATATYPE_u64:
      row = 3;
      elemBytes = 8;
      break;
    case DATATYPE_f16:
      // Half values use the float row of the table
      row = 4;
      elemBytes = 2;
      break;
    case DATATYPE_f32:
      row = 4;
      elemBytes = 4;
      break;
    case DATATYPE_f64:
      row = 5;
      elemBytes = 8;
      break;
    case DATATYPE_struct:
    case DATATYPE_opaque:
    case DATATYPE_ERROR:
    default:
      return T_VOID;
  }

  // Column of ClkValueMapType selected by the vector width
  const uint numElements = size / elemBytes;
  uint column;
  switch (numElements) {
    case 1:
      column = 0;
      break;
    case 2:
      column = 1;
      break;
    case 3:
      column = 2;
      break;
    case 4:
      column = 3;
      break;
    case 8:
      column = 4;
      break;
    case 16:
      column = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[row][column];
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the alignment stored in the Lightning Compiler argument metadata
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) {
  return lcArg.mAlign;
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
return 1;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8;
|
||||
case DATATYPE_struct:
|
||||
return 128;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
return sizeof(cl_mem);
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return sizeof(cl_sampler);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
|
||||
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
||||
uint32_t align = lcArg.mPointeeAlign;
|
||||
if (align == 0) {
|
||||
LogWarning("Missing DynamicSharedPointer alignment");
|
||||
align = 128; /* worst case alignment */
|
||||
}
|
||||
return align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
return argInfo->arg.pointer.align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
|
||||
if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
|
||||
switch (lcArg.mAccQual) {
|
||||
case AccessQualifier::ReadOnly:
|
||||
return true;
|
||||
case AccessQualifier::WriteOnly:
|
||||
case AccessQualifier::ReadWrite:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false;
|
||||
}
|
||||
else if (argInfo->type == ARG_TYPE_IMAGE) {
|
||||
return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the argument size stored in the Lightning Compiler metadata
static inline int GetArgSizeOCL(const KernelArgMD& lcArg) {
  return lcArg.mSize;
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
inline static int GetArgSizeOCL(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
case DATATYPE_struct:
|
||||
return 1 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
case ARG_TYPE_SAMPLER:
|
||||
case ARG_TYPE_QUEUE:
|
||||
return sizeof(void*);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps a Lightning Compiler argument to its OpenCL address qualifier.
//! Dynamic shared pointers are local, images and pipes are global, global
//! buffers follow their address space qualifier, and everything else is
//! private. A global buffer with an unexpected address space is logged as
//! an error and reported as private.
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
  switch (lcArg.mValueKind) {
    case ValueKind::DynamicSharedPointer:
      return CL_KERNEL_ARG_ADDRESS_LOCAL;

    case ValueKind::GlobalBuffer: {
      const auto addrSpace = lcArg.mAddrSpaceQual;
      if (addrSpace == AddressSpaceQualifier::Constant) {
        return CL_KERNEL_ARG_ADDRESS_CONSTANT;
      }
      if (addrSpace == AddressSpaceQualifier::Global ||
          addrSpace == AddressSpaceQualifier::Generic) {
        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
      }
      LogError("Unsupported address type");
      return CL_KERNEL_ARG_ADDRESS_PRIVATE;
    }

    case ValueKind::Image:
    case ValueKind::Pipe:
      return CL_KERNEL_ARG_ADDRESS_GLOBAL;

    default:
      // default for all other cases
      return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps a compiler-library argument to its OpenCL address qualifier.
//! Images and queues are global, pointers follow their memory type, and
//! everything else is private. A pointer with an unexpected memory type is
//! logged as an error and reported as private.
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
  if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) {
    return CL_KERNEL_ARG_ADDRESS_GLOBAL;
  }
  if (argInfo->type != ARG_TYPE_POINTER) {
    // default for all other cases
    return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }

  switch (argInfo->arg.pointer.memory) {
    case PTR_MT_UAV_CONSTANT:
    case PTR_MT_CONSTANT_EMU:
    case PTR_MT_CONSTANT:
      return CL_KERNEL_ARG_ADDRESS_CONSTANT;
    case PTR_MT_UAV:
    case PTR_MT_GLOBAL:
    case PTR_MT_SCRATCH_EMU:
      return CL_KERNEL_ARG_ADDRESS_GLOBAL;
    case PTR_MT_LDS_EMU:
    case PTR_MT_LDS:
      return CL_KERNEL_ARG_ADDRESS_LOCAL;
    case PTR_MT_ERROR:
    default:
      LogError("Unsupported address type");
      return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps a Lightning Compiler argument to its OpenCL access qualifier.
//! Only images carry one; any image that is neither read-only nor
//! write-only is reported as read-write.
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
  if (lcArg.mValueKind != ValueKind::Image) {
    return CL_KERNEL_ARG_ACCESS_NONE;
  }

  const auto access = lcArg.mAccQual;
  if (access == AccessQualifier::ReadOnly) {
    return CL_KERNEL_ARG_ACCESS_READ_ONLY;
  }
  if (access == AccessQualifier::WriteOnly) {
    return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
  }
  return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps a compiler-library argument to its OpenCL access qualifier.
//! Only images carry one; any image that is neither ACCESS_TYPE_RO nor
//! ACCESS_TYPE_WO is reported as read-write.
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
  if (argInfo->type != ARG_TYPE_IMAGE) {
    return CL_KERNEL_ARG_ACCESS_NONE;
  }

  const auto access = argInfo->arg.image.type;
  if (access == ACCESS_TYPE_RO) {
    return CL_KERNEL_ARG_ACCESS_READ_ONLY;
  }
  if (access == ACCESS_TYPE_WO) {
    return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
  }
  return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Builds the OpenCL type qualifier bit mask from the LC kernel argument metadata.
//! Global buffers and dynamic shared pointers collect the volatile/restrict/const bits.
//! Any other argument flagged as a pipe gets the pipe bit only;
//! everything else returns CL_KERNEL_ARG_TYPE_NONE.
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
  cl_kernel_arg_type_qualifier qualifiers = CL_KERNEL_ARG_TYPE_NONE;

  const bool pointerKind = (lcArg.mValueKind == ValueKind::GlobalBuffer) ||
      (lcArg.mValueKind == ValueKind::DynamicSharedPointer);

  if (!pointerKind) {
    if (lcArg.mIsPipe) {
      // The pipe flag is expected on pipe arguments only
      assert(lcArg.mValueKind == ValueKind::Pipe);
      qualifiers |= CL_KERNEL_ARG_TYPE_PIPE;
    }
    return qualifiers;
  }

  if (lcArg.mIsVolatile) {
    qualifiers |= CL_KERNEL_ARG_TYPE_VOLATILE;
  }
  if (lcArg.mIsRestrict) {
    qualifiers |= CL_KERNEL_ARG_TYPE_RESTRICT;
  }
  if (lcArg.mIsConst) {
    qualifiers |= CL_KERNEL_ARG_TYPE_CONST;
  }
  return qualifiers;
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Builds the OpenCL type qualifier bit mask from the ACL argument data.
//! Only pointer arguments carry qualifiers; all other argument types
//! return CL_KERNEL_ARG_TYPE_NONE.
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
  cl_kernel_arg_type_qualifier qualifiers = CL_KERNEL_ARG_TYPE_NONE;
  if (argInfo->type != ARG_TYPE_POINTER) {
    return qualifiers;
  }

  if (argInfo->arg.pointer.isVolatile) {
    qualifiers |= CL_KERNEL_ARG_TYPE_VOLATILE;
  }
  if (argInfo->arg.pointer.isRestrict) {
    qualifiers |= CL_KERNEL_ARG_TYPE_RESTRICT;
  }
  if (argInfo->arg.pointer.isPipe) {
    qualifiers |= CL_KERNEL_ARG_TYPE_PIPE;
  }

  // The const bit is set either by the explicit flag or by any of
  // the constant memory types
  const bool constantMemory = (argInfo->arg.pointer.memory == PTR_MT_CONSTANT) ||
      (argInfo->arg.pointer.memory == PTR_MT_UAV_CONSTANT) ||
      (argInfo->arg.pointer.memory == PTR_MT_CONSTANT_EMU);
  if (argInfo->isConst || constantMemory) {
    qualifiers |= CL_KERNEL_ARG_TYPE_CONST;
  }
  return qualifiers;
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argBufferSize;
|
||||
|
||||
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
|
||||
const KernelArgMD& lcArg = kernelMD.mArgs[i];
|
||||
|
||||
size_t size = GetArgSizeOCL(lcArg);
|
||||
size_t alignment = GetArgAlignmentOCL(lcArg);
|
||||
bool isHidden = false;
|
||||
desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = size;
|
||||
offset += size;
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = lcArg.mName.c_str();
|
||||
desc.type_ = GetOclTypeOCL(lcArg, size);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
|
||||
desc.addressQualifier_ = GetOclAddrQualOCL(lcArg);
|
||||
desc.accessQualifier_ = GetOclAccessQualOCL(lcArg);
|
||||
desc.typeQualifier_ = GetOclTypeQualOCL(lcArg);
|
||||
desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg);
|
||||
desc.size_ = size;
|
||||
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
}
|
||||
else {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = GetReadOnlyOCL(lcArg);
|
||||
|
||||
params.push_back(desc);
|
||||
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
flags_.imageEna_ = true;
|
||||
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
||||
flags_.imageWriteEna_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argBufferSize;
|
||||
|
||||
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
||||
size_t size = GetArgSizeOCL(aclArg);
|
||||
size_t alignment = GetArgAlignmentOCL(aclArg);
|
||||
bool isHidden = false;
|
||||
desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden);
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = size;
|
||||
offset += size;
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = aclArg->argStr;
|
||||
desc.typeName_ = aclArg->typeStr;
|
||||
desc.type_ = GetOclTypeOCL(aclArg, size);
|
||||
|
||||
desc.addressQualifier_ = GetOclAddrQualOCL(aclArg);
|
||||
desc.accessQualifier_ = GetOclAccessQualOCL(aclArg);
|
||||
desc.typeQualifier_ = GetOclTypeQualOCL(aclArg);
|
||||
desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg);
|
||||
desc.size_ = size;
|
||||
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({ desc.offset_, offset });
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else {
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
}
|
||||
else {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
}
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = GetReadOnlyOCL(aclArg);
|
||||
|
||||
params.push_back(desc);
|
||||
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
flags_.imageEna_ = true;
|
||||
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
||||
flags_.imageWriteEna_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
@@ -0,0 +1,269 @@
|
||||
//
|
||||
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include "include/aclTypes.h"
|
||||
#include "platform/context.hpp"
|
||||
#include "platform/object.hpp"
|
||||
#include "platform/memory.hpp"
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
namespace llvm {
|
||||
namespace AMDGPU {
|
||||
namespace HSAMD {
|
||||
namespace Kernel {
|
||||
struct Metadata;
|
||||
}}}}
|
||||
typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
namespace amd {
|
||||
namespace hsa {
|
||||
namespace loader {
|
||||
class Symbol;
|
||||
} // loader
|
||||
namespace code {
|
||||
namespace Kernel {
|
||||
class Metadata;
|
||||
} // Kernel
|
||||
} // code
|
||||
} // hsa
|
||||
} // amd
|
||||
|
||||
namespace amd {
|
||||
|
||||
class Device;
|
||||
class KernelSignature;
|
||||
|
||||
struct ProfilingCallback : public amd::HeapObject {
|
||||
virtual void callback(ulong duration, uint32_t waves) = 0;
|
||||
};
|
||||
|
||||
struct KernelParameterDescriptor {
|
||||
enum {
|
||||
Value = 0,
|
||||
HiddenNone = 1,
|
||||
HiddenGlobalOffsetX = 2,
|
||||
HiddenGlobalOffsetY = 3,
|
||||
HiddenGlobalOffsetZ = 4,
|
||||
HiddenPrintfBuffer = 5,
|
||||
HiddenDefaultQueue = 6,
|
||||
HiddenCompletionAction = 7,
|
||||
MemoryObject = 8,
|
||||
ReferenceObject = 9,
|
||||
ValueObject = 10,
|
||||
ImageObject = 11,
|
||||
SamplerObject = 12,
|
||||
QueueObject = 13
|
||||
};
|
||||
clk_value_type_t type_; //!< The parameter's type
|
||||
size_t offset_; //!< Its offset in the parameter's stack
|
||||
size_t size_; //!< Its size in bytes
|
||||
union InfoData {
|
||||
struct {
|
||||
uint32_t oclObject_ : 4; //!< OCL object type
|
||||
uint32_t readOnly_ : 1; //!< OCL object is read only, applied to memory only
|
||||
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
|
||||
uint32_t defined_ : 1; //!< The argument was defined by the app
|
||||
uint32_t reserved_ : 1; //!< reserved
|
||||
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
|
||||
};
|
||||
uint32_t allValues_;
|
||||
InfoData() : allValues_(0) {}
|
||||
} info_;
|
||||
|
||||
cl_kernel_arg_address_qualifier addressQualifier_; //!< Argument's address qualifier
|
||||
cl_kernel_arg_access_qualifier accessQualifier_; //!< Argument's access qualifier
|
||||
cl_kernel_arg_type_qualifier typeQualifier_; //!< Argument's type qualifier
|
||||
|
||||
std::string name_; //!< The parameter's name in the source
|
||||
std::string typeName_; //!< Argument's type name
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace device {
|
||||
|
||||
//! \class DeviceKernel, which will contain the common fields for any device
|
||||
class Kernel : public amd::HeapObject {
|
||||
public:
|
||||
typedef std::vector<amd::KernelParameterDescriptor> parameters_t;
|
||||
|
||||
//! \struct The device kernel workgroup info structure
|
||||
struct WorkGroupInfo : public amd::EmbeddedObject {
|
||||
size_t size_; //!< kernel workgroup size
|
||||
size_t compileSize_[3]; //!< kernel compiled workgroup size
|
||||
cl_ulong localMemSize_; //!< amount of used local memory
|
||||
size_t preferredSizeMultiple_; //!< preferred multiple for launch
|
||||
cl_ulong privateMemSize_; //!< amount of used private memory
|
||||
size_t scratchRegs_; //!< amount of used scratch registers
|
||||
size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD
|
||||
size_t wavefrontSize_; //!< number of threads per wavefront
|
||||
size_t availableGPRs_; //!< GPRs available to the program
|
||||
size_t usedGPRs_; //!< GPRs used by the program
|
||||
size_t availableSGPRs_; //!< SGPRs available to the program
|
||||
size_t usedSGPRs_; //!< SGPRs used by the program
|
||||
size_t availableVGPRs_; //!< VGPRs available to the program
|
||||
size_t usedVGPRs_; //!< VGPRs used by the program
|
||||
size_t availableLDSSize_; //!< available LDS size
|
||||
size_t usedLDSSize_; //!< used LDS size
|
||||
size_t availableStackSize_; //!< available stack size
|
||||
size_t usedStackSize_; //!< used stack size
|
||||
size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint
|
||||
std::string compileVecTypeHint_; //!< kernel compiled vector type hint
|
||||
bool uniformWorkGroupSize_; //!< uniform work group size option
|
||||
size_t wavesPerSimdHint_; //!< waves per simd hit
|
||||
};
|
||||
|
||||
//! Default constructor
|
||||
Kernel(const std::string& name) : name_(name), signature_(NULL) {
|
||||
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
|
||||
// Due to std::string not being able to be memset to 0
|
||||
workGroupInfo_.size_ = 0;
|
||||
workGroupInfo_.compileSize_[0] = 0;
|
||||
workGroupInfo_.compileSize_[1] = 0;
|
||||
workGroupInfo_.compileSize_[2] = 0;
|
||||
workGroupInfo_.localMemSize_ = 0;
|
||||
workGroupInfo_.preferredSizeMultiple_ = 0;
|
||||
workGroupInfo_.privateMemSize_ = 0;
|
||||
workGroupInfo_.scratchRegs_ = 0;
|
||||
workGroupInfo_.wavefrontPerSIMD_ = 0;
|
||||
workGroupInfo_.wavefrontSize_ = 0;
|
||||
workGroupInfo_.availableGPRs_ = 0;
|
||||
workGroupInfo_.usedGPRs_ = 0;
|
||||
workGroupInfo_.availableSGPRs_ = 0;
|
||||
workGroupInfo_.usedSGPRs_ = 0;
|
||||
workGroupInfo_.availableVGPRs_ = 0;
|
||||
workGroupInfo_.usedVGPRs_ = 0;
|
||||
workGroupInfo_.availableLDSSize_ = 0;
|
||||
workGroupInfo_.usedLDSSize_ = 0;
|
||||
workGroupInfo_.availableStackSize_ = 0;
|
||||
workGroupInfo_.usedStackSize_ = 0;
|
||||
workGroupInfo_.compileSizeHint_[0] = 0;
|
||||
workGroupInfo_.compileSizeHint_[1] = 0;
|
||||
workGroupInfo_.compileSizeHint_[2] = 0;
|
||||
workGroupInfo_.compileVecTypeHint_ = "";
|
||||
workGroupInfo_.uniformWorkGroupSize_ = false;
|
||||
workGroupInfo_.wavesPerSimdHint_ = 0;
|
||||
}
|
||||
|
||||
//! Default destructor
|
||||
virtual ~Kernel();
|
||||
|
||||
//! Returns the kernel info structure
|
||||
const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; }
|
||||
|
||||
//! Returns the kernel signature
|
||||
const amd::KernelSignature& signature() const { return *signature_; }
|
||||
|
||||
//! Returns the kernel name
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
//! Initializes the kernel parameters for the abstraction layer
|
||||
bool createSignature(
|
||||
const parameters_t& params, uint32_t numParameters,
|
||||
uint32_t version);
|
||||
|
||||
void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; }
|
||||
|
||||
bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; }
|
||||
|
||||
void setReqdWorkGroupSize(size_t x, size_t y, size_t z) {
|
||||
workGroupInfo_.compileSize_[0] = x;
|
||||
workGroupInfo_.compileSize_[1] = y;
|
||||
workGroupInfo_.compileSize_[2] = z;
|
||||
}
|
||||
|
||||
size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; }
|
||||
|
||||
void setWorkGroupSizeHint(size_t x, size_t y, size_t z) {
|
||||
workGroupInfo_.compileSizeHint_[0] = x;
|
||||
workGroupInfo_.compileSizeHint_[1] = y;
|
||||
workGroupInfo_.compileSizeHint_[2] = z;
|
||||
}
|
||||
|
||||
size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
|
||||
|
||||
//! Get profiling callback object
|
||||
virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }
|
||||
|
||||
void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; }
|
||||
|
||||
void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; }
|
||||
|
||||
//! Return the build log
|
||||
const std::string& buildLog() const { return buildLog_; }
|
||||
|
||||
static std::string openclMangledName(const std::string& name);
|
||||
|
||||
const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
|
||||
|
||||
//! Returns TRUE if kernel uses dynamic parallelism
|
||||
bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
|
||||
|
||||
//! set dynamic parallelism flag
|
||||
void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag; }
|
||||
|
||||
//! Returns TRUE if kernel is internal kernel
|
||||
bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
|
||||
|
||||
//! set internal kernel flag
|
||||
void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
|
||||
|
||||
//! Return TRUE if kernel uses images
|
||||
bool imageEnable() const { return (flags_.imageEna_) ? true : false; }
|
||||
|
||||
//! Return TRUE if kernel wirtes images
|
||||
bool imageWrite() const { return (flags_.imageWriteEna_) ? true : false; }
|
||||
|
||||
//! Returns TRUE if it's a HSA kernel
|
||||
bool hsa() const { return (flags_.hsa_) ? true : false; }
|
||||
|
||||
protected:
|
||||
//! Initializes the abstraction layer kernel parameters
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
|
||||
#endif
|
||||
#if defined(WITH_COMPILER_LIB) || !defined(WITH_LIGHTNING_COMPILER)
|
||||
void InitParameters(
|
||||
const aclArgData* aclArg, //!< List of ACL arguments
|
||||
uint32_t argBufferSize
|
||||
);
|
||||
#endif
|
||||
std::string name_; //!< kernel name
|
||||
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
|
||||
amd::KernelSignature* signature_; //!< kernel signature
|
||||
std::string buildLog_; //!< build log
|
||||
|
||||
union Flags {
|
||||
struct {
|
||||
uint imageEna_ : 1; //!< Kernel uses images
|
||||
uint imageWriteEna_ : 1; //!< Kernel uses image writes
|
||||
uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
|
||||
uint internalKernel_ : 1; //!< True: internal kernel
|
||||
uint hsa_ : 1; //!< HSA kernel
|
||||
};
|
||||
uint value_;
|
||||
Flags() : value_(0) {}
|
||||
} flags_;
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
Kernel(const Kernel&);
|
||||
|
||||
//! Disable operator=
|
||||
Kernel& operator=(const Kernel&);
|
||||
|
||||
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
@@ -823,7 +823,6 @@ Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& pro
|
||||
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
|
||||
// Default wavesPerSimdHint_
|
||||
workGroupInfo_.wavesPerSimdHint_ = ~0U;
|
||||
hsa_ = false;
|
||||
}
|
||||
|
||||
Kernel::~Kernel() {
|
||||
@@ -3127,7 +3126,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
|
||||
extraArgumentsNum_(extraArgsNum),
|
||||
waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) *
|
||||
dev().hwInfo()->simdPerCU_) {
|
||||
hsa_ = true;
|
||||
flags_.hsa_ = true;
|
||||
}
|
||||
|
||||
HSAILKernel::~HSAILKernel() {
|
||||
|
||||
@@ -814,12 +814,6 @@ class HSAILKernel : public device::Kernel {
|
||||
//! Returns spill reg size per workitem
|
||||
int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; }
|
||||
|
||||
//! Returns TRUE if kernel uses dynamic parallelism
|
||||
bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
|
||||
|
||||
//! Returns TRUE if kernel is internal kernel
|
||||
bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
|
||||
|
||||
//! Finds local workgroup size
|
||||
void findLocalWorkSize(size_t workDim, //!< Work dimension
|
||||
const amd::NDRange& gblWorkSize, //!< Global work size
|
||||
@@ -895,17 +889,6 @@ class HSAILKernel : public device::Kernel {
|
||||
|
||||
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
|
||||
|
||||
union Flags {
|
||||
struct {
|
||||
uint imageEna_ : 1; //!< Kernel uses images
|
||||
uint imageWriteEna_ : 1; //!< Kernel uses image writes
|
||||
uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
|
||||
uint internalKernel_ : 1; //!< True: internal kernel
|
||||
};
|
||||
uint value_;
|
||||
Flags() : value_(0) {}
|
||||
} flags_;
|
||||
|
||||
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
|
||||
};
|
||||
|
||||
|
||||
@@ -21,654 +21,6 @@
|
||||
|
||||
namespace pal {
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
using llvm::AMDGPU::HSAMD::AccessQualifier;
|
||||
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
||||
using llvm::AMDGPU::HSAMD::ValueKind;
|
||||
using llvm::AMDGPU::HSAMD::ValueType;
|
||||
|
||||
static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isHidden) {
|
||||
switch (lcArg.mValueKind) {
|
||||
case ValueKind::GlobalBuffer:
|
||||
case ValueKind::DynamicSharedPointer:
|
||||
case ValueKind::Pipe:
|
||||
return amd::KernelParameterDescriptor::MemoryObject;
|
||||
case ValueKind::ByValue:
|
||||
return amd::KernelParameterDescriptor::ValueObject;
|
||||
case ValueKind::Image:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case ValueKind::Sampler:
|
||||
return amd::KernelParameterDescriptor::SamplerObject;
|
||||
case ValueKind::HiddenGlobalOffsetX:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
|
||||
case ValueKind::HiddenGlobalOffsetY:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
|
||||
case ValueKind::HiddenGlobalOffsetZ:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
|
||||
case ValueKind::HiddenPrintfBuffer:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
|
||||
case ValueKind::HiddenDefaultQueue:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
|
||||
case ValueKind::HiddenCompletionAction:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
case ValueKind::HiddenNone:
|
||||
default:
|
||||
*isHidden = true;
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* isHidden) {
|
||||
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
|
||||
*isHidden = true;
|
||||
if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
|
||||
} else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
|
||||
} else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
|
||||
} else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
|
||||
} else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
|
||||
} else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
}
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return amd::KernelParameterDescriptor::MemoryObject;
|
||||
case ARG_TYPE_QUEUE:
|
||||
return amd::KernelParameterDescriptor::QueueObject;
|
||||
case ARG_TYPE_VALUE:
|
||||
return (argInfo->arg.value.data == DATATYPE_struct) ?
|
||||
amd::KernelParameterDescriptor::ReferenceObject :
|
||||
amd::KernelParameterDescriptor::ValueObject;
|
||||
case ARG_TYPE_IMAGE:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return amd::KernelParameterDescriptor::SamplerObject;
|
||||
case ARG_TYPE_ERROR:
|
||||
default:
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//! Lookup table for the value types of the by-value arguments.
//! Rows:    0 - char, 1 - short, 2 - int, 3 - long, 4 - float, 5 - double
//! Columns: 0 - scalar, then the vectors of 2, 3, 4, 8 and 16 elements
static const clk_value_type_t ClkValueMapType[6][6] = {
  // scalar     x2          x3          x4          x8          x16
  { T_CHAR,   T_CHAR2,   T_CHAR3,   T_CHAR4,   T_CHAR8,   T_CHAR16 },
  { T_SHORT,  T_SHORT2,  T_SHORT3,  T_SHORT4,  T_SHORT8,  T_SHORT16 },
  { T_INT,    T_INT2,    T_INT3,    T_INT4,    T_INT8,    T_INT16 },
  { T_LONG,   T_LONG2,   T_LONG3,   T_LONG4,   T_LONG8,   T_LONG16 },
  { T_FLOAT,  T_FLOAT2,  T_FLOAT3,  T_FLOAT4,  T_FLOAT8,  T_FLOAT16 },
  { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
};
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the abstraction layer value type for a LC kernel argument.
//! \param lcArg  LC metadata of the argument
//! \param size   Argument size in bytes; used to find the number of vector
//!               elements of the by-value arguments
//! \return T_POINTER/T_SAMPLER for the objects, the matching entry of
//!         ClkValueMapType for the by-value scalars and vectors, and T_VOID
//!         for everything that can't be classified
static inline clk_value_type_t GetOclTypeOCL(const KernelArgMD& lcArg, size_t size = 0) {
  if (lcArg.mValueKind != ValueKind::ByValue) {
    switch (lcArg.mValueKind) {
      case ValueKind::GlobalBuffer:
      case ValueKind::DynamicSharedPointer:
      case ValueKind::Pipe:
      case ValueKind::Image:
        return T_POINTER;
      case ValueKind::Sampler:
        return T_SAMPLER;
      default:
        return T_VOID;
    }
  }

  uint row;             // Row of ClkValueMapType
  size_t elementBytes;  // Size of a single element
  switch (lcArg.mValueType) {
    case ValueType::I8:
    case ValueType::U8:
      row = 0;
      elementBytes = 1;
      break;
    case ValueType::I16:
    case ValueType::U16:
      row = 1;
      elementBytes = 2;
      break;
    case ValueType::I32:
    case ValueType::U32:
      row = 2;
      elementBytes = 4;
      break;
    case ValueType::I64:
    case ValueType::U64:
      row = 3;
      elementBytes = 8;
      break;
    case ValueType::F16:
      // Half precision values share the float row
      row = 4;
      elementBytes = 2;
      break;
    case ValueType::F32:
      row = 4;
      elementBytes = 4;
      break;
    case ValueType::F64:
      row = 5;
      elementBytes = 8;
      break;
    case ValueType::Struct:
    default:
      return T_VOID;
  }

  // Only the OpenCL vector widths have an entry in the table
  const uint numElements = static_cast<uint>(size / elementBytes);
  uint column;
  switch (numElements) {
    case 1:
      column = 0;
      break;
    case 2:
      column = 1;
      break;
    case 3:
      column = 2;
      break;
    case 4:
      column = 3;
      break;
    case 8:
      column = 4;
      break;
    case 16:
      column = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[row][column];
}
|
||||
#else
|
||||
//! Returns the abstraction layer value type for an ACL argument.
//! \param argInfo  ACL argument data
//! \param size     Argument size in bytes; used to find the number of vector
//!                 elements of the value arguments
//! \return T_QUEUE/T_POINTER/T_SAMPLER for the objects, the matching entry of
//!         ClkValueMapType for the value scalars and vectors, and T_VOID
//!         for everything that can't be classified
static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t size = 0) {
  switch (argInfo->type) {
    case ARG_TYPE_QUEUE:
      return T_QUEUE;
    case ARG_TYPE_POINTER:
    case ARG_TYPE_IMAGE:
      return T_POINTER;
    case ARG_TYPE_SAMPLER:
      return T_SAMPLER;
    case ARG_TYPE_VALUE:
      break;
    default:
      return T_VOID;
  }

  uint row;             // Row of ClkValueMapType
  size_t elementBytes;  // Size of a single element
  switch (argInfo->arg.value.data) {
    case DATATYPE_i8:
    case DATATYPE_u8:
      row = 0;
      elementBytes = 1;
      break;
    case DATATYPE_i16:
    case DATATYPE_u16:
      row = 1;
      elementBytes = 2;
      break;
    case DATATYPE_i32:
    case DATATYPE_u32:
      row = 2;
      elementBytes = 4;
      break;
    case DATATYPE_i64:
    case DATATYPE_u64:
      row = 3;
      elementBytes = 8;
      break;
    case DATATYPE_f16:
      // Half precision values share the float row
      row = 4;
      elementBytes = 2;
      break;
    case DATATYPE_f32:
      row = 4;
      elementBytes = 4;
      break;
    case DATATYPE_f64:
      row = 5;
      elementBytes = 8;
      break;
    case DATATYPE_struct:
    case DATATYPE_opaque:
    case DATATYPE_ERROR:
    default:
      return T_VOID;
  }

  // Only the OpenCL vector widths have an entry in the table
  const uint numElements = static_cast<uint>(size / elementBytes);
  uint column;
  switch (numElements) {
    case 1:
      column = 0;
      break;
    case 2:
      column = 1;
      break;
    case 3:
      column = 2;
      break;
    case 4:
      column = 3;
      break;
    case 8:
      column = 4;
      break;
    case 16:
      column = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[row][column];
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the alignment of the argument as reported by the LC metadata
static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) {
  const size_t alignment = lcArg.mAlign;
  return alignment;
}
|
||||
#else
|
||||
static inline size_t GetArgAlignmentOCL(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
return 1;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8;
|
||||
case DATATYPE_struct:
|
||||
return 128;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
return sizeof(cl_mem);
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return sizeof(cl_sampler);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline size_t GetArgPointeeAlignmentOCL(const KernelArgMD& lcArg) {
|
||||
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
||||
uint32_t align = lcArg.mPointeeAlign;
|
||||
if (align == 0) {
|
||||
LogWarning("Missing DynamicSharedPointer alignment");
|
||||
align = 128; /* worst case alignment */
|
||||
}
|
||||
return align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
static inline size_t GetArgPointeeAlignmentOCL(const aclArgData* argInfo) {
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
return argInfo->arg.pointer.align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline bool GetReadOnlyOCL(const KernelArgMD& lcArg) {
|
||||
if ((lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image)) {
|
||||
switch (lcArg.mAccQual) {
|
||||
case AccessQualifier::ReadOnly:
|
||||
return true;
|
||||
case AccessQualifier::WriteOnly:
|
||||
case AccessQualifier::ReadWrite:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
static inline bool GetReadOnlyOCL(const aclArgData* argInfo) {
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
return (argInfo->arg.pointer.type == ACCESS_TYPE_RO) ? true : false;
|
||||
} else if (argInfo->type == ARG_TYPE_IMAGE) {
|
||||
return (argInfo->arg.image.type == ACCESS_TYPE_RO) ? true : false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the size of the argument as reported by the LC metadata
static inline int GetArgSizeOCL(const KernelArgMD& lcArg) {
  const int argSize = lcArg.mSize;
  return argSize;
}
|
||||
#else
|
||||
inline static int GetArgSizeOCL(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
case DATATYPE_struct:
|
||||
return 1 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
case ARG_TYPE_SAMPLER:
|
||||
case ARG_TYPE_QUEUE:
|
||||
return sizeof(void*);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Translates the LC argument kind / address space into the OpenCL address qualifier.
//! Anything that is not a local pointer, global buffer, image or pipe is private.
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const KernelArgMD& lcArg) {
  switch (lcArg.mValueKind) {
    case ValueKind::DynamicSharedPointer:
      return CL_KERNEL_ARG_ADDRESS_LOCAL;

    case ValueKind::GlobalBuffer:
      switch (lcArg.mAddrSpaceQual) {
        case AddressSpaceQualifier::Global:
          return CL_KERNEL_ARG_ADDRESS_GLOBAL;
        case AddressSpaceQualifier::Constant:
          return CL_KERNEL_ARG_ADDRESS_CONSTANT;
        default:
          // A buffer in any other address space is reported and treated as private
          LogError("Unsupported address type");
          return CL_KERNEL_ARG_ADDRESS_PRIVATE;
      }

    case ValueKind::Image:
    case ValueKind::Pipe:
      return CL_KERNEL_ARG_ADDRESS_GLOBAL;

    default:
      return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }
}
|
||||
#else
|
||||
//! Translates the ACL argument description into the OpenCL address qualifier.
static inline cl_kernel_arg_address_qualifier GetOclAddrQualOCL(const aclArgData* argInfo) {
  // Images and queues are always reported as global
  if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_QUEUE)) {
    return CL_KERNEL_ARG_ADDRESS_GLOBAL;
  }
  // Every other non-pointer argument is private
  if (argInfo->type != ARG_TYPE_POINTER) {
    return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }

  // Pointer: the qualifier follows the memory type it refers to
  switch (argInfo->arg.pointer.memory) {
    case PTR_MT_CONSTANT:
    case PTR_MT_CONSTANT_EMU:
    case PTR_MT_UAV_CONSTANT:
      return CL_KERNEL_ARG_ADDRESS_CONSTANT;
    case PTR_MT_GLOBAL:
    case PTR_MT_UAV:
    case PTR_MT_SCRATCH_EMU:
      return CL_KERNEL_ARG_ADDRESS_GLOBAL;
    case PTR_MT_LDS:
    case PTR_MT_LDS_EMU:
      return CL_KERNEL_ARG_ADDRESS_LOCAL;
    case PTR_MT_ERROR:
    default:
      LogError("Unsupported address type");
      return CL_KERNEL_ARG_ADDRESS_PRIVATE;
  }
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the OpenCL access qualifier of an LC argument.
//! Only images carry one; an image without an explicit RO/WO qualifier is read-write.
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const KernelArgMD& lcArg) {
  if (lcArg.mValueKind != ValueKind::Image) {
    return CL_KERNEL_ARG_ACCESS_NONE;
  }
  if (lcArg.mAccQual == AccessQualifier::ReadOnly) {
    return CL_KERNEL_ARG_ACCESS_READ_ONLY;
  }
  if (lcArg.mAccQual == AccessQualifier::WriteOnly) {
    return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
  }
  return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
|
||||
#else
|
||||
//! Returns the OpenCL access qualifier of an ACL argument.
//! Only images carry one; an image that is neither RO nor WO is read-write.
static inline cl_kernel_arg_access_qualifier GetOclAccessQualOCL(const aclArgData* argInfo) {
  if (argInfo->type != ARG_TYPE_IMAGE) {
    return CL_KERNEL_ARG_ACCESS_NONE;
  }
  if (argInfo->arg.image.type == ACCESS_TYPE_RO) {
    return CL_KERNEL_ARG_ACCESS_READ_ONLY;
  }
  if (argInfo->arg.image.type == ACCESS_TYPE_WO) {
    return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
  }
  return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Collects the OpenCL type qualifier bits (const/restrict/volatile/pipe) of an LC argument.
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const KernelArgMD& lcArg) {
  cl_kernel_arg_type_qualifier qualifiers = CL_KERNEL_ARG_TYPE_NONE;

  const bool isPointerArg = (lcArg.mValueKind == ValueKind::GlobalBuffer) ||
                            (lcArg.mValueKind == ValueKind::DynamicSharedPointer);
  if (!isPointerArg) {
    // The pipe bit is only looked at for arguments that are not buffers/local pointers
    if (lcArg.mIsPipe) {
      assert(lcArg.mValueKind == ValueKind::Pipe);
      qualifiers |= CL_KERNEL_ARG_TYPE_PIPE;
    }
    return qualifiers;
  }

  if (lcArg.mIsConst) {
    qualifiers |= CL_KERNEL_ARG_TYPE_CONST;
  }
  if (lcArg.mIsRestrict) {
    qualifiers |= CL_KERNEL_ARG_TYPE_RESTRICT;
  }
  if (lcArg.mIsVolatile) {
    qualifiers |= CL_KERNEL_ARG_TYPE_VOLATILE;
  }
  return qualifiers;
}
|
||||
#else
|
||||
//! Collects the OpenCL type qualifier bits of an ACL argument.
//! Only pointer arguments can carry qualifiers; all others return the NONE value.
static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* argInfo) {
  cl_kernel_arg_type_qualifier qualifiers = CL_KERNEL_ARG_TYPE_NONE;
  if (argInfo->type != ARG_TYPE_POINTER) {
    return qualifiers;
  }

  // A pointer into any flavour of constant memory counts as const, declared or not
  const bool inConstantMemory = (argInfo->arg.pointer.memory == PTR_MT_CONSTANT) ||
                                (argInfo->arg.pointer.memory == PTR_MT_UAV_CONSTANT) ||
                                (argInfo->arg.pointer.memory == PTR_MT_CONSTANT_EMU);
  if (argInfo->isConst || inConstantMemory) {
    qualifiers |= CL_KERNEL_ARG_TYPE_CONST;
  }
  if (argInfo->arg.pointer.isPipe) {
    qualifiers |= CL_KERNEL_ARG_TYPE_PIPE;
  }
  if (argInfo->arg.pointer.isRestrict) {
    qualifiers |= CL_KERNEL_ARG_TYPE_RESTRICT;
  }
  if (argInfo->arg.pointer.isVolatile) {
    qualifiers |= CL_KERNEL_ARG_TYPE_VOLATILE;
  }
  return qualifiers;
}
|
||||
#endif
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argsBufferSize();
|
||||
|
||||
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
|
||||
const KernelArgMD& lcArg = kernelMD.mArgs[i];
|
||||
|
||||
size_t size = GetArgSizeOCL(lcArg);
|
||||
size_t alignment = GetArgAlignmentOCL(lcArg);
|
||||
bool isHidden = false;
|
||||
desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = size;
|
||||
offset += size;
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = lcArg.mName.c_str();
|
||||
desc.type_ = GetOclTypeOCL(lcArg, size);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
|
||||
desc.addressQualifier_ = GetOclAddrQualOCL(lcArg);
|
||||
desc.accessQualifier_ = GetOclAccessQualOCL(lcArg);
|
||||
desc.typeQualifier_ = GetOclTypeQualOCL(lcArg);
|
||||
desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(lcArg);
|
||||
desc.size_ = size;
|
||||
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
}
|
||||
else {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = GetReadOnlyOCL(lcArg);
|
||||
|
||||
params.push_back(desc);
|
||||
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
flags_.imageEna_ = true;
|
||||
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
||||
flags_.imageWriteEna_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#else
|
||||
void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argsBufferSize();
|
||||
|
||||
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
||||
size_t size = GetArgSizeOCL(aclArg);
|
||||
size_t alignment = GetArgAlignmentOCL(aclArg);
|
||||
bool isHidden = false;
|
||||
desc.info_.oclObject_ = GetOclArgumentTypeOCL(aclArg, &isHidden);
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = size;
|
||||
offset += size;
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = aclArg->argStr;
|
||||
desc.typeName_ = aclArg->typeStr;
|
||||
desc.type_ = GetOclTypeOCL(aclArg, size);
|
||||
|
||||
desc.addressQualifier_ = GetOclAddrQualOCL(aclArg);
|
||||
desc.accessQualifier_ = GetOclAccessQualOCL(aclArg);
|
||||
desc.typeQualifier_ = GetOclTypeQualOCL(aclArg);
|
||||
desc.info_.arrayIndex_ = GetArgPointeeAlignmentOCL(aclArg);
|
||||
desc.size_ = size;
|
||||
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({ desc.offset_, offset });
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else {
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
}
|
||||
else {
|
||||
offset = amd::alignUp(offset, alignment);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
}
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = GetReadOnlyOCL(aclArg);
|
||||
|
||||
params.push_back(desc);
|
||||
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
flags_.imageEna_ = true;
|
||||
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
||||
flags_.imageWriteEna_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif
|
||||
|
||||
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
||||
if (!sym) {
|
||||
return false;
|
||||
@@ -796,7 +148,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
|
||||
(prog->isNull() ? 1
|
||||
: dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
|
||||
dev().hwInfo()->simdPerCU_) {
|
||||
hsa_ = true;
|
||||
flags_.hsa_ = true;
|
||||
}
|
||||
|
||||
HSAILKernel::~HSAILKernel() {
|
||||
@@ -849,7 +201,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
||||
return false;
|
||||
}
|
||||
// Set the argList
|
||||
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
||||
InitParameters(reinterpret_cast<const aclArgData*>(aclArgList), argsBufferSize());
|
||||
delete[] aclArgList;
|
||||
|
||||
size_t sizeOfWorkGroupSize;
|
||||
@@ -1324,7 +676,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
||||
}
|
||||
|
||||
// Set the argList
|
||||
initArgList(*kernelMD);
|
||||
InitParameters(*kernelMD, argsBufferSize());
|
||||
|
||||
if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
|
||||
const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
|
||||
|
||||
@@ -83,12 +83,6 @@ class HSAILKernel : public device::Kernel {
|
||||
//! Returns spill reg size per workitem
|
||||
int spillSegSize() const { return amd::alignUp(cpuAqlCode_->workitem_private_segment_byte_size, sizeof(uint32_t)); }
|
||||
|
||||
//! Returns TRUE if kernel uses dynamic parallelism
|
||||
bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; }
|
||||
|
||||
//! Returns TRUE if kernel is internal kernel
|
||||
bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
|
||||
|
||||
//! Finds local workgroup size
|
||||
void findLocalWorkSize(size_t workDim, //!< Work dimension
|
||||
const amd::NDRange& gblWorkSize, //!< Global work size
|
||||
@@ -124,8 +118,6 @@ class HSAILKernel : public device::Kernel {
|
||||
return waveLimiter_.getWavesPerSH(vdev);
|
||||
};
|
||||
|
||||
//! Read-only access to the patch table for references
const std::unordered_map<size_t, size_t>& patch() const {
  return patchReferences_;
}
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HSAILKernel(const HSAILKernel&);
|
||||
@@ -137,10 +129,6 @@ class HSAILKernel : public device::Kernel {
|
||||
//! Creates AQL kernel HW info
|
||||
bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);
|
||||
|
||||
//! Initializes the abstraction layer kernel parameters
|
||||
void initArgList(const aclArgData* aclArg //!< List of ACL arguments
|
||||
);
|
||||
|
||||
//! Initializes Hsail Printf metadata and info
|
||||
void initPrintf(const aclPrintfFmt* aclPrintf //!< List of ACL printfs
|
||||
);
|
||||
@@ -151,22 +139,10 @@ class HSAILKernel : public device::Kernel {
|
||||
const HSAILProgram& prog_; //!< Reference to the parent program
|
||||
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
|
||||
uint index_; //!< Kernel index in the program
|
||||
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
|
||||
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
union Flags {
|
||||
struct {
|
||||
uint imageEna_ : 1; //!< Kernel uses images
|
||||
uint imageWriteEna_ : 1; //!< Kernel uses image writes
|
||||
uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
|
||||
uint internalKernel_ : 1; //!< True: internal kernel
|
||||
};
|
||||
uint value_;
|
||||
Flags() : value_(0) {}
|
||||
} flags_;
|
||||
|
||||
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
|
||||
};
|
||||
|
||||
@@ -182,9 +158,6 @@ class LightningKernel : public HSAILKernel {
|
||||
//! Initializes the metadata required for this kernel,
|
||||
bool init(amd::hsa::loader::Symbol* symbol);
|
||||
|
||||
//! Initializes Hsail Argument metadata and info for LC
|
||||
void initArgList(const KernelMD& kernelMD);
|
||||
|
||||
//! Initializes HSAIL Printf metadata and info for LC
|
||||
void initPrintf(const std::vector<std::string>& printfInfoStrings);
|
||||
};
|
||||
|
||||
@@ -11,749 +11,6 @@
|
||||
|
||||
namespace roc {
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
using llvm::AMDGPU::HSAMD::AccessQualifier;
|
||||
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
||||
using llvm::AMDGPU::HSAMD::ValueKind;
|
||||
using llvm::AMDGPU::HSAMD::ValueType;
|
||||
|
||||
//! Maps the LC metadata value kind onto the ROC argument type.
//! Kinds that are not listed map to ROC_ARGTYPE_ERROR.
static inline ROC_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) {
  ROC_ARG_TYPE argType = ROC_ARGTYPE_ERROR;
  switch (lcArg.mValueKind) {
    // Every pointer-like kind shares one ROC type
    case ValueKind::GlobalBuffer:
    case ValueKind::DynamicSharedPointer:
    case ValueKind::Pipe:
      argType = ROC_ARGTYPE_POINTER;
      break;
    case ValueKind::ByValue:
      argType = ROC_ARGTYPE_VALUE;
      break;
    case ValueKind::Image:
      argType = ROC_ARGTYPE_IMAGE;
      break;
    case ValueKind::Sampler:
      argType = ROC_ARGTYPE_SAMPLER;
      break;
    case ValueKind::Queue:
      argType = ROC_ARGTYPE_QUEUE;
      break;
    // Hidden arguments
    case ValueKind::HiddenGlobalOffsetX:
      argType = ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
      break;
    case ValueKind::HiddenGlobalOffsetY:
      argType = ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
      break;
    case ValueKind::HiddenGlobalOffsetZ:
      argType = ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
      break;
    case ValueKind::HiddenPrintfBuffer:
      argType = ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER;
      break;
    case ValueKind::HiddenDefaultQueue:
      argType = ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
      break;
    case ValueKind::HiddenCompletionAction:
      argType = ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION;
      break;
    case ValueKind::HiddenNone:
      argType = ROC_ARGTYPE_HIDDEN_NONE;
      break;
    default:
      break;
  }
  return argType;
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
//! Derives the ROC argument type from an ACL argument.
//! Names starting with "_." denote hidden arguments and are classified by the
//! rest of the name; all other arguments are classified by their ACL type.
static inline ROC_ARG_TYPE GetKernelArgType(const aclArgData* argInfo) {
  const char* argName = argInfo->argStr;
  if (argName[0] == '_' && argName[1] == '.') {
    struct HiddenArgName {
      const char* suffix_;
      ROC_ARG_TYPE type_;
    };
    static const HiddenArgName HiddenArgNames[] = {
        {"global_offset_0", ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X},
        {"global_offset_1", ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y},
        {"global_offset_2", ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z},
        {"printf_buffer", ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER},
        {"vqueue_pointer", ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE},
        {"aqlwrap_pointer", ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION},
    };
    for (const HiddenArgName& hidden : HiddenArgNames) {
      if (strcmp(&argName[2], hidden.suffix_) == 0) {
        return hidden.type_;
      }
    }
    // Hidden prefix with a name that is not in the table
    return ROC_ARGTYPE_HIDDEN_NONE;
  }

  ROC_ARG_TYPE argType = ROC_ARGTYPE_ERROR;
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
      argType = ROC_ARGTYPE_POINTER;
      break;
    case ARG_TYPE_VALUE:
      // Struct values are classified as references, everything else as plain values
      if (argInfo->arg.value.data == DATATYPE_struct) {
        argType = ROC_ARGTYPE_REFERENCE;
      } else {
        argType = ROC_ARGTYPE_VALUE;
      }
      break;
    case ARG_TYPE_IMAGE:
      argType = ROC_ARGTYPE_IMAGE;
      break;
    case ARG_TYPE_SAMPLER:
      argType = ROC_ARGTYPE_SAMPLER;
      break;
    case ARG_TYPE_QUEUE:
      argType = ROC_ARGTYPE_QUEUE;
      break;
    case ARG_TYPE_ERROR:
    default:
      break;
  }
  return argType;
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Returns the argument alignment recorded in the Lightning Compiler metadata.
static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) {
  const size_t argAlignment = lcArg.mAlign;
  return argAlignment;
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
static inline size_t GetKernelArgAlignment(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
return 1;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8;
|
||||
case DATATYPE_struct:
|
||||
return 128;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
return sizeof(cl_mem);
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return sizeof(cl_sampler);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) {
|
||||
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
||||
uint32_t align = lcArg.mPointeeAlign;
|
||||
if (align == 0) {
|
||||
LogWarning("Missing DynamicSharedPointer alignment");
|
||||
align = 128; /* worst case alignment */
|
||||
;
|
||||
}
|
||||
return align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
static inline size_t GetKernelArgPointeeAlignment(const aclArgData* argInfo) {
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
return argInfo->arg.pointer.align;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps the LC access qualifier of a global buffer or image onto the ROC access type.
//! Other argument kinds have no access type; an unqualified buffer/image is read-write.
static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) {
  const bool hasAccessType =
      (lcArg.mValueKind == ValueKind::GlobalBuffer) || (lcArg.mValueKind == ValueKind::Image);
  if (!hasAccessType) {
    return ROC_ACCESS_TYPE_NONE;
  }
  if (lcArg.mAccQual == AccessQualifier::ReadOnly) {
    return ROC_ACCESS_TYPE_RO;
  }
  if (lcArg.mAccQual == AccessQualifier::WriteOnly) {
    return ROC_ACCESS_TYPE_WO;
  }
  return ROC_ACCESS_TYPE_RW;
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
//! Maps the ACL access type of a pointer or image onto the ROC access type.
//! Other argument kinds have no access type; anything that is neither RO nor WO is RW.
static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const aclArgData* argInfo) {
  aclAccessType aclAccess;
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
      aclAccess = argInfo->arg.pointer.type;
      break;
    case ARG_TYPE_IMAGE:
      aclAccess = argInfo->arg.image.type;
      break;
    default:
      return ROC_ACCESS_TYPE_NONE;
  }

  switch (aclAccess) {
    case ACCESS_TYPE_RO:
      return ROC_ACCESS_TYPE_RO;
    case ACCESS_TYPE_WO:
      return ROC_ACCESS_TYPE_WO;
    default:
      return ROC_ACCESS_TYPE_RW;
  }
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps the LC value kind / address space onto the ROC address qualifier.
static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) {
  switch (lcArg.mValueKind) {
    case ValueKind::DynamicSharedPointer:
      return ROC_ADDRESS_LOCAL;
    case ValueKind::Image:
    case ValueKind::Sampler:
    case ValueKind::Pipe:
      return ROC_ADDRESS_GLOBAL;
    case ValueKind::GlobalBuffer:
      break;  // decided by the address space below
    default:
      return ROC_ADDRESS_ERROR;
  }

  switch (lcArg.mAddrSpaceQual) {
    case AddressSpaceQualifier::Global:
    case AddressSpaceQualifier::Generic:
      return ROC_ADDRESS_GLOBAL;
    case AddressSpaceQualifier::Constant:
      return ROC_ADDRESS_CONSTANT;
    default:
      break;
  }
  LogError("Unsupported address type");
  return ROC_ADDRESS_ERROR;
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
//! Maps an ACL argument onto the ROC address qualifier.
static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo) {
  // Images and samplers are always global
  if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
    return ROC_ADDRESS_GLOBAL;
  }
  if (argInfo->type != ARG_TYPE_POINTER) {
    return ROC_ADDRESS_ERROR;
  }

  // Pointer: the qualifier follows the memory type it refers to
  switch (argInfo->arg.pointer.memory) {
    case PTR_MT_CONSTANT:
    case PTR_MT_UAV_CONSTANT:
    case PTR_MT_CONSTANT_EMU:
      return ROC_ADDRESS_CONSTANT;
    case PTR_MT_GLOBAL:
    case PTR_MT_UAV:
      return ROC_ADDRESS_GLOBAL;
    case PTR_MT_LDS:
    case PTR_MT_LDS_EMU:
      return ROC_ADDRESS_LOCAL;
    case PTR_MT_ERROR:
    default:
      LogError("Unsupported address type");
      return ROC_ADDRESS_ERROR;
  }
}
|
||||
|
||||
//! Converts the ROC argument type into the amd::KernelParameterDescriptor object kind.
//! Types without a dedicated mapping are reported as HiddenNone.
inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
  uint32_t oclObject = amd::KernelParameterDescriptor::HiddenNone;
  switch (arg->type_) {
    // Application-visible kinds
    case ROC_ARGTYPE_POINTER:
      oclObject = amd::KernelParameterDescriptor::MemoryObject;
      break;
    case ROC_ARGTYPE_IMAGE:
      oclObject = amd::KernelParameterDescriptor::ImageObject;
      break;
    case ROC_ARGTYPE_REFERENCE:
      oclObject = amd::KernelParameterDescriptor::ReferenceObject;
      break;
    case ROC_ARGTYPE_VALUE:
      oclObject = amd::KernelParameterDescriptor::ValueObject;
      break;
    case ROC_ARGTYPE_SAMPLER:
      oclObject = amd::KernelParameterDescriptor::SamplerObject;
      break;
    case ROC_ARGTYPE_QUEUE:
      oclObject = amd::KernelParameterDescriptor::QueueObject;
      break;
    // Hidden kinds
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
      break;
    case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
      oclObject = amd::KernelParameterDescriptor::HiddenPrintfBuffer;
      break;
    case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
      oclObject = amd::KernelParameterDescriptor::HiddenDefaultQueue;
      break;
    case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
      oclObject = amd::KernelParameterDescriptor::HiddenCompletionAction;
      break;
    default:
      break;
  }
  return oclObject;
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Maps the LC value type of a by-value argument onto the ROC data type.
//! Arguments that are not passed by value, and value types without a mapping,
//! yield ROC_DATATYPE_ERROR.
static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
  // Only by-value arguments are given a data type
  if (lcArg.mValueKind != ValueKind::ByValue) {
    return ROC_DATATYPE_ERROR;
  }

  switch (lcArg.mValueType) {
    case ValueType::I8:
      return ROC_DATATYPE_S8;
    case ValueType::I16:
      return ROC_DATATYPE_S16;
    case ValueType::I32:
      return ROC_DATATYPE_S32;
    case ValueType::I64:
      return ROC_DATATYPE_S64;
    case ValueType::U8:
      return ROC_DATATYPE_U8;
    case ValueType::U16:
      return ROC_DATATYPE_U16;
    case ValueType::U32:
      return ROC_DATATYPE_U32;
    case ValueType::U64:
      return ROC_DATATYPE_U64;
    case ValueType::F16:
      return ROC_DATATYPE_F16;
    case ValueType::F32:
      return ROC_DATATYPE_F32;
    case ValueType::F64:
      return ROC_DATATYPE_F64;
    case ValueType::Struct:
      return ROC_DATATYPE_STRUCT;
    default:
      return ROC_DATATYPE_ERROR;
  }
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
/* f16 returns f32 - workaround due to comp lib */
|
||||
static inline ROC_DATA_TYPE GetKernelDataType(const aclArgData* argInfo) {
|
||||
aclArgDataType dataType;
|
||||
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
dataType = argInfo->arg.pointer.data;
|
||||
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
||||
dataType = argInfo->arg.value.data;
|
||||
} else {
|
||||
return ROC_DATATYPE_ERROR;
|
||||
}
|
||||
switch (dataType) {
|
||||
case DATATYPE_i1:
|
||||
return ROC_DATATYPE_B1;
|
||||
case DATATYPE_i8:
|
||||
return ROC_DATATYPE_S8;
|
||||
case DATATYPE_i16:
|
||||
return ROC_DATATYPE_S16;
|
||||
case DATATYPE_i32:
|
||||
return ROC_DATATYPE_S32;
|
||||
case DATATYPE_i64:
|
||||
return ROC_DATATYPE_S64;
|
||||
case DATATYPE_u8:
|
||||
return ROC_DATATYPE_U8;
|
||||
case DATATYPE_u16:
|
||||
return ROC_DATATYPE_U16;
|
||||
case DATATYPE_u32:
|
||||
return ROC_DATATYPE_U32;
|
||||
case DATATYPE_u64:
|
||||
return ROC_DATATYPE_U64;
|
||||
case DATATYPE_f16:
|
||||
return ROC_DATATYPE_F32;
|
||||
case DATATYPE_f32:
|
||||
return ROC_DATATYPE_F32;
|
||||
case DATATYPE_f64:
|
||||
return ROC_DATATYPE_F64;
|
||||
case DATATYPE_struct:
|
||||
return ROC_DATATYPE_STRUCT;
|
||||
case DATATYPE_opaque:
|
||||
return ROC_DATATYPE_OPAQUE;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return ROC_DATATYPE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int GetKernelArgSize(const aclArgData* argInfo) {
|
||||
switch (argInfo->type) {
|
||||
case ARG_TYPE_POINTER:
|
||||
return sizeof(void*);
|
||||
case ARG_TYPE_VALUE:
|
||||
switch (argInfo->arg.value.data) {
|
||||
case DATATYPE_i8:
|
||||
case DATATYPE_u8:
|
||||
case DATATYPE_struct:
|
||||
return 1 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u16:
|
||||
case DATATYPE_i16:
|
||||
case DATATYPE_f16:
|
||||
return 2 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_u32:
|
||||
case DATATYPE_i32:
|
||||
case DATATYPE_f32:
|
||||
return 4 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_i64:
|
||||
case DATATYPE_u64:
|
||||
case DATATYPE_f64:
|
||||
return 8 * argInfo->arg.value.numElements;
|
||||
case DATATYPE_ERROR:
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
case ARG_TYPE_IMAGE:
|
||||
return sizeof(cl_mem);
|
||||
case ARG_TYPE_SAMPLER:
|
||||
return sizeof(cl_sampler);
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
//! Determines the clk value type of a kernel argument.
//! Pointers/images, samplers and queues map to their dedicated types. Value and
//! reference arguments are looked up by element type and element count
//! (1, 2, 3, 4, 8 or 16); everything that cannot be mapped is T_VOID.
static inline clk_value_type_t GetOclType(const Kernel::Argument* arg) {
  // Rows: char, short, int, long, float, double. Columns: vector width 1,2,3,4,8,16
  static const clk_value_type_t ClkValueMapType[6][6] = {
      {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
      {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
      {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
      {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
      {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
      {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
  };

  switch (arg->type_) {
    case ROC_ARGTYPE_POINTER:
    case ROC_ARGTYPE_IMAGE:
      return T_POINTER;
    case ROC_ARGTYPE_SAMPLER:
      return T_SAMPLER;
    case ROC_ARGTYPE_QUEUE:
      return T_QUEUE;
    case ROC_ARGTYPE_VALUE:
    case ROC_ARGTYPE_REFERENCE:
      break;  // resolved through the table below
    default:
      return T_VOID;
  }

  // Pick the table row and derive the element count from the total size
  uint row;
  uint elementCount;
  switch (arg->dataType_) {
    case ROC_DATATYPE_S8:
    case ROC_DATATYPE_U8:
      row = 0;
      elementCount = arg->size_;
      break;
    case ROC_DATATYPE_S16:
    case ROC_DATATYPE_U16:
      row = 1;
      elementCount = arg->size_ / 2;
      break;
    case ROC_DATATYPE_S32:
    case ROC_DATATYPE_U32:
      row = 2;
      elementCount = arg->size_ / 4;
      break;
    case ROC_DATATYPE_S64:
    case ROC_DATATYPE_U64:
      row = 3;
      elementCount = arg->size_ / 8;
      break;
    case ROC_DATATYPE_F16:
      // Half elements are two bytes wide but use the float row of the table
      row = 4;
      elementCount = arg->size_ / 2;
      break;
    case ROC_DATATYPE_F32:
      row = 4;
      elementCount = arg->size_ / 4;
      break;
    case ROC_DATATYPE_F64:
      row = 5;
      elementCount = arg->size_ / 8;
      break;
    default:
      return T_VOID;
  }

  // Translate the element count into the table column
  uint column;
  switch (elementCount) {
    case 1:
      column = 0;
      break;
    case 2:
      column = 1;
      break;
    case 3:
      column = 2;
      break;
    case 4:
      column = 3;
      break;
    case 8:
      column = 4;
      break;
    case 16:
      column = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[row][column];
}
|
||||
|
||||
//! Converts the ROC address qualifier of an argument into the OpenCL one.
//! Images are global, pointers follow their qualifier, everything else is private.
static inline cl_kernel_arg_address_qualifier GetOclAddrQual(const Kernel::Argument* arg) {
  if (arg->type_ == ROC_ARGTYPE_IMAGE) {
    return CL_KERNEL_ARG_ADDRESS_GLOBAL;
  }
  if (arg->type_ == ROC_ARGTYPE_POINTER) {
    if (arg->addrQual_ == ROC_ADDRESS_GLOBAL) {
      return CL_KERNEL_ARG_ADDRESS_GLOBAL;
    }
    if (arg->addrQual_ == ROC_ADDRESS_CONSTANT) {
      return CL_KERNEL_ARG_ADDRESS_CONSTANT;
    }
    if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
      return CL_KERNEL_ARG_ADDRESS_LOCAL;
    }
  }
  // default for all other cases, including pointers with any other qualifier
  return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
|
||||
|
||||
//! Converts the ROC access type of an image argument into the OpenCL access qualifier.
//! Non-image arguments and images with any other access type report NONE.
static inline cl_kernel_arg_access_qualifier GetOclAccessQual(const Kernel::Argument* arg) {
  if (arg->type_ != ROC_ARGTYPE_IMAGE) {
    return CL_KERNEL_ARG_ACCESS_NONE;
  }
  if (arg->access_ == ROC_ACCESS_TYPE_RO) {
    return CL_KERNEL_ARG_ACCESS_READ_ONLY;
  }
  if (arg->access_ == ROC_ACCESS_TYPE_WO) {
    return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
  }
  if (arg->access_ == ROC_ACCESS_TYPE_RW) {
    return CL_KERNEL_ARG_ACCESS_READ_WRITE;
  }
  return CL_KERNEL_ARG_ACCESS_NONE;
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
//! Collects the OpenCL type qualifier bits (const/restrict/volatile/pipe) of an LC argument.
static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) {
  cl_kernel_arg_type_qualifier typeQual = CL_KERNEL_ARG_TYPE_NONE;

  switch (lcArg.mValueKind) {
    case ValueKind::GlobalBuffer:
    case ValueKind::DynamicSharedPointer:
      // Pointer arguments: gather the declared cv/restrict qualifiers
      if (lcArg.mIsVolatile) {
        typeQual |= CL_KERNEL_ARG_TYPE_VOLATILE;
      }
      if (lcArg.mIsRestrict) {
        typeQual |= CL_KERNEL_ARG_TYPE_RESTRICT;
      }
      if (lcArg.mIsConst) {
        typeQual |= CL_KERNEL_ARG_TYPE_CONST;
      }
      break;
    default:
      // The pipe bit is only looked at for arguments that are not buffers/local pointers
      if (lcArg.mIsPipe) {
        assert(lcArg.mValueKind == ValueKind::Pipe);
        typeQual |= CL_KERNEL_ARG_TYPE_PIPE;
      }
      break;
  }
  return typeQual;
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
//! Builds the OpenCL type qualifier bit mask from compiler-library argument
//! data. Only pointer arguments carry qualifiers; the const bit is set either
//! by the argument's own const flag or by a constant memory type.
static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
  if (argInfo->type != ARG_TYPE_POINTER) {
    return CL_KERNEL_ARG_TYPE_NONE;
  }

  cl_kernel_arg_type_qualifier quals = CL_KERNEL_ARG_TYPE_NONE;
  if (argInfo->arg.pointer.isVolatile) {
    quals |= CL_KERNEL_ARG_TYPE_VOLATILE;
  }
  if (argInfo->arg.pointer.isRestrict) {
    quals |= CL_KERNEL_ARG_TYPE_RESTRICT;
  }
  if (argInfo->isConst) {
    quals |= CL_KERNEL_ARG_TYPE_CONST;
  }

  // Pointers into any of the constant memory types are const as well
  const bool inConstantMemory = (argInfo->arg.pointer.memory == PTR_MT_CONSTANT) ||
      (argInfo->arg.pointer.memory == PTR_MT_UAV_CONSTANT) ||
      (argInfo->arg.pointer.memory == PTR_MT_CONSTANT_EMU);
  if (inConstantMemory) {
    quals |= CL_KERNEL_ARG_TYPE_CONST;
  }
  return quals;
}
|
||||
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
size_t offsetStruct = KernargSegmentByteSize();
|
||||
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
|
||||
// Initialize HSAIL kernel argument
|
||||
Kernel::Argument* arg = new Kernel::Argument;
|
||||
arg->name_ = aclArg->argStr;
|
||||
arg->typeName_ = aclArg->typeStr;
|
||||
arg->size_ = GetKernelArgSize(aclArg);
|
||||
arg->type_ = GetKernelArgType(aclArg);
|
||||
arg->addrQual_ = GetKernelAddrQual(aclArg);
|
||||
arg->dataType_ = GetKernelDataType(aclArg);
|
||||
arg->alignment_ = GetKernelArgAlignment(aclArg);
|
||||
arg->access_ = GetKernelArgAccessType(aclArg);
|
||||
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(aclArg);
|
||||
|
||||
bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE;
|
||||
|
||||
arg->index_ = isHidden ? uint(-1) : params.size();
|
||||
hsailArgList_.push_back(arg);
|
||||
|
||||
amd::KernelParameterDescriptor desc;
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arg->size_;
|
||||
offset += arg->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = arg->name_.c_str();
|
||||
desc.type_ = GetOclType(arg);
|
||||
desc.addressQualifier_ = GetOclAddrQual(arg);
|
||||
desc.accessQualifier_ = GetOclAccessQual(arg);
|
||||
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
||||
desc.typeName_ = arg->typeName_.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
flags_.imageEnable_ = true;
|
||||
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
|
||||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
|
||||
flags_.imageWrite_ = true;
|
||||
}
|
||||
}
|
||||
desc.size_ = arg->size_;
|
||||
|
||||
// Make offset alignment to match CPU metadata, since
|
||||
// in multidevice config abstraction layer has a single signature
|
||||
// and CPU sends the parameters as they are allocated in memory
|
||||
size_t size = desc.size_;
|
||||
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({desc.offset_, offset});
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
// These objects have forced data size to uint64_t
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
size_t offsetStruct = KernargSegmentByteSize();
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
|
||||
const KernelArgMD& lcArg = kernelMD.mArgs[i];
|
||||
|
||||
// Initialize HSAIL kernel argument
|
||||
Kernel::Argument* arg = new Kernel::Argument;
|
||||
arg->name_ = lcArg.mName;
|
||||
arg->typeName_ = lcArg.mTypeName;
|
||||
arg->size_ = lcArg.mSize;
|
||||
arg->type_ = GetKernelArgType(lcArg);
|
||||
arg->addrQual_ = GetKernelAddrQual(lcArg);
|
||||
arg->dataType_ = GetKernelDataType(lcArg);
|
||||
arg->alignment_ = GetKernelArgAlignment(lcArg);
|
||||
arg->access_ = GetKernelArgAccessType(lcArg);
|
||||
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
|
||||
|
||||
bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
||||
arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE;
|
||||
|
||||
arg->index_ = isHidden ? uint(-1) : params.size();
|
||||
hsailArgList_.push_back(arg);
|
||||
|
||||
// Initialize Device kernel parameters
|
||||
amd::KernelParameterDescriptor desc;
|
||||
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arg->size_;
|
||||
offset += arg->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = lcArg.mName.c_str();
|
||||
desc.type_ = GetOclType(arg);
|
||||
desc.addressQualifier_ = GetOclAddrQual(arg);
|
||||
desc.accessQualifier_ = GetOclAccessQual(arg);
|
||||
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
flags_.imageEnable_ = true;
|
||||
if (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_WRITE_ONLY ||
|
||||
desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_WRITE) {
|
||||
flags_.imageWrite_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
desc.size_ = arg->size_;
|
||||
|
||||
// Make offset alignment to match CPU metadata, since
|
||||
// in multidevice config abstraction layer has a single signature
|
||||
// and CPU sends the parameters as they are allocated in memory
|
||||
size_t size = desc.size_;
|
||||
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({desc.offset_, offset});
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
// These objects have forced data size to uint64_t
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
@@ -787,7 +44,7 @@ bool LightningKernel::init() {
|
||||
if (kernelMD == nullptr) {
|
||||
return false;
|
||||
}
|
||||
initArguments(*kernelMD);
|
||||
InitParameters(*kernelMD, KernargSegmentByteSize());
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
|
||||
@@ -907,7 +164,7 @@ bool HSAILKernel::init() {
|
||||
}
|
||||
|
||||
// Set the argList
|
||||
initArguments((const aclArgData*)argList.get());
|
||||
InitParameters((const aclArgData*)argList.get(), KernargSegmentByteSize());
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
|
||||
@@ -1151,11 +408,6 @@ void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) {
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
|
||||
//! Releases every Argument owned by hsailArgList_, last element first
Kernel::~Kernel() {
  for (auto it = hsailArgList_.rbegin(); it != hsailArgList_.rend(); ++it) {
    delete *it;
  }
  hsailArgList_.clear();
}
|
||||
|
||||
} // namespace roc
|
||||
|
||||
@@ -15,77 +15,8 @@ namespace roc {
|
||||
|
||||
#define MAX_INFO_STRING_LEN 0x40
|
||||
|
||||
//! Kind of a kernel argument. The ROC_ARGTYPE_HIDDEN_* values are the ones the
//! argument setup treats as hidden arguments.
enum ROC_ARG_TYPE {
  ROC_ARGTYPE_ERROR = 0,
  ROC_ARGTYPE_POINTER = 1,
  ROC_ARGTYPE_VALUE = 2,
  ROC_ARGTYPE_REFERENCE = 3,
  ROC_ARGTYPE_IMAGE = 4,
  ROC_ARGTYPE_SAMPLER = 5,
  ROC_ARGTYPE_QUEUE = 6,
  ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X = 7,
  ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y = 8,
  ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z = 9,
  ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER = 10,
  ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE = 11,
  ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION = 12,
  ROC_ARGTYPE_HIDDEN_NONE = 13,
  ROC_ARGMAX_ARG_TYPES = 14  //!< Number of argument types, keep last
};
|
||||
|
||||
//! Address space of a pointer kernel argument
enum ROC_ADDRESS_QUALIFIER {
  ROC_ADDRESS_ERROR = 0,
  ROC_ADDRESS_GLOBAL = 1,
  ROC_ADDRESS_CONSTANT = 2,
  ROC_ADDRESS_LOCAL = 3,
  ROC_MAX_ADDRESS_QUALIFIERS = 4  //!< Number of address qualifiers, keep last
};
|
||||
|
||||
//! Data type of a kernel argument
enum ROC_DATA_TYPE {
  ROC_DATATYPE_ERROR = 0,
  ROC_DATATYPE_B1 = 1,
  ROC_DATATYPE_B8 = 2,
  ROC_DATATYPE_B16 = 3,
  ROC_DATATYPE_B32 = 4,
  ROC_DATATYPE_B64 = 5,
  ROC_DATATYPE_S8 = 6,
  ROC_DATATYPE_S16 = 7,
  ROC_DATATYPE_S32 = 8,
  ROC_DATATYPE_S64 = 9,
  ROC_DATATYPE_U8 = 10,
  ROC_DATATYPE_U16 = 11,
  ROC_DATATYPE_U32 = 12,
  ROC_DATATYPE_U64 = 13,
  ROC_DATATYPE_F16 = 14,
  ROC_DATATYPE_F32 = 15,
  ROC_DATATYPE_F64 = 16,
  ROC_DATATYPE_STRUCT = 17,
  ROC_DATATYPE_OPAQUE = 18,
  ROC_DATATYPE_MAX_TYPES = 19  //!< Number of data types, keep last
};
|
||||
|
||||
//! Access type of a kernel argument (none, read-only, write-only, read-write)
enum ROC_ACCESS_TYPE {
  ROC_ACCESS_TYPE_NONE = 0,
  ROC_ACCESS_TYPE_RO = 1,
  ROC_ACCESS_TYPE_WO = 2,
  ROC_ACCESS_TYPE_RW = 3
};
|
||||
|
||||
class Kernel : public device::Kernel {
|
||||
public:
|
||||
struct Argument {
|
||||
uint index_; //!< Argument's index in the OCL signature
|
||||
std::string name_; //!< Argument's name
|
||||
std::string typeName_; //!< Argument's type name
|
||||
uint size_; //!< Size in bytes
|
||||
uint alignment_; //!< Argument's alignment
|
||||
uint pointeeAlignment_; //!< Alignment of the data pointed to
|
||||
ROC_ARG_TYPE type_; //!< Type of the argument
|
||||
ROC_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
|
||||
ROC_DATA_TYPE dataType_; //!< The type of data
|
||||
ROC_ACCESS_TYPE access_; //!< Access type for the argument
|
||||
};
|
||||
|
||||
Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
@@ -97,7 +28,7 @@ class Kernel : public device::Kernel {
|
||||
|
||||
const uint32_t workitemPrivateSegmentByteSize() const { return workitemPrivateSegmentByteSize_; }
|
||||
|
||||
const uint64_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; }
|
||||
const uint32_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; }
|
||||
|
||||
const uint8_t KernargSegmentAlignment() const { return kernargSegmentAlignment_; }
|
||||
|
||||
@@ -108,63 +39,18 @@ class Kernel : public device::Kernel {
|
||||
|
||||
const Program* program() const { return static_cast<const Program*>(program_); }
|
||||
|
||||
//! Returns the kernel argument list
|
||||
const std::vector<Argument*>& hsailArgs() const { return hsailArgList_; }
|
||||
|
||||
//! Returns a pointer to the hsail argument at the specified index
|
||||
Argument* hsailArgAt(size_t index) const {
  // Linear search for the argument whose OCL signature index matches
  const size_t count = hsailArgList_.size();
  for (size_t i = 0; i < count; ++i) {
    Argument* candidate = hsailArgList_[i];
    if (candidate->index_ == index) {
      return candidate;
    }
  }
  // No argument carries the requested index
  assert(!"Should not reach here");
  return nullptr;
}
|
||||
|
||||
//! Return printf info array
|
||||
const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
|
||||
|
||||
//! Returns TRUE if kernel uses dynamic parallelism
|
||||
bool dynamicParallelism() const { return flags_.dynamicParallelism_ != 0; }
|
||||
|
||||
//! set dynamic parallelism flag
|
||||
void setDynamicParallelFlag(bool flag) { flags_.dynamicParallelism_ = flag ? 1 : 0; }
|
||||
|
||||
//! Return TRUE if kernel is internal blit kernel
|
||||
bool isInternalKernel() const { return flags_.internalKernel_ != 0; }
|
||||
|
||||
//! set internal kernel flag
|
||||
void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag ? 1 : 0; }
|
||||
|
||||
//! Return TRUE if kernel uses images
|
||||
bool imageEnable() const { return flags_.imageEnable_ != 0; }
|
||||
|
||||
//! Return TRUE if kernel writes images
|
||||
bool imageWrite() const { return flags_.imageWrite_ != 0; }
|
||||
|
||||
const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
|
||||
|
||||
protected:
|
||||
union Flags {
|
||||
struct {
|
||||
uint internalKernel_ : 1; //!< Is a blit kernel?
|
||||
uint imageEnable_ : 1; //!< Kernel uses images
|
||||
uint imageWrite_ : 1; //!< Kernel writes images
|
||||
uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled
|
||||
};
|
||||
uint value_;
|
||||
Flags() : value_(0) {}
|
||||
} flags_;
|
||||
|
||||
|
||||
Program* program_; //!< The roc::Program context
|
||||
std::vector<Argument*> hsailArgList_; //!< Vector list of HSAIL Arguments
|
||||
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
|
||||
Program* program_; //!< The roc::Program context
|
||||
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
|
||||
const uint32_t workgroupGroupSegmentByteSize_;
|
||||
const uint32_t workitemPrivateSegmentByteSize_;
|
||||
const uint32_t kernargSegmentByteSize_;
|
||||
const uint32_t kernargSegmentAlignment_;
|
||||
size_t kernelDirectiveOffset_;
|
||||
std::vector<PrintfInfo> printf_;
|
||||
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
|
||||
};
|
||||
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
@@ -183,9 +69,6 @@ class HSAILKernel : public roc::Kernel {
|
||||
virtual bool init() final;
|
||||
|
||||
private:
|
||||
//! Populates hsailArgList_
|
||||
void initArguments(const aclArgData* aclArg);
|
||||
|
||||
//! Initializes HSAIL Printf metadata and info
|
||||
void initPrintf(const aclPrintfFmt* aclPrintf);
|
||||
};
|
||||
@@ -206,9 +89,6 @@ class LightningKernel : public roc::Kernel {
|
||||
virtual bool init() final;
|
||||
|
||||
private:
|
||||
//! Initializes Hsail Argument metadata and info for LC
|
||||
void initArguments(const KernelMD& kernelMD);
|
||||
|
||||
//! Initializes HSAIL Printf metadata and info for LC
|
||||
void initPrintf(const std::vector<std::string>& printfInfoStrings);
|
||||
};
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user