P4 to Git Change 1567428 by gandryey@gera-w8 on 2018/06/12 18:39:23

SWDEV-79445 - OCL generic changes and code clean-up
	- Optimize setup of kernel arguments. Stage 2.
	- Add HW ABI support in the abstraction layer
	- Remove the argument-parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and patching of arguments.

	http://ocltc.amd.com/reviews/r/15122/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#221 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#307 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#325 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#95 edit


[ROCm/clr commit: 1be400ff01]
This commit is contained in:
foreman
2018-06-12 18:57:20 -04:00
parent 028339d9be
commit 665aab7ca4
13 changed files with 430 additions and 320 deletions
@@ -600,7 +600,9 @@ Settings::Settings() {
//!< concurrent Virtual GPUs for default
}
bool Kernel::createSignature(const parameters_t& params) {
bool Kernel::createSignature(
const parameters_t& params, const parameters_t& hiddenParams,
uint32_t version) {
std::stringstream attribs;
if (workGroupInfo_.compileSize_[0] != 0) {
attribs << "reqd_work_group_size(";
@@ -632,7 +634,7 @@ bool Kernel::createSignature(const parameters_t& params) {
// Destroy old signature if it was allocated before
// (offline devices path)
delete signature_;
signature_ = new amd::KernelSignature(params, attribs.str());
signature_ = new amd::KernelSignature(params, attribs.str(), hiddenParams, version);
if (NULL != signature_) {
return true;
}
+20 -2
View File
@@ -852,7 +852,9 @@ class Kernel : public amd::HeapObject {
const std::string& name() const { return name_; }
//! Initializes the kernel parameters for the abstraction layer
bool createSignature(const parameters_t& params);
bool createSignature(
const parameters_t& params, const parameters_t& hiddenParams,
uint32_t version);
//! Returns TRUE if it's a HSA kernel
bool hsa() const { return hsa_; }
@@ -1624,6 +1626,22 @@ class Device : public RuntimeObject {
};
struct KernelParameterDescriptor {
//! Kind of object (or hidden slot) a kernel parameter describes.
//! The argument-list builders store one of these values in info_.oclObject_.
enum {
Value = 0,                   //!< NOTE(review): not produced by the HSAIL type mapping visible here; presumably a plain value - confirm
HiddenNone = 1,              //!< Hidden slot the runtime leaves untouched; also the fallback for unmapped HSAIL types
HiddenGlobalOffsetX = 2,     //!< Hidden argument: global offset, first dimension
HiddenGlobalOffsetY = 3,     //!< Hidden argument: global offset, second dimension
HiddenGlobalOffsetZ = 4,     //!< Hidden argument: global offset, third dimension
HiddenPrintfBuffer = 5,      //!< Hidden argument: address of the printf debug buffer
HiddenDefaultQueue = 6,      //!< Hidden argument: GPU VM default queue pointer
HiddenCompletionAction = 7,  //!< Hidden argument: GPU VM parent AQL wrap object
MemoryObject = 8,            //!< Mapped from HSAIL_ARGTYPE_POINTER
ReferenceObject = 9,         //!< Mapped from HSAIL_ARGTYPE_REFERENCE
ValueObject = 10,            //!< Mapped from HSAIL_ARGTYPE_VALUE
ImageObject = 11,            //!< Mapped from HSAIL_ARGTYPE_IMAGE
SamplerObject = 12,          //!< Mapped from HSAIL_ARGTYPE_SAMPLER
QueueObject = 13             //!< Mapped from HSAIL_ARGTYPE_QUEUE
};
const char* name_; //!< The parameter's name in the source
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
@@ -1642,7 +1660,7 @@ struct KernelParameterDescriptor {
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
uint32_t defined_ : 1; //!< The argument was defined by the app
uint32_t reserved_ : 1; //!< reserved
uint32_t arrayIndex_ : 28; //!< Index in the objects array
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
};
uint32_t allValues_;
InfoData() : allValues_(0) {}
@@ -752,7 +752,8 @@ bool NullKernel::create(const std::string& code, const std::string& metadata,
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
if (!createSignature(params)) {
device::Kernel::parameters_t hiddenParams;
if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
return false;
}
@@ -1337,7 +1338,8 @@ bool Kernel::initParameters() {
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
if (!createSignature(params)) {
device::Kernel::parameters_t hiddenParams;
if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
return false;
}
@@ -3017,7 +3019,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
}
}
createSignature(params);
device::Kernel::parameters_t hiddenParams;
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
@@ -943,24 +943,30 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
uint32_t uint32_value = 0;
uint64_t uint64_value = 0;
size_t argSize = desc.size_;
if (desc.type_ == T_POINTER && desc.size_ != 0) {
if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
LP64_SWITCH(uint32_value, uint64_value) = 0;
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
} else {
// convert cl_mem to amd::Memory*, return false if invalid.
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
*static_cast<Memory* const*>(value))->vmAddress());
*static_cast<Memory* const*>(value))->virtualAddress());
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
*static_cast<Memory* const*>(value);
// Note: Special case for image SRD, which is 64 bit always
if (LP64_SWITCH(true, false) &&
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
uint64_value = uint32_value;
argSize = sizeof(uint64_t);
}
}
} else if (desc.type_ == T_SAMPLER) {
assert(false && "No sampler support in blit manager! Use internal samplers!");
} else
switch (desc.size_) {
switch (argSize) {
case 1:
uint32_value = *static_cast<const uint8_t*>(value);
break;
@@ -977,7 +983,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
break;
}
switch (desc.size_) {
switch (argSize) {
case 0 /*local mem*/:
*static_cast<size_t*>(param) = size;
break;
@@ -228,6 +228,37 @@ inline static int GetHSAILArgSize(const aclArgData* argInfo) {
}
}
//! Translates the HSAIL argument type of \p arg into the matching
//! amd::KernelParameterDescriptor object kind. Any HSAIL type without an
//! explicit mapping is reported as HiddenNone.
inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
  using Descriptor = amd::KernelParameterDescriptor;

  // Start from the fallback and override it for every known HSAIL type
  uint32_t oclObject = Descriptor::HiddenNone;
  switch (arg->type_) {
    // Hidden arguments, which the runtime fills in at launch
    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
      oclObject = Descriptor::HiddenGlobalOffsetX;
      break;
    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
      oclObject = Descriptor::HiddenGlobalOffsetY;
      break;
    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
      oclObject = Descriptor::HiddenGlobalOffsetZ;
      break;
    case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER:
      oclObject = Descriptor::HiddenPrintfBuffer;
      break;
    case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
      oclObject = Descriptor::HiddenDefaultQueue;
      break;
    case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
      oclObject = Descriptor::HiddenCompletionAction;
      break;
    // Arguments that come from the kernel signature
    case HSAIL_ARGTYPE_POINTER:
      oclObject = Descriptor::MemoryObject;
      break;
    case HSAIL_ARGTYPE_IMAGE:
      oclObject = Descriptor::ImageObject;
      break;
    case HSAIL_ARGTYPE_REFERENCE:
      oclObject = Descriptor::ReferenceObject;
      break;
    case HSAIL_ARGTYPE_VALUE:
      oclObject = Descriptor::ValueObject;
      break;
    case HSAIL_ARGTYPE_SAMPLER:
      oclObject = Descriptor::SamplerObject;
      break;
    case HSAIL_ARGTYPE_QUEUE:
      oclObject = Descriptor::QueueObject;
      break;
    default:
      break;
  }
  return oclObject;
}
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
static const clk_value_type_t ClkValueMapType[6][6] = {
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
@@ -422,12 +453,22 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
size_t offsetStruct = argsBufferSize();
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
// skip the hidden arguments
if (arguments_[i]->index_ == uint(-1)) continue;
// Allocate the hidden arguments, but abstraction layer will skip them
if (arguments_[i]->index_ == uint(-1)) {
offset = amd::alignUp(offset, arguments_[i]->alignment_);
desc.offset_ = offset;
desc.size_ = arguments_[i]->size_;
offset += arguments_[i]->size_;
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
hiddenParams.push_back(desc);
continue;
}
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(arguments_[i]);
@@ -435,6 +476,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
desc.info_.arrayIndex_ = arguments_[i]->pointeeAlignment_;
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -451,9 +494,32 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
// Check if HSAIL expects data by reference and allocate it behind
if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
desc.offset_ = offsetStruct;
// Align the offset reference
offset = amd::alignUp(offset, sizeof(size_t));
patchReferences_.insert({desc.offset_, offset});
offsetStruct += size;
// Adjust the offset of arguments
offset += sizeof(size_t);
} else {
// These objects have forced data size to uint64_t
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
offset = amd::alignUp(offset, sizeof(uint64_t));
desc.offset_ = offset;
offset += sizeof(uint64_t);
} else {
offset = amd::alignUp(offset, arguments_[i]->alignment_);
desc.offset_ = offset;
offset += size;
}
}
// Update read only flag
desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
@@ -464,7 +530,7 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
}
}
createSignature(params);
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
@@ -869,247 +935,79 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
}
}
//! Generic argument writer: aligns the write pointer, copies \p size bytes
//! from \p src to that position and moves the write pointer past the copy.
//!
//! \param dst        in/out write pointer into the argument buffer
//! \param src        source data to copy
//! \param size       number of bytes to copy
//! \param alignment  alignment applied to the write pointer before the copy
template <typename T>
inline void WriteAqlArg(unsigned char** dst, const T* src, uint size, uint alignment) {
  unsigned char* const target = amd::alignUp(*dst, alignment);
  memcpy(target, src, size);
  *dst = target + size;
}
//! 32-bit specialization: stores the value with a single uint32_t write
//! instead of memcpy. The write pointer still advances by \p size.
//!
//! \param dst        in/out write pointer into the argument buffer
//! \param src        source value
//! \param size       number of bytes the write pointer advances
//! \param alignment  alignment applied to the write pointer before the store
template <>
inline void WriteAqlArg(unsigned char** dst, const uint32_t* src, uint size, uint alignment) {
  unsigned char* const target = amd::alignUp(*dst, alignment);
  *reinterpret_cast<uint32_t*>(target) = *src;
  *dst = target + size;
}
//! 64-bit specialization: stores the value with a single uint64_t write
//! instead of memcpy. The write pointer still advances by \p size.
//!
//! \param dst        in/out write pointer into the argument buffer
//! \param src        source value
//! \param size       number of bytes the write pointer advances
//! \param alignment  alignment applied to the write pointer before the store
template <>
inline void WriteAqlArg(unsigned char** dst, const uint64_t* src, uint size, uint alignment) {
  unsigned char* const target = amd::alignUp(*dst, alignment);
  *reinterpret_cast<uint64_t*>(target) = *src;
  *dst = target + size;
}
//! Header value assigned to the kernel dispatch packet: kernel-dispatch
//! packet type, barrier bit set, system-scope acquire fence and agent-scope
//! release fence.
const uint16_t kDispatchPacketHeader =
    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->SysMemCopy();
bool srdResource = false;
const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
uint64_t argList;
address aqlArgBuf = gpu.managedBuffer().reserve(
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
gpu.addVmMemory(gpu.managedBuffer().activeMemory());
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap wrap = {};
wrap.state = AQL_WRAP_BUSY;
const ConstantBuffer* cb = gpu.cb(1);
*vmParentWrap = cb->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
gpu.addVmMemory(cb->ActiveMemory());
*vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
}
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
// Find all parameters for the current kernel
for (auto arg : arguments_) {
const_address paramaddr = nullptr;
if (arg->index_ != uint(-1)) {
paramaddr = parameters + signature.at(arg->index_).offset_;
}
// Handle the hidden arguments first, as they do not have a
// matching parameter in the OCL signature (not a valid arg->index_)
switch (arg->type_) {
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
// Check if runtime has to setup hidden arguments
for (const auto& it : signature.hiddenParameters()) {
size_t offset;
switch (it.info_.oclObject_) {
case amd::KernelParameterDescriptor::HiddenNone:
//WriteAqlArgAt(aqlArgBuf, &zero, it.size_, it.offset_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
offset = sizes.offset()[0];
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
if (sizes.dimensions() >= 2) {
offset = sizes.offset()[1];
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
}
break;
}
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
size_t bufferPtr = 0;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
if (sizes.dimensions() >= 3) {
offset = sizes.offset()[2];
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
}
break;
case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
// and set the fourth argument as the printf_buffer pointer
bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
dbgBuffer()->vmAddress());
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
}
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
break;
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
break;
case HSAIL_ARGTYPE_HIDDEN_NONE: {
void* zero = 0;
assert(arg->size_ <= sizeof(zero) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_POINTER: {
// If it is a local pointer
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
break;
}
assert(
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
"Unsupported address qualifier");
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
break;
}
case HSAIL_ARGTYPE_REFERENCE: {
const ConstantBuffer* cb = gpu.cb(1);
// Copy the current structure into CB1
size_t gpuPtr = static_cast<size_t>(cb->UploadDataToHw(paramaddr, arg->size_));
// Then use a pointer in aqlArgBuffer to CB1
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t), sizeof(size_t));
gpu.addVmMemory(cb->ActiveMemory());
break;
}
case HSAIL_ARGTYPE_VALUE:
if (arg->size_ == sizeof(uint32_t)) {
WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint32_t*>(paramaddr),
sizeof(uint32_t), arg->alignment_);
} else if (arg->size_ == sizeof(uint64_t)) {
WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint64_t*>(paramaddr),
sizeof(uint64_t), arg->alignment_);
} else {
WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
}
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
if (nativeMem) {
image = reinterpret_cast<Image* const*>(memories)[index];
if (nullptr != image) {
mem = image->owner();
}
} else {
mem = memories[index];
if (mem != nullptr) {
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
}
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current image SRD into CB1
const ConstantBuffer* cb = gpu.cb(1);
uint64_t srd = cb->UploadDataToHw(image->hwState(), HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
gpu.addVmMemory(cb->ActiveMemory());
} else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
srdResource = true;
}
if (image->desc().isDoppTexture_) {
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
case amd::KernelParameterDescriptor::HiddenDefaultQueue:
if (vmDefQueue != 0) {
WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, it.size_, it.offset_);
}
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
kernelParams.samplerObjOffset())[index];
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
parameters + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!gpu.createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return nullptr;
}
vmQueue = gpu.vQueue()->vmAddress();
case amd::KernelParameterDescriptor::HiddenCompletionAction:
if (*vmParentWrap != 0) {
WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
}
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue), sizeof(vmQueue));
break;
}
default:
LogError(" Unsupported argument type ");
return nullptr;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return nullptr;
}
// Load all kernel arguments
WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize(), 0);
assert(argsBufferSize() == amd::alignUp(signature.paramsSize(), 16) &&
"A mismatch of sizes of arguments between compiler and runtime!");
#if defined(WITH_LIGHTNING_COMPILER)
// Check there is no arguments' buffer overflow. We may not use all the
// hidden argument slots.
assert(aqlArgBuf <= (gpu.cb(0)->SysMemCopy() + argsBufferSize()));
#else // !defined(WITH_LIGHTNING_COMPILER)
// HSAIL kernarg segment size is rounded up to multiple of 16.
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
assert((aqlArgBuf == (gpu.cb(0)->SysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
#endif // !defined(WITH_LIGHTNING_COMPILER)
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy() + argsBufferSize());
//hsa_kernel_dispatch_packet_t disp;
hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
gpu.cb(0)->SysMemCopy());
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
@@ -1117,6 +1015,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// Check if runtime has to find local workgroup size
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
constexpr uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
@@ -1134,28 +1038,16 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
hsaDisp->group_segment_size = ldsAddress - ldsSize();
hsaDisp->kernel_object = gpuAqlCode();
const ConstantBuffer* cb = gpu.cb(0);
uint64_t argList = cb->UploadDataToHw(
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
hsaDisp->reserved2 = 0;
hsaDisp->completion_signal.handle = 0;
memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
gpu.addVmMemory(cb->ActiveMemory());
gpu.addVmMemory(&prog().codeSegGpu());
for (pal::Memory* mem : prog().globalStores()) {
gpu.addVmMemory(mem);
}
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
gpu.addVmMemory(gpu.hsaQueueMem());
}
if (srdResource || prog().isStaticSampler()) {
dev().srds().fillResourceList(gpu);
}
return hsaDisp;
}
@@ -1398,6 +1290,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcA
void LightningKernel::initArgList(const KernelMD& kernelMD) {
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
size_t offsetStruct = argsBufferSize();
size_t offset = 0;
@@ -1426,20 +1320,27 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
arg->index_ = isHidden ? uint(-1) : params.size();
arguments_.push_back(arg);
if (isHidden) {
continue;
}
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
if (isHidden) {
offset = amd::alignUp(offset, arguments_[i]->alignment_);
desc.offset_ = offset;
desc.size_ = arguments_[i]->size_;
offset += arguments_[i]->size_;
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
hiddenParams.push_back(desc);
continue;
}
desc.name_ = lcArg.mName.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
desc.info_.oclObject_ = GetOclArgumentType(arg);
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -1456,14 +1357,37 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
// Check if HSAIL expects data by reference and allocate it behind
if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
desc.offset_ = offsetStruct;
// Align the offset reference
offset = amd::alignUp(offset, sizeof(size_t));
patchReferences_.insert({ desc.offset_, offset });
offsetStruct += size;
// Adjust the offset of arguments
offset += sizeof(size_t);
}
else {
// These objects have forced data size to uint64_t
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
offset = amd::alignUp(offset, sizeof(uint64_t));
desc.offset_ = offset;
offset += sizeof(uint64_t);
} else {
offset = amd::alignUp(offset, arguments_[i]->alignment_);
desc.offset_ = offset;
offset += size;
}
}
// Update read only flag
desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
createSignature(params);
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
}
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
@@ -182,7 +182,7 @@ class HSAILKernel : public device::Kernel {
const amd::Kernel& kernel, //!< AMD kernel object
const amd::NDRangeContainer& sizes, //!< NDrange container
const_address parameters, //!< Application arguments for the kernel
bool nativeMem, //!< Native memory objects are passed
size_t ldsAddress, //!< LDS address that includes all arguments.
uint64_t vmDefQueue, //!< GPU VM default queue pointer
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
) const;
@@ -204,6 +204,8 @@ class HSAILKernel : public device::Kernel {
return waveLimiter_.getWavesPerSH(vdev);
};
//! Returns the patch table for arguments passed by reference. initArgList()
//! inserts one {struct data offset, argument slot offset} pair per
//! HSAIL_ARGTYPE_REFERENCE argument.
const std::unordered_map<size_t, size_t>& patch() const {
  return patchReferences_;
}
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -234,6 +236,7 @@ class HSAILKernel : public device::Kernel {
const HSAILProgram& prog_; //!< Reference to the parent program
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
uint index_; //!< Kernel index in the program
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
@@ -219,6 +219,8 @@ class Image : public pal::Memory {
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
//! Reports the value of hwSrd() as the virtual address of this image
uint64_t virtualAddress() const override { return hwSrd(); }
private:
//! Disable copy constructor
Image(const Image&);
@@ -461,9 +461,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (flushL1Cache) {
// Flush cache
if (!gpu.profiling()) {
gpu.addBarrier();
gpu.addBarrier();
}
// Clear memory dependency state
const static bool All = true;
clear(!All);
@@ -2112,13 +2111,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
}
size_t ldsSize;
// Check memory dependency and SVM objects
if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
LogError("Wrong memory objects!");
return false;
}
bool needFlush = false;
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
// are in the same cmdBuffer
@@ -2194,7 +2192,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
*this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
*this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2948,7 +2946,7 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
}
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
bool nativeMem) {
bool nativeMem, size_t& ldsAddress) {
const amd::KernelParameters& kernelParams = kernel.parameters();
// Mark the tracker with a new kernel,
@@ -3015,68 +3013,155 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
}
bool srdResource = false;
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
const HSAILKernel& hsaKernel =
static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
const amd::KernelSignature& signature = kernel.signature();
ldsAddress = hsaKernel.ldsSize();
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
Memory* gpuMem = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = desc.info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
if (nullptr != gpuMem) {
mem = gpuMem->owner();
}
} else {
mem = memories[index];
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
// Synchronize data with other memory instances if necessary
gpuMem->syncCacheFromHost(*this);
}
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
addBarrier();
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
} else if (gpuMem != nullptr) {
// Check image
bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
// Check buffer
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, readOnly);
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
constexpr bool WaitOnBusyEngine = true;
gpuMem->wait(*this, WaitOnBusyEngine);
//! Check if compiler expects read/write
if ((mem != nullptr) && !desc.info_.readOnly_) {
mem->signalWrite(&dev());
}
addVmMemory(gpuMem);
if (!nativeMem) {
// Process cache coherency first, since the extra transfers may affect
// other mem dependency tracking logic: TS and signalWrite()
for (uint i = 0; i < signature.numMemories(); ++i) {
amd::Memory* mem = memories[i];
if (mem != nullptr) {
// Synchronize data with other memory instances if necessary
dev().getGpuMemory(mem)->syncCacheFromHost(*this);
}
}
}
for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const amd::KernelParameterDescriptor::InfoData& info = desc.info_;
// Find if current argument is a buffer
if (desc.type_ == T_POINTER) {
// If it is a local pointer
if (desc.size_ == 0) {
ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
// Save the original LDS size
size_t ldsSize = *reinterpret_cast<const size_t*>(params + desc.offset_);
// Patch the LDS address in the original arguments with an LDS address(offset)
WriteAqlArgAt(const_cast<address>(params), &ldsAddress, sizeof(void*), desc.offset_);
// Add the original size
ldsAddress += ldsSize;
} else {
Memory* gpuMem = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = info.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
if (nullptr != gpuMem) {
mem = gpuMem->owner();
}
} else {
mem = memories[index];
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
}
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
addBarrier();
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
} else if (gpuMem != nullptr) {
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, info.readOnly_);
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
constexpr bool WaitOnBusyEngine = true;
gpuMem->wait(*this, WaitOnBusyEngine);
addVmMemory(gpuMem);
//! Check if compiler expects read/write.
//! Note: SVM with subbuffers has an issue with tracking.
//! Conformance can send read only subbuffer, but update the region
//! in the kernel.
if ((mem != nullptr) &&
((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
mem->signalWrite(&dev());
}
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (gpuMem->memoryType() == Resource::ImageView) {
// Copy the current image SRD into CB1
uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
addVmMemory(cb(1)->ActiveMemory());
} else {
srdResource = true;
}
if (gpuMem->desc().isDoppTexture_) {
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
}
}
}
}
}
else if (desc.type_ == T_VOID) {
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
// Copy the current structure into CB1
size_t gpuPtr = static_cast<size_t>(cb(1)->UploadDataToHw(params, desc.size_));
// Then use a pointer in aqlArgBuffer to CB1
const auto it = hsaKernel.patch().find(desc.offset_);
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
addVmMemory(cb(1)->ActiveMemory());
}
}
else if (desc.type_ == T_SAMPLER) {
srdResource = true;
} else if (desc.type_ == T_QUEUE) {
uint32_t index = desc.info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
params + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return false;
}
vmQueue = vQueue()->vmAddress();
}
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), &vmQueue, sizeof(vmQueue), desc.offset_);
break;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return false;
}
if (srdResource || hsaKernel.prog().isStaticSampler()) {
dev().srds().fillResourceList(*this);
}
addVmMemory(&hsaKernel.prog().codeSegGpu());
for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
const static bool IsReadOnly = false;
// Validate global store for a dependency in the queue
memoryDependency().validate(*this, mem, IsReadOnly);
addVmMemory(mem);
}
return true;
@@ -378,6 +378,9 @@ class VirtualGPU : public device::VirtualDevice {
//! Gives mutable access to the write xfer buffer (writeBuffer_) used for staging operations
XferBuffer& xferWrite() {
  return writeBuffer_;
}
//! Gives mutable access to the managed buffer member (managedBuffer_)
ManagedBuffer& managedBuffer() {
  return managedBuffer_;
}
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
@@ -529,7 +532,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem //!< Native memory objects
bool nativeMem, //!< Native memory objects
size_t& ldsAddess //!< Returns LDS size, used in the kernel
);
//! Common function for fill memory used by both svm Fill and non-svm fill
@@ -644,4 +648,33 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
return id;
}
//! Writes an argument value into an argument buffer at a given byte offset.
//!
//! Generic version: copies exactly \p size bytes from \p src to \p dst + \p offset.
//! No bounds checking is performed; the caller has to guarantee that the
//! destination range [offset, offset + size) lies inside the buffer.
template <typename T>
inline void WriteAqlArgAt(
    unsigned char* dst,  //!< Base pointer of the destination buffer
    const T* src,        //!< Pointer to the source value
    uint size,           //!< The size in bytes to copy
    size_t offset        //!< Byte offset from \p dst at which the value is written
) {
  memcpy(dst + offset, src, size);
}

//! 32-bit specialization: always stores exactly one uint32_t.
//!
//! \note \p size is ignored; sizeof(uint32_t) bytes are written.
//! A fixed-size memcpy is used instead of a store through a casted pointer,
//! so the write stays well defined when dst + offset is not 4-byte aligned.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< Base pointer of the destination buffer
    const uint32_t* src,  //!< Pointer to the 32-bit source value
    uint size,            //!< Unused, kept for a uniform call signature
    size_t offset         //!< Byte offset from \p dst at which the value is written
) {
  memcpy(dst + offset, src, sizeof(uint32_t));
}

//! 64-bit specialization: always stores exactly one uint64_t.
//!
//! \note \p size is ignored; sizeof(uint64_t) bytes are written.
//! A fixed-size memcpy is used instead of a store through a casted pointer,
//! so the write stays well defined when dst + offset is not 8-byte aligned.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< Base pointer of the destination buffer
    const uint64_t* src,  //!< Pointer to the 64-bit source value
    uint size,            //!< Unused, kept for a uniform call signature
    size_t offset         //!< Byte offset from \p dst at which the value is written
) {
  memcpy(dst + offset, src, sizeof(uint64_t));
}
/*@}*/} // namespace pal
@@ -581,7 +581,8 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
params.push_back(desc);
}
createSignature(params);
device::Kernel::parameters_t hiddenParams;
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
#endif // defined(WITH_COMPILER_LIB)
@@ -660,8 +661,8 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
params.push_back(desc);
}
createSignature(params);
device::Kernel::parameters_t hiddenParams;
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
#endif // defined(WITH_LIGHTNING_COMPILER)
@@ -243,13 +243,17 @@ void KernelParameters::release(address mem, const amd::Device& device) const {
}
KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& params,
const std::string& attrib)
const std::string& attrib,
const std::vector<KernelParameterDescriptor>& hiddenParams,
uint32_t version)
: params_(params)
, hiddenParams_(hiddenParams)
, attributes_(attrib)
, paramsSize_(0)
, numMemories_(0)
, numSamplers_(0)
, numQueues_(0) {
, numQueues_(0)
, version_(version) {
size_t maxOffset = 0;
size_t last = 0;
// Find the last entry
@@ -283,7 +287,15 @@ KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& p
if (lastSize == 0 /* local mem */) {
lastSize = sizeof(cl_mem);
}
paramsSize_ = params[last].offset_ + alignUp(lastSize, sizeof(intptr_t));
// Note: It's a special case. HW ABI expects 64 bit for SRD, regardless of the binary.
// Force the size to 64 bit for those cases.
if ((params[last].info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
(params[last].info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
(params[last].info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
lastSize = alignUp(lastSize, sizeof(uint64_t));
}
paramsSize_ = params[last].offset_ + lastSize;
paramsSize_ = alignUp(paramsSize_, sizeof(intptr_t));
}
}
} // namespace amd
@@ -36,18 +36,30 @@ class Program;
class KernelSignature : public HeapObject {
private:
std::vector<KernelParameterDescriptor> params_;
std::vector<KernelParameterDescriptor> hiddenParams_;
std::string attributes_; //!< The kernel attributes
uint32_t paramsSize_;
uint32_t numMemories_;
uint32_t numSamplers_;
uint32_t numQueues_;
uint32_t version_;
public:
//! Kernel signature ABI versions, as reported by version()
enum {
ABIVersion_0 = 0, //!< ABI constructed based on the OCL semantics
ABIVersion_1 = 1 //!< ABI constructed based on the HW ABI returned from the compiler
};
//! Default constructor
KernelSignature() : paramsSize_(0), numMemories_(0), numSamplers_(0), numQueues_(0) {}
KernelSignature():
paramsSize_(0), numMemories_(0), numSamplers_(0),
numQueues_(0), version_(ABIVersion_0) {}
//! Construct a new signature.
KernelSignature(const std::vector<KernelParameterDescriptor>& params, const std::string& attrib);
KernelSignature(const std::vector<KernelParameterDescriptor>& params,
const std::string& attrib,
const std::vector<KernelParameterDescriptor>& hiddenParams,
uint32_t version);
//! Returns how many parameter descriptors are stored in params_
size_t numParameters() const {
  return params_.size();
}
@@ -72,8 +84,17 @@ class KernelSignature : public HeapObject {
//! Returns the queue object count recorded for this signature (numQueues_)
uint32_t numQueues() const {
  return numQueues_;
}
//! Returns the ABI version of this signature (one of the ABIVersion_* values)
uint32_t version() const {
  return version_;
}
//! Returns a read-only reference to the kernel attributes string
const std::string& attributes() const {
  return attributes_;
}
//! Returns a read-only reference to the hidden parameter descriptors
const std::vector<KernelParameterDescriptor>& hiddenParameters() const {
  return hiddenParams_;
}
//! Returns a read-only reference to the parameter descriptors
const std::vector<KernelParameterDescriptor>& parameters() const {
  return params_;
}
};
// @todo: look into a copy-on-write model instead of copy-on-read.
@@ -604,8 +604,8 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse
}
bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func) {
// FIXME_lmoriche: check that the signatures are compatible
if (deviceKernels_.size() == 0) {
if (deviceKernels_.size() == 0 ||
(func->signature().version() > KernelSignature::ABIVersion_0)) {
signature_ = func->signature();
}
deviceKernels_[&device] = func;