db04b3b295
SWDEV-79445 - OCL generic changes and code clean-up - Prepare the changes to remove resource rename feature. Resource rename can substitute a low level memory object in the current resource with a different one in order to avoid GPU stalls on frequent CPU updates, like constant buffers or staging buffers. Renaming was necessary due to CAL and old HW limitations to manage suballocations and has some inefficiency with tiny updates and staging buffers. - This change removes renames usage from the constant buffer management Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#45 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#78 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#43 edit
1583 wiersze
52 KiB
C++
1583 wiersze
52 KiB
C++
//
|
|
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
#include "device/pal/palkernel.hpp"
|
|
#include "device/pal/palprogram.hpp"
|
|
#include "device/pal/palblit.hpp"
|
|
#include "device/pal/palconstbuf.hpp"
|
|
#include "device/pal/palsched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
#include <algorithm>
|
|
|
|
namespace pal {
|
|
|
|
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
|
|
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
|
|
if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
|
|
} else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
|
|
} else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
|
|
} else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
|
|
} else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
|
|
} else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
|
|
}
|
|
return HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
}
|
|
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_QUEUE:
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
case ARG_TYPE_VALUE:
|
|
return (argInfo->arg.value.data == DATATYPE_struct) ? HSAIL_ARGTYPE_REFERENCE
|
|
: HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1;
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2;
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8;
|
|
case DATATYPE_struct:
|
|
return 128;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER:
|
|
return sizeof(cl_sampler);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgPointeeAlignment(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
return argInfo->arg.pointer.align;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
|
|
aclAccessType accessType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
accessType = argInfo->arg.pointer.type;
|
|
} else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
accessType = argInfo->arg.image.type;
|
|
} else {
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
if (accessType == ACCESS_TYPE_RO) {
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
} else if (accessType == ACCESS_TYPE_WO) {
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
}
|
|
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
return HSAIL_ADDRESS_CONSTANT;
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
} else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * argInfo->arg.value.numElements;
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * argInfo->arg.value.numElements;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
|
|
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
|
|
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
|
|
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
|
|
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
|
|
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
|
|
};
|
|
|
|
uint sizeType;
|
|
uint numElements;
|
|
if (arg->type_ == HSAIL_ARGTYPE_QUEUE) {
|
|
return T_QUEUE;
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
return T_POINTER;
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_VALUE || arg->type_ == HSAIL_ARGTYPE_REFERENCE) {
|
|
switch (arg->dataType_) {
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
sizeType = 0;
|
|
numElements = arg->size_;
|
|
break;
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_U16:
|
|
sizeType = 1;
|
|
numElements = arg->size_ / 2;
|
|
break;
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_U32:
|
|
sizeType = 2;
|
|
numElements = arg->size_ / 4;
|
|
break;
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_U64:
|
|
sizeType = 3;
|
|
numElements = arg->size_ / 8;
|
|
break;
|
|
case HSAIL_DATATYPE_F16:
|
|
sizeType = 4;
|
|
numElements = arg->size_ / 2;
|
|
break;
|
|
case HSAIL_DATATYPE_F32:
|
|
sizeType = 4;
|
|
numElements = arg->size_ / 4;
|
|
break;
|
|
case HSAIL_DATATYPE_F64:
|
|
sizeType = 5;
|
|
numElements = arg->size_ / 8;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
|
|
switch (numElements) {
|
|
case 1:
|
|
return ClkValueMapType[sizeType][0];
|
|
case 2:
|
|
return ClkValueMapType[sizeType][1];
|
|
case 3:
|
|
return ClkValueMapType[sizeType][2];
|
|
case 4:
|
|
return ClkValueMapType[sizeType][3];
|
|
case 8:
|
|
return ClkValueMapType[sizeType][4];
|
|
case 16:
|
|
return ClkValueMapType[sizeType][5];
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
} else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const HSAILKernel::Argument* arg) {
|
|
if (arg->type_ == HSAIL_ARGTYPE_POINTER) {
|
|
switch (arg->addrQual_) {
|
|
case HSAIL_ADDRESS_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case HSAIL_ADDRESS_CONSTANT:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case HSAIL_ADDRESS_LOCAL:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
// default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const HSAILKernel::Argument* arg) {
|
|
if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
switch (arg->access_) {
|
|
case HSAIL_ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case HSAIL_ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case HSAIL_ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
|
if (!sym) {
|
|
return false;
|
|
}
|
|
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
|
|
return false;
|
|
}
|
|
|
|
amd_kernel_code_t* akc =
|
|
reinterpret_cast<amd_kernel_code_t*>(prog().findHostKernelAddress(code_));
|
|
cpuAqlCode_ = akc;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
|
reinterpret_cast<void*>(&codeSize_))) {
|
|
return false;
|
|
}
|
|
size_t akc_align = 0;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
|
|
reinterpret_cast<void*>(&akc_align))) {
|
|
return false;
|
|
}
|
|
|
|
assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned");
|
|
workGroupInfo_.scratchRegs_ =
|
|
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
|
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
|
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
|
|
akc->workgroup_group_segment_byte_size;
|
|
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
|
workGroupInfo_.usedStackSize_ = 0;
|
|
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
|
|
|
if (!prog().isNull()) {
|
|
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
|
|
workGroupInfo_.availableSGPRs_ =
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
|
|
workGroupInfo_.availableVGPRs_ =
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
|
|
dev().info().wavefrontWidth_;
|
|
} else {
|
|
workGroupInfo_.availableLDSSize_ = 64 * Ki;
|
|
workGroupInfo_.availableSGPRs_ = 104;
|
|
workGroupInfo_.availableVGPRs_ = 256;
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
// skip the hidden arguments
|
|
if (arguments_[i]->index_ == uint(-1)) continue;
|
|
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(arguments_[i]);
|
|
desc.addressQualifier_ = GetOclAddrQual(arguments_[i]);
|
|
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
} else {
|
|
desc.size_ = arguments_[i]->size_;
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
|
// Iterate through the each kernel argument
|
|
for (uint index = 0; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
arg->access_ = GetHSAILArgAccessType(aclArg);
|
|
arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg);
|
|
|
|
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
|
|
arg->index_ = isHidden ? uint(-1) : index++;
|
|
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) {
|
|
PrintfInfo info;
|
|
uint index = 0;
|
|
for (; aclPrintf->struct_size != 0; aclPrintf++) {
|
|
index = aclPrintf->ID;
|
|
if (printf_.size() <= index) {
|
|
printf_.resize(index + 1);
|
|
}
|
|
std::string pfmt = aclPrintf->fmtStr;
|
|
info.fmtString_.clear();
|
|
bool need_nl = true;
|
|
for (size_t pos = 0; pos < pfmt.size(); ++pos) {
|
|
char symbol = pfmt[pos];
|
|
need_nl = true;
|
|
if (symbol == '\\') {
|
|
// Rest of the C escape sequences (e.g. \') are handled correctly
|
|
// by the MDParser, we are not sure exactly how!
|
|
switch (pfmt[pos + 1]) {
|
|
case 'a':
|
|
pos++;
|
|
symbol = '\a';
|
|
break;
|
|
case 'b':
|
|
pos++;
|
|
symbol = '\b';
|
|
break;
|
|
case 'f':
|
|
pos++;
|
|
symbol = '\f';
|
|
break;
|
|
case 'n':
|
|
pos++;
|
|
symbol = '\n';
|
|
need_nl = false;
|
|
break;
|
|
case 'r':
|
|
pos++;
|
|
symbol = '\r';
|
|
break;
|
|
case 'v':
|
|
pos++;
|
|
symbol = '\v';
|
|
break;
|
|
case '7':
|
|
if (pfmt[pos + 2] == '2') {
|
|
pos += 2;
|
|
symbol = '\72';
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
info.fmtString_.push_back(symbol);
|
|
}
|
|
if (need_nl) {
|
|
info.fmtString_ += "\n";
|
|
}
|
|
uint32_t* tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
|
|
for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) {
|
|
info.arguments_.push_back(*tmp_ptr);
|
|
}
|
|
printf_[index] = info;
|
|
info.arguments_.clear();
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
|
|
: device::Kernel(name),
|
|
compileOptions_(compileOptions),
|
|
dev_(prog->dev()),
|
|
prog_(*prog),
|
|
index_(0),
|
|
code_(0),
|
|
codeSize_(0),
|
|
waveLimiter_(
|
|
this,
|
|
(prog->isNull() ? 1
|
|
: dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
|
|
dev().hwInfo()->simdPerCU_) {
|
|
hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel() {
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
}
|
|
|
|
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
assert(!"Should not reach here");
|
|
#else // !defined(WITH_LIGHTNING_COMPILER)
|
|
acl_error error = ACL_SUCCESS;
|
|
std::string openClKernelName = openclMangledName(name());
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
// compile kernel down to ISA
|
|
if (finalize) {
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (dev().settings().svmFineGrainSystem_) {
|
|
options.append(" -sc-xnack-iommu");
|
|
}
|
|
error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
|
|
ACL_TYPE_ISA, nullptr);
|
|
buildLog_ += aclGetCompilerLog(dev().compiler());
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize kernel");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
aqlCreateHWInfo(sym);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), nullptr, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (nullptr == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete[] aclArgList;
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
} else {
|
|
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (nullptr == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete[] aclPrintfList;
|
|
}
|
|
|
|
aclMetadata md;
|
|
md.enqueue_kernel = false;
|
|
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
|
|
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = md.enqueue_kernel;
|
|
|
|
md.kernel_index = -1;
|
|
size_t sizeOfIndex = sizeof(md.kernel_index);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
|
|
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = md.kernel_index;
|
|
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
|
|
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
|
|
&sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable();
|
|
|
|
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
|
|
&sizeOfWorkGroupSizeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfVecTypeHint;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
if (0 != sizeOfVecTypeHint) {
|
|
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
|
|
if (NULL == VecTypeHint) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
VecTypeHint[sizeOfVecTypeHint] = '\0';
|
|
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
|
|
delete[] VecTypeHint;
|
|
}
|
|
|
|
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
|
return true;
|
|
}
|
|
|
|
bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const {
|
|
// HSAIL path doesn't require memory reallocations
|
|
return true;
|
|
}
|
|
|
|
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
|
|
|
|
const HSAILProgram& HSAILKernel::prog() const {
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const {
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
size_t thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if (flags_.imageEna_ &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) && ((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
} else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
} else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--)
|
|
;
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Assuming DWORD access
|
|
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
|
// or size is too small for the cache line
|
|
(lclWorkSize[0] < cacheLineMatch))) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Use X dimension as high priority. Runtime will assume that
|
|
// X dimension is more important for the address calculation
|
|
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
|
lclWorkSize[0] = cacheLineMatch;
|
|
thrPerGrp /= cacheLineMatch;
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 1; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
} else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
} else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
} else {
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static void WriteAqlArg(
|
|
unsigned char** dst, //!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
} else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
|
(1 << HSA_PACKET_HEADER_BARRIER) |
|
|
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
|
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
|
|
|
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
|
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ManagedBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
gpu.addVmMemory(cb->activeMemory());
|
|
}
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
// Find all parameters for the current kernel
|
|
for (auto arg : arguments_) {
|
|
const_address paramaddr = nullptr;
|
|
if (arg->index_ != uint(-1)) {
|
|
paramaddr = parameters + signature.at(arg->index_).offset_;
|
|
}
|
|
|
|
// Handle the hidden arguments first, as they do not have a
|
|
// matching parameter in the OCL signature (not a valid arg->index_)
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
|
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
|
|
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
|
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
|
|
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
|
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
|
|
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
|
size_t bufferPtr = 0;
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
|
|
// and set the fourth argument as the printf_buffer pointer
|
|
bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
|
assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
|
assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_HIDDEN_NONE: {
|
|
void* zero = 0;
|
|
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_POINTER: {
|
|
// If it is a local pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
|
|
ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
|
|
break;
|
|
}
|
|
assert(
|
|
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
|
|
"Unsupported address qualifier");
|
|
|
|
// If it is a global pointer
|
|
Memory* gpuMem = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
|
|
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
|
|
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
// If finegrainsystem is present then the pointer can be malloced by the app and
|
|
// passed to kernel directly. If so copy the pointer location to aqlArgBuf
|
|
else if (!dev().isFineGrainedSystem(true)) {
|
|
return nullptr;
|
|
}
|
|
break;
|
|
}
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
|
|
if (nullptr != gpuMem) {
|
|
mem = gpuMem->owner();
|
|
}
|
|
} else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
if (gpuMem == nullptr) {
|
|
WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
|
|
//! 64 bit isn't supported with 32 bit binary
|
|
uint64_t globalAddress = gpuMem->vmAddress();
|
|
WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_);
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
|
|
// save the memory object pointer to allow global memory access
|
|
if (nullptr != dev().hwDebugMgr()) {
|
|
dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner());
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_REFERENCE: {
|
|
// Copy the current structure into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ManagedBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
size_t gpuPtr = static_cast<size_t>(cb->vmAddress() + cb->wrtOffset());
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t));
|
|
gpu.addVmMemory(cb->activeMemory());
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
if (nativeMem) {
|
|
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
|
|
} else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem == nullptr) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return nullptr;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! \note Special case for the image views.
|
|
//! Copy SRD to CB1, so blit manager will be able to release
|
|
//! this view without a wait for SRD resource.
|
|
if (image->memoryType() == Resource::ImageView) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
|
|
ManagedBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(HsaImageObjectSize);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
gpu.addVmMemory(cb->activeMemory());
|
|
} else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
gpu.addVmMemory(image);
|
|
if (image->desc().isDoppTexture_) {
|
|
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
|
|
kernel.parameters().getExecPfpaVcop());
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
const amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(paramaddr);
|
|
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
const amd::DeviceQueue* queue = *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue;
|
|
if (dev().settings().useDeviceQueue_) {
|
|
vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
} else {
|
|
if (!gpu.createVirtualQueue(queue->size())) {
|
|
LogError("Virtual queue creation failed!");
|
|
return nullptr;
|
|
}
|
|
vmQueue = gpu.vQueue()->vmAddress();
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported argument type ");
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return nullptr;
|
|
}
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
// Check there is no arguments' buffer overflow. We may not use all the
|
|
// hidden argument slots.
|
|
assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize()));
|
|
#else // !defined(WITH_LIGHTNING_COMPILER)
|
|
// HSAIL kernarg segment size is rounded up to multiple of 16.
|
|
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
|
hsa_kernel_dispatch_packet_t* hsaDisp =
|
|
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->sysMemCopy() + argsBufferSize());
|
|
|
|
amd::NDRange local(sizes.local());
|
|
const amd::NDRange& global = sizes.global();
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
|
|
|
hsaDisp->header = kDispatchPacketHeader;
|
|
hsaDisp->setup = sizes.dimensions();
|
|
|
|
hsaDisp->workgroup_size_x = local[0];
|
|
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
|
|
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
|
|
|
|
hsaDisp->grid_size_x = global[0];
|
|
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
|
|
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
|
|
hsaDisp->reserved2 = 0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
hsaDisp->private_segment_size = spillSegSize();
|
|
hsaDisp->group_segment_size = ldsAddress - ldsSize();
|
|
hsaDisp->kernel_object = gpuAqlCode();
|
|
|
|
ManagedBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
|
|
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
|
|
hsaDisp->reserved2 = 0;
|
|
hsaDisp->completion_signal.handle = 0;
|
|
|
|
gpu.addVmMemory(cb->activeMemory());
|
|
gpu.addVmMemory(&prog().codeSegGpu());
|
|
for (pal::Memory* mem : prog().globalStores()) {
|
|
gpu.addVmMemory(mem);
|
|
}
|
|
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
|
|
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
|
gpu.addVmMemory(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource || prog().isStaticSampler()) {
|
|
dev().srds().fillResourceList(gpu);
|
|
}
|
|
|
|
return hsaDisp;
|
|
}
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
using llvm::AMDGPU::HSAMD::AccessQualifier;
|
|
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
|
using llvm::AMDGPU::HSAMD::ValueKind;
|
|
using llvm::AMDGPU::HSAMD::ValueType;
|
|
|
|
const LightningProgram& LightningKernel::prog() const {
|
|
return reinterpret_cast<const LightningProgram&>(prog_);
|
|
}
|
|
|
|
void LightningKernel::initPrintf(const std::vector<std::string>& printfInfoStrings) {
|
|
for (auto str : printfInfoStrings) {
|
|
std::vector<std::string> tokens;
|
|
|
|
size_t end, pos = 0;
|
|
do {
|
|
end = str.find_first_of(':', pos);
|
|
tokens.push_back(str.substr(pos, end - pos));
|
|
pos = end + 1;
|
|
} while (end != std::string::npos);
|
|
|
|
if (tokens.size() < 2) {
|
|
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
|
|
continue;
|
|
}
|
|
|
|
pos = 0;
|
|
size_t printfInfoID = std::stoi(tokens[pos++]);
|
|
if (printf_.size() <= printfInfoID) {
|
|
printf_.resize(printfInfoID + 1);
|
|
}
|
|
PrintfInfo& info = printf_[printfInfoID];
|
|
|
|
size_t numSizes = std::stoi(tokens[pos++]);
|
|
end = pos + numSizes;
|
|
|
|
// ensure that we have the correct number of tokens
|
|
if (tokens.size() < end + 1 /*last token is the fmtString*/) {
|
|
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
|
|
continue;
|
|
}
|
|
|
|
// push the argument sizes
|
|
while (pos < end) {
|
|
info.arguments_.push_back(std::stoi(tokens[pos++]));
|
|
}
|
|
|
|
// FIXME: We should not need this! [
|
|
std::string& fmt = tokens[pos];
|
|
bool need_nl = true;
|
|
|
|
for (pos = 0; pos < fmt.size(); ++pos) {
|
|
char symbol = fmt[pos];
|
|
need_nl = true;
|
|
if (symbol == '\\') {
|
|
switch (fmt[pos + 1]) {
|
|
case 'a':
|
|
pos++;
|
|
symbol = '\a';
|
|
break;
|
|
case 'b':
|
|
pos++;
|
|
symbol = '\b';
|
|
break;
|
|
case 'f':
|
|
pos++;
|
|
symbol = '\f';
|
|
break;
|
|
case 'n':
|
|
pos++;
|
|
symbol = '\n';
|
|
need_nl = false;
|
|
break;
|
|
case 'r':
|
|
pos++;
|
|
symbol = '\r';
|
|
break;
|
|
case 'v':
|
|
pos++;
|
|
symbol = '\v';
|
|
break;
|
|
case '7':
|
|
if (fmt[pos + 2] == '2') {
|
|
pos += 2;
|
|
symbol = '\72';
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
info.fmtString_.push_back(symbol);
|
|
}
|
|
if (need_nl) {
|
|
info.fmtString_ += "\n";
|
|
}
|
|
// ]
|
|
}
|
|
}
|
|
|
|
static inline HSAIL_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) {
|
|
switch (lcArg.mValueKind) {
|
|
case ValueKind::GlobalBuffer:
|
|
case ValueKind::DynamicSharedPointer:
|
|
case ValueKind::Pipe:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ValueKind::ByValue:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ValueKind::Image:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ValueKind::Sampler:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ValueKind::HiddenGlobalOffsetX:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
|
|
case ValueKind::HiddenGlobalOffsetY:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
|
|
case ValueKind::HiddenGlobalOffsetZ:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
|
|
case ValueKind::HiddenPrintfBuffer:
|
|
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
|
|
case ValueKind::HiddenDefaultQueue:
|
|
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
|
|
case ValueKind::HiddenCompletionAction:
|
|
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
|
|
case ValueKind::HiddenNone:
|
|
return HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; }
|
|
|
|
static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
uint32_t align = lcArg.mPointeeAlign;
|
|
if (align == 0) {
|
|
LogWarning("Missing DynamicSharedPointer alignment");
|
|
align = 128; /* worst case alignment */
|
|
;
|
|
}
|
|
return align;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static inline HSAIL_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) {
|
|
switch (lcArg.mAccQual) {
|
|
case AccessQualifier::ReadOnly:
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
case AccessQualifier::WriteOnly:
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
case AccessQualifier::ReadWrite:
|
|
default:
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
}
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
|
|
static inline HSAIL_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
} else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
|
|
if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
|
|
return HSAIL_ADDRESS_CONSTANT;
|
|
}
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
} else if (lcArg.mValueKind == ValueKind::Image ||
|
|
lcArg.mValueKind == ValueKind::Sampler ||
|
|
lcArg.mValueKind == ValueKind::Pipe) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
static inline HSAIL_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind != ValueKind::ByValue) {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
|
|
switch (lcArg.mValueType) {
|
|
case ValueType::I8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case ValueType::I16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case ValueType::I32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case ValueType::I64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case ValueType::U8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case ValueType::U16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case ValueType::U32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case ValueType::U64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case ValueType::F16:
|
|
return HSAIL_DATATYPE_F16;
|
|
case ValueType::F32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case ValueType::F64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case ValueType::Struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
|
|
lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
if (lcArg.mIsVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (lcArg.mIsRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (lcArg.mIsConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
}
|
|
else if (lcArg.mIsPipe) {
|
|
assert(lcArg.mValueKind == ValueKind::Pipe);
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
|
device::Kernel::parameters_t params;
|
|
|
|
size_t offset = 0;
|
|
|
|
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
|
|
const KernelArgMD& lcArg = kernelMD.mArgs[i];
|
|
|
|
// Initialize HSAIL kernel argument
|
|
auto arg = new HSAILKernel::Argument;
|
|
arg->name_ = lcArg.mName;
|
|
arg->typeName_ = lcArg.mTypeName;
|
|
arg->size_ = lcArg.mSize;
|
|
arg->type_ = GetKernelArgType(lcArg);
|
|
arg->addrQual_ = GetKernelAddrQual(lcArg);
|
|
arg->dataType_ = GetKernelDataType(lcArg);
|
|
arg->alignment_ = GetKernelArgAlignment(lcArg);
|
|
arg->access_ = GetKernelArgAccessType(lcArg);
|
|
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
|
|
|
|
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
|
|
arg->index_ = isHidden ? uint(-1) : params.size();
|
|
arguments_.push_back(arg);
|
|
|
|
if (isHidden) {
|
|
continue;
|
|
}
|
|
|
|
// Initialize Device kernel parameters
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
desc.name_ = lcArg.mName.c_str();
|
|
desc.type_ = GetOclType(arg);
|
|
desc.addressQualifier_ = GetOclAddrQual(arg);
|
|
desc.accessQualifier_ = GetOclAccessQual(arg);
|
|
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
|
desc.typeName_ = lcArg.mTypeName.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
} else {
|
|
desc.size_ = arg->size_;
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the parameters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
|
|
params.push_back(desc);
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
|
|
for (const KernelMD& kernelMD : programMD->mKernels) {
|
|
if (kernelMD.mName == name) {
|
|
return &kernelMD;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
|
|
aqlCreateHWInfo(symbol);
|
|
|
|
const CodeObjectMD* programMD = prog().metadata();
|
|
assert(programMD != nullptr);
|
|
|
|
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
|
|
|
|
if (kernelMD == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Set the argList
|
|
initArgList(*kernelMD);
|
|
|
|
if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
|
|
const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
|
|
workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0];
|
|
workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1];
|
|
workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2];
|
|
}
|
|
|
|
if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) {
|
|
const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint;
|
|
workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0];
|
|
workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1];
|
|
workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2];
|
|
}
|
|
|
|
if (!kernelMD->mAttrs.mVecTypeHint.empty()) {
|
|
workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
|
|
|
workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
|
|
if (workGroupInfo_.size_ == 0) {
|
|
return false;
|
|
}
|
|
|
|
initPrintf(programMD->mPrintf);
|
|
|
|
/*FIXME_lmoriche:
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
|
|
RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
|
|
&workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable();
|
|
*/
|
|
|
|
return true;
|
|
}
|
|
#endif // defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
|
|
} // namespace pal
|