953d3c86e0
SWDEV-79445 - OCL generic changes and code clean-up - Remove pinOffset_ field, since the pinning offset can be combined with global offset_ field Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#50 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#16 edit
1583 lines
52 KiB
C++
1583 lines
52 KiB
C++
//
|
|
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
#include "device/pal/palkernel.hpp"
|
|
#include "device/pal/palprogram.hpp"
|
|
#include "device/pal/palblit.hpp"
|
|
#include "device/pal/palconstbuf.hpp"
|
|
#include "device/pal/palsched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
#include <algorithm>
|
|
|
|
namespace pal {
|
|
|
|
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
|
|
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
|
|
if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
|
|
} else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
|
|
} else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
|
|
} else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
|
|
} else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
|
|
} else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
|
|
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
|
|
}
|
|
return HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
}
|
|
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_QUEUE:
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
case ARG_TYPE_VALUE:
|
|
return (argInfo->arg.value.data == DATATYPE_struct) ? HSAIL_ARGTYPE_REFERENCE
|
|
: HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1;
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2;
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8;
|
|
case DATATYPE_struct:
|
|
return 128;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER:
|
|
return sizeof(cl_sampler);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgPointeeAlignment(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
return argInfo->arg.pointer.align;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
|
|
aclAccessType accessType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
accessType = argInfo->arg.pointer.type;
|
|
} else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
accessType = argInfo->arg.image.type;
|
|
} else {
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
if (accessType == ACCESS_TYPE_RO) {
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
} else if (accessType == ACCESS_TYPE_WO) {
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
}
|
|
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
return HSAIL_ADDRESS_CONSTANT;
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
} else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * argInfo->arg.value.numElements;
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * argInfo->arg.value.numElements;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
|
|
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
|
|
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
|
|
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
|
|
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
|
|
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
|
|
};
|
|
|
|
uint sizeType;
|
|
uint numElements;
|
|
if (arg->type_ == HSAIL_ARGTYPE_QUEUE) {
|
|
return T_QUEUE;
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
return T_POINTER;
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_VALUE || arg->type_ == HSAIL_ARGTYPE_REFERENCE) {
|
|
switch (arg->dataType_) {
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
sizeType = 0;
|
|
numElements = arg->size_;
|
|
break;
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_U16:
|
|
sizeType = 1;
|
|
numElements = arg->size_ / 2;
|
|
break;
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_U32:
|
|
sizeType = 2;
|
|
numElements = arg->size_ / 4;
|
|
break;
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_U64:
|
|
sizeType = 3;
|
|
numElements = arg->size_ / 8;
|
|
break;
|
|
case HSAIL_DATATYPE_F16:
|
|
sizeType = 4;
|
|
numElements = arg->size_ / 2;
|
|
break;
|
|
case HSAIL_DATATYPE_F32:
|
|
sizeType = 4;
|
|
numElements = arg->size_ / 4;
|
|
break;
|
|
case HSAIL_DATATYPE_F64:
|
|
sizeType = 5;
|
|
numElements = arg->size_ / 8;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
|
|
switch (numElements) {
|
|
case 1:
|
|
return ClkValueMapType[sizeType][0];
|
|
case 2:
|
|
return ClkValueMapType[sizeType][1];
|
|
case 3:
|
|
return ClkValueMapType[sizeType][2];
|
|
case 4:
|
|
return ClkValueMapType[sizeType][3];
|
|
case 8:
|
|
return ClkValueMapType[sizeType][4];
|
|
case 16:
|
|
return ClkValueMapType[sizeType][5];
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
} else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const HSAILKernel::Argument* arg) {
|
|
if (arg->type_ == HSAIL_ARGTYPE_POINTER) {
|
|
switch (arg->addrQual_) {
|
|
case HSAIL_ADDRESS_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case HSAIL_ADDRESS_CONSTANT:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case HSAIL_ADDRESS_LOCAL:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
} else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
// default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const HSAILKernel::Argument* arg) {
|
|
if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
switch (arg->access_) {
|
|
case HSAIL_ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case HSAIL_ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case HSAIL_ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
|
|
if (!sym) {
|
|
return false;
|
|
}
|
|
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
|
|
return false;
|
|
}
|
|
|
|
amd_kernel_code_t* akc =
|
|
reinterpret_cast<amd_kernel_code_t*>(prog().findHostKernelAddress(code_));
|
|
cpuAqlCode_ = akc;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
|
|
reinterpret_cast<void*>(&codeSize_))) {
|
|
return false;
|
|
}
|
|
size_t akc_align = 0;
|
|
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
|
|
reinterpret_cast<void*>(&akc_align))) {
|
|
return false;
|
|
}
|
|
|
|
assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned");
|
|
workGroupInfo_.scratchRegs_ =
|
|
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
|
|
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
|
|
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
|
|
akc->workgroup_group_segment_byte_size;
|
|
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
|
|
workGroupInfo_.usedStackSize_ = 0;
|
|
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
|
|
|
|
if (!prog().isNull()) {
|
|
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
|
|
workGroupInfo_.availableSGPRs_ =
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
|
|
workGroupInfo_.availableVGPRs_ =
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
|
|
dev().info().wavefrontWidth_;
|
|
} else {
|
|
workGroupInfo_.availableLDSSize_ = 64 * Ki;
|
|
workGroupInfo_.availableSGPRs_ = 104;
|
|
workGroupInfo_.availableVGPRs_ = 256;
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
// skip the hidden arguments
|
|
if (arguments_[i]->index_ == uint(-1)) continue;
|
|
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(arguments_[i]);
|
|
desc.addressQualifier_ = GetOclAddrQual(arguments_[i]);
|
|
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
} else {
|
|
desc.size_ = arguments_[i]->size_;
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
|
// Iterate through the each kernel argument
|
|
for (uint index = 0; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
arg->access_ = GetHSAILArgAccessType(aclArg);
|
|
arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg);
|
|
|
|
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
|
|
arg->index_ = isHidden ? uint(-1) : index++;
|
|
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) {
|
|
PrintfInfo info;
|
|
uint index = 0;
|
|
for (; aclPrintf->struct_size != 0; aclPrintf++) {
|
|
index = aclPrintf->ID;
|
|
if (printf_.size() <= index) {
|
|
printf_.resize(index + 1);
|
|
}
|
|
std::string pfmt = aclPrintf->fmtStr;
|
|
info.fmtString_.clear();
|
|
bool need_nl = true;
|
|
for (size_t pos = 0; pos < pfmt.size(); ++pos) {
|
|
char symbol = pfmt[pos];
|
|
need_nl = true;
|
|
if (symbol == '\\') {
|
|
// Rest of the C escape sequences (e.g. \') are handled correctly
|
|
// by the MDParser, we are not sure exactly how!
|
|
switch (pfmt[pos + 1]) {
|
|
case 'a':
|
|
pos++;
|
|
symbol = '\a';
|
|
break;
|
|
case 'b':
|
|
pos++;
|
|
symbol = '\b';
|
|
break;
|
|
case 'f':
|
|
pos++;
|
|
symbol = '\f';
|
|
break;
|
|
case 'n':
|
|
pos++;
|
|
symbol = '\n';
|
|
need_nl = false;
|
|
break;
|
|
case 'r':
|
|
pos++;
|
|
symbol = '\r';
|
|
break;
|
|
case 'v':
|
|
pos++;
|
|
symbol = '\v';
|
|
break;
|
|
case '7':
|
|
if (pfmt[pos + 2] == '2') {
|
|
pos += 2;
|
|
symbol = '\72';
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
info.fmtString_.push_back(symbol);
|
|
}
|
|
if (need_nl) {
|
|
info.fmtString_ += "\n";
|
|
}
|
|
uint32_t* tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
|
|
for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) {
|
|
info.arguments_.push_back(*tmp_ptr);
|
|
}
|
|
printf_[index] = info;
|
|
info.arguments_.clear();
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
|
|
: device::Kernel(name),
|
|
compileOptions_(compileOptions),
|
|
dev_(prog->dev()),
|
|
prog_(*prog),
|
|
index_(0),
|
|
code_(0),
|
|
codeSize_(0),
|
|
waveLimiter_(
|
|
this,
|
|
(prog->isNull() ? 1
|
|
: dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
|
|
dev().hwInfo()->simdPerCU_) {
|
|
hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel() {
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
}
|
|
|
|
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
assert(!"Should not reach here");
|
|
#else // !defined(WITH_LIGHTNING_COMPILER)
|
|
acl_error error = ACL_SUCCESS;
|
|
std::string openClKernelName = openclMangledName(name());
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
// compile kernel down to ISA
|
|
if (finalize) {
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (dev().settings().svmFineGrainSystem_) {
|
|
options.append(" -sc-xnack-iommu");
|
|
}
|
|
error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
|
|
ACL_TYPE_ISA, nullptr);
|
|
buildLog_ += aclGetCompilerLog(dev().compiler());
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize kernel");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
aqlCreateHWInfo(sym);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), nullptr, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (nullptr == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete[] aclArgList;
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
} else {
|
|
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (nullptr == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete[] aclPrintfList;
|
|
}
|
|
|
|
aclMetadata md;
|
|
md.enqueue_kernel = false;
|
|
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
|
|
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = md.enqueue_kernel;
|
|
|
|
md.kernel_index = -1;
|
|
size_t sizeOfIndex = sizeof(md.kernel_index);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
|
|
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = md.kernel_index;
|
|
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
|
|
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
|
|
&sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable();
|
|
|
|
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
|
|
&sizeOfWorkGroupSizeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfVecTypeHint;
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
if (0 != sizeOfVecTypeHint) {
|
|
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
|
|
if (NULL == VecTypeHint) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
VecTypeHint[sizeOfVecTypeHint] = '\0';
|
|
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
|
|
delete[] VecTypeHint;
|
|
}
|
|
|
|
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
|
return true;
|
|
}
|
|
|
|
bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const {
|
|
// HSAIL path doesn't require memory reallocations
|
|
return true;
|
|
}
|
|
|
|
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
|
|
|
|
const HSAILProgram& HSAILKernel::prog() const {
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const {
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
size_t thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if (flags_.imageEna_ &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) && ((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
} else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
} else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--)
|
|
;
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Assuming DWORD access
|
|
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
|
// or size is too small for the cache line
|
|
(lclWorkSize[0] < cacheLineMatch))) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Use X dimension as high priority. Runtime will assume that
|
|
// X dimension is more important for the address calculation
|
|
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
|
lclWorkSize[0] = cacheLineMatch;
|
|
thrPerGrp /= cacheLineMatch;
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 1; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
} else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
} else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
} else {
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static void WriteAqlArg(
|
|
unsigned char** dst, //!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
} else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
|
(1 << HSA_PACKET_HEADER_BARRIER) |
|
|
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
|
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
|
|
|
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
|
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
gpu.addVmMemory(cb);
|
|
}
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
// Find all parameters for the current kernel
|
|
for (auto arg : arguments_) {
|
|
const_address paramaddr = nullptr;
|
|
if (arg->index_ != uint(-1)) {
|
|
paramaddr = parameters + signature.at(arg->index_).offset_;
|
|
}
|
|
|
|
// Handle the hidden arguments first, as they do not have a
|
|
// matching parameter in the OCL signature (not a valid arg->index_)
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
|
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
|
|
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
|
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
|
|
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
|
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
|
|
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
|
size_t bufferPtr = 0;
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
|
|
// and set the fourth argument as the printf_buffer pointer
|
|
bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
|
assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
|
assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_HIDDEN_NONE: {
|
|
void* zero = 0;
|
|
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
|
WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_POINTER: {
|
|
// If it is a local pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
|
|
ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
|
|
break;
|
|
}
|
|
assert(
|
|
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
|
|
"Unsupported address qualifier");
|
|
|
|
// If it is a global pointer
|
|
Memory* gpuMem = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
|
|
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
|
|
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
// If finegrainsystem is present then the pointer can be malloced by the app and
|
|
// passed to kernel directly. If so copy the pointer location to aqlArgBuf
|
|
else if (!dev().isFineGrainedSystem(true)) {
|
|
return nullptr;
|
|
}
|
|
break;
|
|
}
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
|
|
if (nullptr != gpuMem) {
|
|
mem = gpuMem->owner();
|
|
}
|
|
} else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
if (gpuMem == nullptr) {
|
|
WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_);
|
|
break;
|
|
}
|
|
|
|
//! 64 bit isn't supported with 32 bit binary
|
|
uint64_t globalAddress = gpuMem->vmAddress();
|
|
WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_);
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
|
|
// save the memory object pointer to allow global memory access
|
|
if (nullptr != dev().hwDebugMgr()) {
|
|
dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner());
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_REFERENCE: {
|
|
// Copy the current structure into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
size_t gpuPtr = static_cast<size_t>(cb->vmAddress() + cb->wrtOffset());
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t));
|
|
gpu.addVmMemory(cb);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
if (nativeMem) {
|
|
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
|
|
} else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem == nullptr) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return nullptr;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! \note Special case for the image views.
|
|
//! Copy SRD to CB1, so blit manager will be able to release
|
|
//! this view without a wait for SRD resource.
|
|
if (image->memoryType() == Resource::ImageView) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(HsaImageObjectSize);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
gpu.addVmMemory(cb);
|
|
} else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
gpu.addVmMemory(image);
|
|
if (image->desc().isDoppTexture_) {
|
|
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
|
|
kernel.parameters().getExecPfpaVcop());
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
const amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(paramaddr);
|
|
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
const amd::DeviceQueue* queue = *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue;
|
|
if (dev().settings().useDeviceQueue_) {
|
|
vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
} else {
|
|
if (!gpu.createVirtualQueue(queue->size())) {
|
|
LogError("Virtual queue creation failed!");
|
|
return nullptr;
|
|
}
|
|
vmQueue = gpu.vQueue()->vmAddress();
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported argument type ");
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return nullptr;
|
|
}
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
// Check there is no arguments' buffer overflow. We may not use all the
|
|
// hidden argument slots.
|
|
assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize()));
|
|
#else // !defined(WITH_LIGHTNING_COMPILER)
|
|
// HSAIL kernarg segment size is rounded up to multiple of 16.
|
|
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
|
hsa_kernel_dispatch_packet_t* hsaDisp =
|
|
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->sysMemCopy() + argsBufferSize());
|
|
|
|
amd::NDRange local(sizes.local());
|
|
const amd::NDRange& global = sizes.global();
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
|
|
|
hsaDisp->header = kDispatchPacketHeader;
|
|
hsaDisp->setup = sizes.dimensions();
|
|
|
|
hsaDisp->workgroup_size_x = local[0];
|
|
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
|
|
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
|
|
|
|
hsaDisp->grid_size_x = global[0];
|
|
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
|
|
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
|
|
hsaDisp->reserved2 = 0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
hsaDisp->private_segment_size = spillSegSize();
|
|
hsaDisp->group_segment_size = ldsAddress - ldsSize();
|
|
hsaDisp->kernel_object = gpuAqlCode();
|
|
|
|
ConstBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
|
|
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
|
|
hsaDisp->reserved2 = 0;
|
|
hsaDisp->completion_signal.handle = 0;
|
|
|
|
gpu.addVmMemory(cb);
|
|
gpu.addVmMemory(&prog().codeSegGpu());
|
|
for (pal::Memory* mem : prog().globalStores()) {
|
|
gpu.addVmMemory(mem);
|
|
}
|
|
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
|
|
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
|
gpu.addVmMemory(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource || prog().isStaticSampler()) {
|
|
dev().srds().fillResourceList(gpu);
|
|
}
|
|
|
|
return hsaDisp;
|
|
}
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
using llvm::AMDGPU::HSAMD::AccessQualifier;
|
|
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
|
|
using llvm::AMDGPU::HSAMD::ValueKind;
|
|
using llvm::AMDGPU::HSAMD::ValueType;
|
|
|
|
const LightningProgram& LightningKernel::prog() const {
|
|
return reinterpret_cast<const LightningProgram&>(prog_);
|
|
}
|
|
|
|
void LightningKernel::initPrintf(const std::vector<std::string>& printfInfoStrings) {
|
|
for (auto str : printfInfoStrings) {
|
|
std::vector<std::string> tokens;
|
|
|
|
size_t end, pos = 0;
|
|
do {
|
|
end = str.find_first_of(':', pos);
|
|
tokens.push_back(str.substr(pos, end - pos));
|
|
pos = end + 1;
|
|
} while (end != std::string::npos);
|
|
|
|
if (tokens.size() < 2) {
|
|
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
|
|
continue;
|
|
}
|
|
|
|
pos = 0;
|
|
size_t printfInfoID = std::stoi(tokens[pos++]);
|
|
if (printf_.size() <= printfInfoID) {
|
|
printf_.resize(printfInfoID + 1);
|
|
}
|
|
PrintfInfo& info = printf_[printfInfoID];
|
|
|
|
size_t numSizes = std::stoi(tokens[pos++]);
|
|
end = pos + numSizes;
|
|
|
|
// ensure that we have the correct number of tokens
|
|
if (tokens.size() < end + 1 /*last token is the fmtString*/) {
|
|
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
|
|
continue;
|
|
}
|
|
|
|
// push the argument sizes
|
|
while (pos < end) {
|
|
info.arguments_.push_back(std::stoi(tokens[pos++]));
|
|
}
|
|
|
|
// FIXME: We should not need this! [
|
|
std::string& fmt = tokens[pos];
|
|
bool need_nl = true;
|
|
|
|
for (pos = 0; pos < fmt.size(); ++pos) {
|
|
char symbol = fmt[pos];
|
|
need_nl = true;
|
|
if (symbol == '\\') {
|
|
switch (fmt[pos + 1]) {
|
|
case 'a':
|
|
pos++;
|
|
symbol = '\a';
|
|
break;
|
|
case 'b':
|
|
pos++;
|
|
symbol = '\b';
|
|
break;
|
|
case 'f':
|
|
pos++;
|
|
symbol = '\f';
|
|
break;
|
|
case 'n':
|
|
pos++;
|
|
symbol = '\n';
|
|
need_nl = false;
|
|
break;
|
|
case 'r':
|
|
pos++;
|
|
symbol = '\r';
|
|
break;
|
|
case 'v':
|
|
pos++;
|
|
symbol = '\v';
|
|
break;
|
|
case '7':
|
|
if (fmt[pos + 2] == '2') {
|
|
pos += 2;
|
|
symbol = '\72';
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
info.fmtString_.push_back(symbol);
|
|
}
|
|
if (need_nl) {
|
|
info.fmtString_ += "\n";
|
|
}
|
|
// ]
|
|
}
|
|
}
|
|
|
|
static inline HSAIL_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) {
|
|
switch (lcArg.mValueKind) {
|
|
case ValueKind::GlobalBuffer:
|
|
case ValueKind::DynamicSharedPointer:
|
|
case ValueKind::Pipe:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ValueKind::ByValue:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ValueKind::Image:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ValueKind::Sampler:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ValueKind::HiddenGlobalOffsetX:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
|
|
case ValueKind::HiddenGlobalOffsetY:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
|
|
case ValueKind::HiddenGlobalOffsetZ:
|
|
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
|
|
case ValueKind::HiddenPrintfBuffer:
|
|
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
|
|
case ValueKind::HiddenDefaultQueue:
|
|
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
|
|
case ValueKind::HiddenCompletionAction:
|
|
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
|
|
case ValueKind::HiddenNone:
|
|
return HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; }
|
|
|
|
static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
uint32_t align = lcArg.mPointeeAlign;
|
|
if (align == 0) {
|
|
LogWarning("Missing DynamicSharedPointer alignment");
|
|
align = 128; /* worst case alignment */
|
|
;
|
|
}
|
|
return align;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static inline HSAIL_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) {
|
|
switch (lcArg.mAccQual) {
|
|
case AccessQualifier::ReadOnly:
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
case AccessQualifier::WriteOnly:
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
case AccessQualifier::ReadWrite:
|
|
default:
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
}
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
|
|
static inline HSAIL_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
} else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
|
|
if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
|
|
return HSAIL_ADDRESS_CONSTANT;
|
|
}
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
} else if (lcArg.mValueKind == ValueKind::Image ||
|
|
lcArg.mValueKind == ValueKind::Sampler ||
|
|
lcArg.mValueKind == ValueKind::Pipe) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
static inline HSAIL_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
|
|
if (lcArg.mValueKind != ValueKind::ByValue) {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
|
|
switch (lcArg.mValueType) {
|
|
case ValueType::I8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case ValueType::I16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case ValueType::I32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case ValueType::I64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case ValueType::U8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case ValueType::U16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case ValueType::U32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case ValueType::U64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case ValueType::F16:
|
|
return HSAIL_DATATYPE_F16;
|
|
case ValueType::F32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case ValueType::F64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case ValueType::Struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
|
|
lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
|
|
if (lcArg.mIsVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (lcArg.mIsRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (lcArg.mIsConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
}
|
|
else if (lcArg.mIsPipe) {
|
|
assert(lcArg.mValueKind == ValueKind::Pipe);
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
|
device::Kernel::parameters_t params;
|
|
|
|
size_t offset = 0;
|
|
|
|
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
|
|
const KernelArgMD& lcArg = kernelMD.mArgs[i];
|
|
|
|
// Initialize HSAIL kernel argument
|
|
auto arg = new HSAILKernel::Argument;
|
|
arg->name_ = lcArg.mName;
|
|
arg->typeName_ = lcArg.mTypeName;
|
|
arg->size_ = lcArg.mSize;
|
|
arg->type_ = GetKernelArgType(lcArg);
|
|
arg->addrQual_ = GetKernelAddrQual(lcArg);
|
|
arg->dataType_ = GetKernelDataType(lcArg);
|
|
arg->alignment_ = GetKernelArgAlignment(lcArg);
|
|
arg->access_ = GetKernelArgAccessType(lcArg);
|
|
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
|
|
|
|
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
|
|
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
|
|
|
|
arg->index_ = isHidden ? uint(-1) : params.size();
|
|
arguments_.push_back(arg);
|
|
|
|
if (isHidden) {
|
|
continue;
|
|
}
|
|
|
|
// Initialize Device kernel parameters
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
desc.name_ = lcArg.mName.c_str();
|
|
desc.type_ = GetOclType(arg);
|
|
desc.addressQualifier_ = GetOclAddrQual(arg);
|
|
desc.accessQualifier_ = GetOclAccessQual(arg);
|
|
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
|
desc.typeName_ = lcArg.mTypeName.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
} else {
|
|
desc.size_ = arg->size_;
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the parameters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
|
|
params.push_back(desc);
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
|
|
for (const KernelMD& kernelMD : programMD->mKernels) {
|
|
if (kernelMD.mName == name) {
|
|
return &kernelMD;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
|
|
aqlCreateHWInfo(symbol);
|
|
|
|
const CodeObjectMD* programMD = prog().metadata();
|
|
assert(programMD != nullptr);
|
|
|
|
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
|
|
|
|
if (kernelMD == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Set the argList
|
|
initArgList(*kernelMD);
|
|
|
|
if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
|
|
const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
|
|
workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0];
|
|
workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1];
|
|
workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2];
|
|
}
|
|
|
|
if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) {
|
|
const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint;
|
|
workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0];
|
|
workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1];
|
|
workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2];
|
|
}
|
|
|
|
if (!kernelMD->mAttrs.mVecTypeHint.empty()) {
|
|
workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
|
|
|
workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
|
|
if (workGroupInfo_.size_ == 0) {
|
|
return false;
|
|
}
|
|
|
|
initPrintf(programMD->mPrintf);
|
|
|
|
/*FIXME_lmoriche:
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
|
|
RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
|
|
&workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable();
|
|
*/
|
|
|
|
return true;
|
|
}
|
|
#endif // defined(WITH_LIGHTNING_COMPILER)
|
|
|
|
|
|
} // namespace pal
|