Files
rocm-systems/rocclr/runtime/device/pal/palkernel.cpp
T
foreman db04b3b295 P4 to Git Change 1530931 by gandryey@gera-w8 on 2018/03/22 16:38:47
SWDEV-79445 - OCL generic changes and code clean-up
	- Prepare the changes to remove resource rename feature. Resource rename can substitute a low level memory object in the current resource with a different one in order to avoid GPU stalls on frequent CPU updates, like constant buffers or staging buffers. Renaming was necessary due to CAL and old HW limitations to manage suballocations and has some inefficiency with tiny updates and staging buffers.
	- This change removes renames usage from the constant buffer management

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#45 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#78 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#43 edit
2018-03-22 16:51:43 -04:00

1583 wiersze
52 KiB
C++

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palsched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"
#include "acl.h"
#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <algorithm>
namespace pal {
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') {
if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) {
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
} else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) {
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
} else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) {
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
} else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
} else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
} else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) {
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
}
return HSAIL_ARGTYPE_HIDDEN_NONE;
}
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return HSAIL_ARGTYPE_POINTER;
case ARG_TYPE_QUEUE:
return HSAIL_ARGTYPE_QUEUE;
case ARG_TYPE_VALUE:
return (argInfo->arg.value.data == DATATYPE_struct) ? HSAIL_ARGTYPE_REFERENCE
: HSAIL_ARGTYPE_VALUE;
case ARG_TYPE_IMAGE:
return HSAIL_ARGTYPE_IMAGE;
case ARG_TYPE_SAMPLER:
return HSAIL_ARGTYPE_SAMPLER;
case ARG_TYPE_ERROR:
default:
return HSAIL_ARGTYPE_ERROR;
}
}
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return sizeof(void*);
case ARG_TYPE_VALUE:
switch (argInfo->arg.value.data) {
case DATATYPE_i8:
case DATATYPE_u8:
return 1;
case DATATYPE_u16:
case DATATYPE_i16:
case DATATYPE_f16:
return 2;
case DATATYPE_u32:
case DATATYPE_i32:
case DATATYPE_f32:
return 4;
case DATATYPE_i64:
case DATATYPE_u64:
case DATATYPE_f64:
return 8;
case DATATYPE_struct:
return 128;
case DATATYPE_ERROR:
default:
return -1;
}
case ARG_TYPE_IMAGE:
return sizeof(cl_mem);
case ARG_TYPE_SAMPLER:
return sizeof(cl_sampler);
default:
return -1;
}
}
inline static size_t GetHSAILArgPointeeAlignment(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
return argInfo->arg.pointer.align;
}
return 1;
}
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
aclAccessType accessType;
if (argInfo->type == ARG_TYPE_POINTER) {
accessType = argInfo->arg.pointer.type;
} else if (argInfo->type == ARG_TYPE_IMAGE) {
accessType = argInfo->arg.image.type;
} else {
return HSAIL_ACCESS_TYPE_NONE;
}
if (accessType == ACCESS_TYPE_RO) {
return HSAIL_ACCESS_TYPE_RO;
} else if (accessType == ACCESS_TYPE_WO) {
return HSAIL_ACCESS_TYPE_WO;
}
return HSAIL_ACCESS_TYPE_RW;
}
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
case PTR_MT_CONSTANT:
return HSAIL_ADDRESS_CONSTANT;
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return HSAIL_ADDRESS_LOCAL;
case PTR_MT_SCRATCH_EMU:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_ERROR:
default:
LogError("Unsupported address type");
return HSAIL_ADDRESS_ERROR;
}
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
return HSAIL_ADDRESS_GLOBAL;
} else if (argInfo->type == ARG_TYPE_QUEUE) {
return HSAIL_ADDRESS_GLOBAL;
}
return HSAIL_ADDRESS_ERROR;
}
/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
aclArgDataType dataType;
if (argInfo->type == ARG_TYPE_POINTER) {
dataType = argInfo->arg.pointer.data;
} else if (argInfo->type == ARG_TYPE_VALUE) {
dataType = argInfo->arg.value.data;
} else {
return HSAIL_DATATYPE_ERROR;
}
switch (dataType) {
case DATATYPE_i1:
return HSAIL_DATATYPE_B1;
case DATATYPE_i8:
return HSAIL_DATATYPE_S8;
case DATATYPE_i16:
return HSAIL_DATATYPE_S16;
case DATATYPE_i32:
return HSAIL_DATATYPE_S32;
case DATATYPE_i64:
return HSAIL_DATATYPE_S64;
case DATATYPE_u8:
return HSAIL_DATATYPE_U8;
case DATATYPE_u16:
return HSAIL_DATATYPE_U16;
case DATATYPE_u32:
return HSAIL_DATATYPE_U32;
case DATATYPE_u64:
return HSAIL_DATATYPE_U64;
case DATATYPE_f16:
return HSAIL_DATATYPE_F32;
case DATATYPE_f32:
return HSAIL_DATATYPE_F32;
case DATATYPE_f64:
return HSAIL_DATATYPE_F64;
case DATATYPE_struct:
return HSAIL_DATATYPE_STRUCT;
case DATATYPE_opaque:
return HSAIL_DATATYPE_OPAQUE;
case DATATYPE_ERROR:
default:
return HSAIL_DATATYPE_ERROR;
}
}
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return sizeof(void*);
case ARG_TYPE_VALUE:
switch (argInfo->arg.value.data) {
case DATATYPE_i8:
case DATATYPE_u8:
case DATATYPE_struct:
return 1 * argInfo->arg.value.numElements;
case DATATYPE_u16:
case DATATYPE_i16:
case DATATYPE_f16:
return 2 * argInfo->arg.value.numElements;
case DATATYPE_u32:
case DATATYPE_i32:
case DATATYPE_f32:
return 4 * argInfo->arg.value.numElements;
case DATATYPE_i64:
case DATATYPE_u64:
case DATATYPE_f64:
return 8 * argInfo->arg.value.numElements;
case DATATYPE_ERROR:
default:
return -1;
}
case ARG_TYPE_IMAGE:
case ARG_TYPE_SAMPLER:
case ARG_TYPE_QUEUE:
return sizeof(void*);
default:
return -1;
}
}
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
static const clk_value_type_t ClkValueMapType[6][6] = {
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
};
uint sizeType;
uint numElements;
if (arg->type_ == HSAIL_ARGTYPE_QUEUE) {
return T_QUEUE;
} else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) {
return T_POINTER;
} else if (arg->type_ == HSAIL_ARGTYPE_VALUE || arg->type_ == HSAIL_ARGTYPE_REFERENCE) {
switch (arg->dataType_) {
case HSAIL_DATATYPE_S8:
case HSAIL_DATATYPE_U8:
sizeType = 0;
numElements = arg->size_;
break;
case HSAIL_DATATYPE_S16:
case HSAIL_DATATYPE_U16:
sizeType = 1;
numElements = arg->size_ / 2;
break;
case HSAIL_DATATYPE_S32:
case HSAIL_DATATYPE_U32:
sizeType = 2;
numElements = arg->size_ / 4;
break;
case HSAIL_DATATYPE_S64:
case HSAIL_DATATYPE_U64:
sizeType = 3;
numElements = arg->size_ / 8;
break;
case HSAIL_DATATYPE_F16:
sizeType = 4;
numElements = arg->size_ / 2;
break;
case HSAIL_DATATYPE_F32:
sizeType = 4;
numElements = arg->size_ / 4;
break;
case HSAIL_DATATYPE_F64:
sizeType = 5;
numElements = arg->size_ / 8;
break;
default:
return T_VOID;
}
switch (numElements) {
case 1:
return ClkValueMapType[sizeType][0];
case 2:
return ClkValueMapType[sizeType][1];
case 3:
return ClkValueMapType[sizeType][2];
case 4:
return ClkValueMapType[sizeType][3];
case 8:
return ClkValueMapType[sizeType][4];
case 16:
return ClkValueMapType[sizeType][5];
default:
return T_VOID;
}
} else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
return T_SAMPLER;
} else {
return T_VOID;
}
}
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const HSAILKernel::Argument* arg) {
if (arg->type_ == HSAIL_ARGTYPE_POINTER) {
switch (arg->addrQual_) {
case HSAIL_ADDRESS_GLOBAL:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case HSAIL_ADDRESS_CONSTANT:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
case HSAIL_ADDRESS_LOCAL:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
} else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
}
// default for all other cases
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const HSAILKernel::Argument* arg) {
if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
switch (arg->access_) {
case HSAIL_ACCESS_TYPE_RO:
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
case HSAIL_ACCESS_TYPE_WO:
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
case HSAIL_ACCESS_TYPE_RW:
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
return CL_KERNEL_ARG_ACCESS_NONE;
}
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
if (argInfo->type == ARG_TYPE_POINTER) {
if (argInfo->arg.pointer.isVolatile) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (argInfo->arg.pointer.isRestrict) {
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
if (argInfo->arg.pointer.isPipe) {
rv |= CL_KERNEL_ARG_TYPE_PIPE;
}
if (argInfo->isConst) {
rv |= CL_KERNEL_ARG_TYPE_CONST;
}
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
rv |= CL_KERNEL_ARG_TYPE_CONST;
break;
default:
break;
}
}
return rv;
}
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
if (!sym) {
return false;
}
if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
return false;
}
amd_kernel_code_t* akc =
reinterpret_cast<amd_kernel_code_t*>(prog().findHostKernelAddress(code_));
cpuAqlCode_ = akc;
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
reinterpret_cast<void*>(&codeSize_))) {
return false;
}
size_t akc_align = 0;
if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
reinterpret_cast<void*>(&akc_align))) {
return false;
}
assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned");
workGroupInfo_.scratchRegs_ =
amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
akc->workgroup_group_segment_byte_size;
workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
workGroupInfo_.usedStackSize_ = 0;
workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
if (!prog().isNull()) {
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
workGroupInfo_.availableSGPRs_ =
dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
workGroupInfo_.availableVGPRs_ =
dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
dev().info().wavefrontWidth_;
} else {
workGroupInfo_.availableLDSSize_ = 64 * Ki;
workGroupInfo_.availableSGPRs_ = 104;
workGroupInfo_.availableVGPRs_ = 256;
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
}
return true;
}
void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Initialize the hsail argument list too
initHsailArgs(aclArg);
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
// skip the hidden arguments
if (arguments_[i]->index_ == uint(-1)) continue;
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(arguments_[i]);
desc.addressQualifier_ = GetOclAddrQual(arguments_[i]);
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
} else {
desc.size_ = arguments_[i]->size_;
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
flags_.imageEna_ = true;
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
flags_.imageWriteEna_ = true;
}
}
}
createSignature(params);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
// Iterate through the each kernel argument
for (uint index = 0; aclArg->struct_size != 0; aclArg++) {
Argument* arg = new Argument;
// Initialize HSAIL kernel argument
arg->name_ = aclArg->argStr;
arg->typeName_ = aclArg->typeStr;
arg->size_ = GetHSAILArgSize(aclArg);
arg->type_ = GetHSAILArgType(aclArg);
arg->addrQual_ = GetHSAILAddrQual(aclArg);
arg->dataType_ = GetHSAILDataType(aclArg);
arg->alignment_ = GetHSAILArgAlignment(aclArg);
arg->access_ = GetHSAILArgAccessType(aclArg);
arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg);
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
arg->index_ = isHidden ? uint(-1) : index++;
arguments_.push_back(arg);
}
}
void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) {
PrintfInfo info;
uint index = 0;
for (; aclPrintf->struct_size != 0; aclPrintf++) {
index = aclPrintf->ID;
if (printf_.size() <= index) {
printf_.resize(index + 1);
}
std::string pfmt = aclPrintf->fmtStr;
info.fmtString_.clear();
bool need_nl = true;
for (size_t pos = 0; pos < pfmt.size(); ++pos) {
char symbol = pfmt[pos];
need_nl = true;
if (symbol == '\\') {
// Rest of the C escape sequences (e.g. \') are handled correctly
// by the MDParser, we are not sure exactly how!
switch (pfmt[pos + 1]) {
case 'a':
pos++;
symbol = '\a';
break;
case 'b':
pos++;
symbol = '\b';
break;
case 'f':
pos++;
symbol = '\f';
break;
case 'n':
pos++;
symbol = '\n';
need_nl = false;
break;
case 'r':
pos++;
symbol = '\r';
break;
case 'v':
pos++;
symbol = '\v';
break;
case '7':
if (pfmt[pos + 2] == '2') {
pos += 2;
symbol = '\72';
}
break;
default:
break;
}
}
info.fmtString_.push_back(symbol);
}
if (need_nl) {
info.fmtString_ += "\n";
}
uint32_t* tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) {
info.arguments_.push_back(*tmp_ptr);
}
printf_[index] = info;
info.arguments_.clear();
}
}
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
: device::Kernel(name),
compileOptions_(compileOptions),
dev_(prog->dev()),
prog_(*prog),
index_(0),
code_(0),
codeSize_(0),
waveLimiter_(
this,
(prog->isNull() ? 1
: dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
dev().hwInfo()->simdPerCU_) {
hsa_ = true;
}
HSAILKernel::~HSAILKernel() {
while (!arguments_.empty()) {
Argument* arg = arguments_.back();
delete arg;
arguments_.pop_back();
}
}
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
#if defined(WITH_LIGHTNING_COMPILER)
assert(!"Should not reach here");
#else // !defined(WITH_LIGHTNING_COMPILER)
acl_error error = ACL_SUCCESS;
std::string openClKernelName = openclMangledName(name());
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
// compile kernel down to ISA
if (finalize) {
std::string options(compileOptions_.c_str());
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
// Append an option so that we can selectively enable a SCOption on CZ
// whenever IOMMUv2 is enabled.
if (dev().settings().svmFineGrainSystem_) {
options.append(" -sc-xnack-iommu");
}
error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
ACL_TYPE_ISA, nullptr);
buildLog_ += aclGetCompilerLog(dev().compiler());
if (error != ACL_SUCCESS) {
LogError("Failed to finalize kernel");
return false;
}
}
aqlCreateHWInfo(sym);
// Pull out metadata from the ELF
size_t sizeOfArgList;
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
char* aclArgList = new char[sizeOfArgList];
if (nullptr == aclArgList) {
return false;
}
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the argList
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
delete[] aclArgList;
size_t sizeOfWorkGroupSize;
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
} else {
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Make sure kernel has any printf info
if (0 != sizeOfPrintfList) {
char* aclPrintfList = new char[sizeOfPrintfList];
if (nullptr == aclPrintfList) {
return false;
}
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the PrintfList
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
delete[] aclPrintfList;
}
aclMetadata md;
md.enqueue_kernel = false;
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
if (error != ACL_SUCCESS) {
return false;
}
flags_.dynamicParallelism_ = md.enqueue_kernel;
md.kernel_index = -1;
size_t sizeOfIndex = sizeof(md.kernel_index);
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
if (error != ACL_SUCCESS) {
return false;
}
index_ = md.kernel_index;
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
&sizeOfWavesPerSimdHint);
if (error != ACL_SUCCESS) {
return false;
}
waveLimiter_.enable();
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
&sizeOfWorkGroupSizeHint);
if (error != ACL_SUCCESS) {
return false;
}
size_t sizeOfVecTypeHint;
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
if (error != ACL_SUCCESS) {
return false;
}
if (0 != sizeOfVecTypeHint) {
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
if (NULL == VecTypeHint) {
return false;
}
error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
if (error != ACL_SUCCESS) {
return false;
}
VecTypeHint[sizeOfVecTypeHint] = '\0';
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
delete[] VecTypeHint;
}
#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
}
bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const {
// HSAIL path doesn't require memory reallocations
return true;
}
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast<const HSAILProgram&>(prog_);
}
void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const {
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
size_t thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) && ((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
} else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
} else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--)
;
lclWorkSize[d] = div;
tmp /= div;
}
// Assuming DWORD access
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
// or size is too small for the cache line
(lclWorkSize[0] < cacheLineMatch))) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Use X dimension as high priority. Runtime will assume that
// X dimension is more important for the address calculation
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
lclWorkSize[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 1; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
} else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
} else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
} else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
} else {
assert(0 && "Invalid workDim!");
}
}
}
} else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
inline static void WriteAqlArg(
unsigned char** dst, //!< The write pointer to the buffer
const void* src, //!< The source pointer
uint size, //!< The size in bytes to copy
uint alignment = 0 //!< The alignment to follow while writing to the buffer
) {
if (alignment == 0) {
*dst = amd::alignUp(*dst, size);
} else {
*dst = amd::alignUp(*dst, alignment);
}
memcpy(*dst, src, size);
*dst += size;
}
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_BUSY;
ManagedBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(sizeof(AmdAqlWrap));
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
gpu.addVmMemory(cb->activeMemory());
}
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Find all parameters for the current kernel
for (auto arg : arguments_) {
const_address paramaddr = nullptr;
if (arg->index_ != uint(-1)) {
paramaddr = parameters + signature.at(arg->index_).offset_;
}
// Handle the hidden arguments first, as they do not have a
// matching parameter in the OCL signature (not a valid arg->index_)
switch (arg->type_) {
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
size_t bufferPtr = 0;
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
// and set the fourth argument as the printf_buffer pointer
bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
}
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
break;
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
break;
case HSAIL_ARGTYPE_HIDDEN_NONE: {
void* zero = 0;
assert(arg->size_ <= sizeof(zero) && "check the sizes");
WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
break;
}
case HSAIL_ARGTYPE_POINTER: {
// If it is a local pointer
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
break;
}
assert(
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
"Unsupported address qualifier");
// If it is a global pointer
Memory* gpuMem = nullptr;
amd::Memory* mem = nullptr;
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
gpuMem->wait(gpu, WaitOnBusyEngine);
if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
mem->signalWrite(&dev());
}
gpu.addVmMemory(gpuMem);
}
// If finegrainsystem is present then the pointer can be malloced by the app and
// passed to kernel directly. If so copy the pointer location to aqlArgBuf
else if (!dev().isFineGrainedSystem(true)) {
return nullptr;
}
break;
}
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
if (nullptr != gpuMem) {
mem = gpuMem->owner();
}
} else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
}
}
if (gpuMem == nullptr) {
WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_);
break;
}
//! 64 bit isn't supported with 32 bit binary
uint64_t globalAddress = gpuMem->vmAddress();
WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_);
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, WaitOnBusyEngine);
//! @todo Compiler has to return read/write attributes
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
gpu.addVmMemory(gpuMem);
// save the memory object pointer to allow global memory access
if (nullptr != dev().hwDebugMgr()) {
dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner());
}
break;
}
case HSAIL_ARGTYPE_REFERENCE: {
// Copy the current structure into CB1
memcpy(aqlStruct, paramaddr, arg->size_);
ManagedBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(arg->size_);
// Then use a pointer in aqlArgBuffer to CB1
size_t gpuPtr = static_cast<size_t>(cb->vmAddress() + cb->wrtOffset());
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t));
gpu.addVmMemory(cb->activeMemory());
break;
}
case HSAIL_ARGTYPE_VALUE:
WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = nullptr;
amd::Memory* mem = nullptr;
if (nativeMem) {
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
} else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem == nullptr) {
LogError("The kernel image argument isn't an image object!");
return nullptr;
}
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ManagedBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
gpu.addVmMemory(cb->activeMemory());
} else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
gpu.addVmMemory(image);
if (image->desc().isDoppTexture_) {
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
}
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
const amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(paramaddr);
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
const amd::DeviceQueue* queue = *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!gpu.createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return nullptr;
}
vmQueue = gpu.vQueue()->vmAddress();
}
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue));
break;
}
default:
LogError(" Unsupported argument type ");
return nullptr;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return nullptr;
}
#if defined(WITH_LIGHTNING_COMPILER)
// Check there is no arguments' buffer overflow. We may not use all the
// hidden argument slots.
assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize()));
#else // !defined(WITH_LIGHTNING_COMPILER)
// HSAIL kernarg segment size is rounded up to multiple of 16.
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
#endif // !defined(WITH_LIGHTNING_COMPILER)
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->sysMemCopy() + argsBufferSize());
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
hsaDisp->workgroup_size_x = local[0];
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
hsaDisp->grid_size_x = global[0];
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
hsaDisp->reserved2 = 0;
// Initialize kernel ISA and execution buffer requirements
hsaDisp->private_segment_size = spillSegSize();
hsaDisp->group_segment_size = ldsAddress - ldsSize();
hsaDisp->kernel_object = gpuAqlCode();
ManagedBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
hsaDisp->reserved2 = 0;
hsaDisp->completion_signal.handle = 0;
gpu.addVmMemory(cb->activeMemory());
gpu.addVmMemory(&prog().codeSegGpu());
for (pal::Memory* mem : prog().globalStores()) {
gpu.addVmMemory(mem);
}
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
gpu.addVmMemory(gpu.hsaQueueMem());
}
if (srdResource || prog().isStaticSampler()) {
dev().srds().fillResourceList(gpu);
}
return hsaDisp;
}
#if defined(WITH_LIGHTNING_COMPILER)
using llvm::AMDGPU::HSAMD::AccessQualifier;
using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
using llvm::AMDGPU::HSAMD::ValueKind;
using llvm::AMDGPU::HSAMD::ValueType;
const LightningProgram& LightningKernel::prog() const {
return reinterpret_cast<const LightningProgram&>(prog_);
}
void LightningKernel::initPrintf(const std::vector<std::string>& printfInfoStrings) {
for (auto str : printfInfoStrings) {
std::vector<std::string> tokens;
size_t end, pos = 0;
do {
end = str.find_first_of(':', pos);
tokens.push_back(str.substr(pos, end - pos));
pos = end + 1;
} while (end != std::string::npos);
if (tokens.size() < 2) {
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
continue;
}
pos = 0;
size_t printfInfoID = std::stoi(tokens[pos++]);
if (printf_.size() <= printfInfoID) {
printf_.resize(printfInfoID + 1);
}
PrintfInfo& info = printf_[printfInfoID];
size_t numSizes = std::stoi(tokens[pos++]);
end = pos + numSizes;
// ensure that we have the correct number of tokens
if (tokens.size() < end + 1 /*last token is the fmtString*/) {
LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
continue;
}
// push the argument sizes
while (pos < end) {
info.arguments_.push_back(std::stoi(tokens[pos++]));
}
// FIXME: We should not need this! [
std::string& fmt = tokens[pos];
bool need_nl = true;
for (pos = 0; pos < fmt.size(); ++pos) {
char symbol = fmt[pos];
need_nl = true;
if (symbol == '\\') {
switch (fmt[pos + 1]) {
case 'a':
pos++;
symbol = '\a';
break;
case 'b':
pos++;
symbol = '\b';
break;
case 'f':
pos++;
symbol = '\f';
break;
case 'n':
pos++;
symbol = '\n';
need_nl = false;
break;
case 'r':
pos++;
symbol = '\r';
break;
case 'v':
pos++;
symbol = '\v';
break;
case '7':
if (fmt[pos + 2] == '2') {
pos += 2;
symbol = '\72';
}
break;
default:
break;
}
}
info.fmtString_.push_back(symbol);
}
if (need_nl) {
info.fmtString_ += "\n";
}
// ]
}
}
static inline HSAIL_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) {
switch (lcArg.mValueKind) {
case ValueKind::GlobalBuffer:
case ValueKind::DynamicSharedPointer:
case ValueKind::Pipe:
return HSAIL_ARGTYPE_POINTER;
case ValueKind::ByValue:
return HSAIL_ARGTYPE_VALUE;
case ValueKind::Image:
return HSAIL_ARGTYPE_IMAGE;
case ValueKind::Sampler:
return HSAIL_ARGTYPE_SAMPLER;
case ValueKind::HiddenGlobalOffsetX:
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
case ValueKind::HiddenGlobalOffsetY:
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
case ValueKind::HiddenGlobalOffsetZ:
return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
case ValueKind::HiddenPrintfBuffer:
return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER;
case ValueKind::HiddenDefaultQueue:
return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
case ValueKind::HiddenCompletionAction:
return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION;
case ValueKind::HiddenNone:
return HSAIL_ARGTYPE_HIDDEN_NONE;
default:
return HSAIL_ARGTYPE_ERROR;
}
}
static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; }
static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
uint32_t align = lcArg.mPointeeAlign;
if (align == 0) {
LogWarning("Missing DynamicSharedPointer alignment");
align = 128; /* worst case alignment */
;
}
return align;
}
return 1;
}
static inline HSAIL_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) {
switch (lcArg.mAccQual) {
case AccessQualifier::ReadOnly:
return HSAIL_ACCESS_TYPE_RO;
case AccessQualifier::WriteOnly:
return HSAIL_ACCESS_TYPE_WO;
case AccessQualifier::ReadWrite:
default:
return HSAIL_ACCESS_TYPE_RW;
}
}
return HSAIL_ACCESS_TYPE_NONE;
}
static inline HSAIL_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) {
if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
return HSAIL_ADDRESS_LOCAL;
} else if (lcArg.mValueKind == ValueKind::GlobalBuffer) {
if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) {
return HSAIL_ADDRESS_GLOBAL;
} else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) {
return HSAIL_ADDRESS_CONSTANT;
}
LogError("Unsupported address type");
return HSAIL_ADDRESS_ERROR;
} else if (lcArg.mValueKind == ValueKind::Image ||
lcArg.mValueKind == ValueKind::Sampler ||
lcArg.mValueKind == ValueKind::Pipe) {
return HSAIL_ADDRESS_GLOBAL;
}
return HSAIL_ADDRESS_ERROR;
}
static inline HSAIL_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
if (lcArg.mValueKind != ValueKind::ByValue) {
return HSAIL_DATATYPE_ERROR;
}
switch (lcArg.mValueType) {
case ValueType::I8:
return HSAIL_DATATYPE_S8;
case ValueType::I16:
return HSAIL_DATATYPE_S16;
case ValueType::I32:
return HSAIL_DATATYPE_S32;
case ValueType::I64:
return HSAIL_DATATYPE_S64;
case ValueType::U8:
return HSAIL_DATATYPE_U8;
case ValueType::U16:
return HSAIL_DATATYPE_U16;
case ValueType::U32:
return HSAIL_DATATYPE_U32;
case ValueType::U64:
return HSAIL_DATATYPE_U64;
case ValueType::F16:
return HSAIL_DATATYPE_F16;
case ValueType::F32:
return HSAIL_DATATYPE_F32;
case ValueType::F64:
return HSAIL_DATATYPE_F64;
case ValueType::Struct:
return HSAIL_DATATYPE_STRUCT;
default:
return HSAIL_DATATYPE_ERROR;
}
}
static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
if (lcArg.mValueKind == ValueKind::GlobalBuffer ||
lcArg.mValueKind == ValueKind::DynamicSharedPointer) {
if (lcArg.mIsVolatile) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (lcArg.mIsRestrict) {
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
if (lcArg.mIsConst) {
rv |= CL_KERNEL_ARG_TYPE_CONST;
}
}
else if (lcArg.mIsPipe) {
assert(lcArg.mValueKind == ValueKind::Pipe);
rv |= CL_KERNEL_ARG_TYPE_PIPE;
}
return rv;
}
void LightningKernel::initArgList(const KernelMD& kernelMD) {
device::Kernel::parameters_t params;
size_t offset = 0;
for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) {
const KernelArgMD& lcArg = kernelMD.mArgs[i];
// Initialize HSAIL kernel argument
auto arg = new HSAILKernel::Argument;
arg->name_ = lcArg.mName;
arg->typeName_ = lcArg.mTypeName;
arg->size_ = lcArg.mSize;
arg->type_ = GetKernelArgType(lcArg);
arg->addrQual_ = GetKernelAddrQual(lcArg);
arg->dataType_ = GetKernelDataType(lcArg);
arg->alignment_ = GetKernelArgAlignment(lcArg);
arg->access_ = GetKernelArgAccessType(lcArg);
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION ||
arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE;
arg->index_ = isHidden ? uint(-1) : params.size();
arguments_.push_back(arg);
if (isHidden) {
continue;
}
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
desc.name_ = lcArg.mName.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
} else {
desc.size_ = arg->size_;
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the parameters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
createSignature(params);
}
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
for (const KernelMD& kernelMD : programMD->mKernels) {
if (kernelMD.mName == name) {
return &kernelMD;
}
}
return nullptr;
}
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
aqlCreateHWInfo(symbol);
const CodeObjectMD* programMD = prog().metadata();
assert(programMD != nullptr);
const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
if (kernelMD == nullptr) {
return false;
}
// Set the argList
initArgList(*kernelMD);
if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0];
workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1];
workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2];
}
if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) {
const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint;
workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0];
workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1];
workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2];
}
if (!kernelMD->mAttrs.mVecTypeHint.empty()) {
workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
}
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
if (workGroupInfo_.size_ == 0) {
return false;
}
initPrintf(programMD->mPrintf);
/*FIXME_lmoriche:
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
&workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
if (error != ACL_SUCCESS) {
return false;
}
waveLimiter_.enable();
*/
return true;
}
#endif // defined(WITH_LIGHTNING_COMPILER)
} // namespace pal