Files
rocm-systems/rocclr/runtime/device/gpu/gpukernel.cpp
T
foreman d1abc0baa2 P4 to Git Change 1808507 by axie@axie-pal-lc4 on 2019/06/11 14:11:09
SWDEV-187399 - ​Linux-Pro Pyro_Explosion.hip Image difference is observed specific to W9000 ASIC in OCL mode

	The issue disappeared when we invalidate L1 cache after each dispatch.

	This change also fix ocltst runtime OCLMultiQueue test in Tahiti Brahma driver.

	Tests:
	http://ocltc.amd.com:8111/viewModification.html?modId=121519&personal=true&tab=vcsModificationBuilds

	Test in the Houdini app. The symptom disappeared.
	This change also fix ocltst runtime OCLMultiQueue test in Tahiti Brahma driver.

	ReviewBoard: http://ocltc.amd.com/reviews/r/17509/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#337 edit
2019-06-12 05:55:12 -04:00

3506 rindas
112 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "shader/ComputeProgramObject.h"
#include "utils/options.hpp"
#include "acl.h"
#include "SCCommon.h"
#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <algorithm>
namespace gpu {
const MetaDataConst ArgState[ArgStateTotal] = {
// Note: the order is important
// Name Type Properties
// Kernel description (special properties)
{"memory:compilerwrite", KernelArg::PrivateFixed, {0, 0, 0, 0, 0, 0, 0}},
{"uniqueid:", KernelArg::NoType, {0, 0, 0, 0, 0, 0, 0}},
{"memory:private:", KernelArg::PrivateSize, {0, 0, 0, 0, 0, 0, 0}},
{"memory:local:", KernelArg::LocalSize, {0, 0, 0, 0, 0, 0, 0}},
{"memory:hwprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
{"memory:uavprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
{"memory:hwlocal:", KernelArg::HwLocalSize, {0, 0, 0, 0, 0, 0, 0}},
{"memory:64bitABI", KernelArg::ABI64Bit, {0, 0, 0, 0, 0, 0, 0}},
{"limitgroupsize", KernelArg::Wavefront, {0, 0, 0, 0, 0, 0, 0}},
{"function:", KernelArg::NoType, {1, 1, 0, 0, 0, 0, 0}},
{"intrinsic:", KernelArg::NoType, {1, 0, 0, 0, 0, 0, 0}},
{"error:", KernelArg::ErrorMessage, {0, 0, 0, 0, 0, 0, 0}},
{"warning:", KernelArg::WarningMessage, {0, 0, 0, 0, 0, 0, 0}},
{"printf_fmt:", KernelArg::PrintfFormatStr, {0, 0, 0, 0, 0, 0, 0}},
{"version:", KernelArg::MetadataVersion, {0, 0, 0, 0, 0, 0, 0}},
// Kernel basic types
{"pointer:", KernelArg::PointerGlobal, {1, 1, 1, 1, 1, 1, 0}},
{"value:", KernelArg::Value, {1, 1, 1, 1, 1, 0, 0}},
{"image:", KernelArg::Image, {1, 1, 1, 1, 1, 0, 0}},
{"sampler:", KernelArg::Sampler, {0, 1, 0, 0, 0, 0, 0}},
{"counter:", KernelArg::Counter, {1, 1, 0, 1, 1, 0, 0}},
{"cws:", KernelArg::Grouping, {0, 0, 0, 0, 0, 0, 0}},
{"lws:", KernelArg::WrkgrpSize, {0, 0, 0, 0, 0, 0, 0}},
{"uavid:", KernelArg::UavId, {0, 0, 0, 0, 0, 0, 0}},
{"reflection:", KernelArg::Reflection, {0, 0, 0, 0, 0, 0, 0}},
{"constarg:", KernelArg::ConstArg, {0, 0, 0, 0, 0, 0, 0}},
{"cbid:", KernelArg::ConstBufId, {0, 0, 0, 0, 0, 0, 0}},
{"printfid:", KernelArg::PrintfBufId, {0, 0, 0, 0, 0, 0, 0}},
{"wsh:", KernelArg::GroupingHint, {0, 0, 0, 0, 0, 0, 0}},
{"vth:", KernelArg::VecTypeHint, {0, 0, 0, 0, 0, 0, 0}},
{"WavesPerSimdHint:", KernelArg::WavesPerSimdHint, {0, 0, 0, 0, 0, 0, 0}},
};
const DataTypeConst DataType[] = {
{
"i8:", KernelArg::Char,
},
{
"i16:", KernelArg::Short,
},
{
"i32:", KernelArg::Int,
},
{
"i64:", KernelArg::Long,
},
{
"u8:", KernelArg::UChar,
},
{
"u16:", KernelArg::UShort,
},
{
"u32:", KernelArg::UInt,
},
{
"u64:", KernelArg::ULong,
},
{
"float:", KernelArg::Float,
},
{
"double:", KernelArg::Double,
},
{
"struct:", KernelArg::Struct,
},
{
"union:", KernelArg::Union,
},
{
"1D:", KernelArg::Image1D,
},
{
"2D:", KernelArg::Image2D,
},
{
"3D:", KernelArg::Image3D,
},
{
"1DB:", KernelArg::Image1DB,
},
{
"1DA:", KernelArg::Image1DA,
},
{
"2DA:", KernelArg::Image2DA,
},
{
"opaque:", KernelArg::Opaque,
},
{
"event:", KernelArg::Event,
},
{
"sampler:", KernelArg::Sampler,
},
{
"half:", KernelArg::Half,
},
};
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
struct BufDataConst {
const char* tagName_; //!< buffer's name
KernelArg::ArgumentType type_; //!< type of argument
struct {
uint number_ : 1; //!< buffer's number
uint alignment_ : 1; //!< buffer's alignment
uint attribute_ : 1; //!< buffer's read/write attribute
uint reserved : 29; //!< reserved
};
};
static const BufDataConst BufType[] = {{"g", KernelArg::PointerGlobal, {1, 0, 0, 0}},
{"p", KernelArg::PointerPrivate, {1, 1, 1, 0}},
{"l", KernelArg::PointerLocal, {1, 1, 1, 0}},
{"uav", KernelArg::PointerGlobal, {1, 1, 1, 0}},
{"c", KernelArg::PointerConst, {1, 1, 1, 0}},
{"hl", KernelArg::PointerHwLocal, {1, 1, 1, 0}},
{"hp", KernelArg::PointerHwPrivate, {1, 1, 1, 0}},
{"hc", KernelArg::PointerHwConst, {1, 1, 1, 0}}};
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
//! The mathlib constants for each kernel execution
static const float MathLibConst[4] = {0.0f, 0.5f, 1.0f, 2.0f};
bool expect(const std::string& str, size_t* pos, const std::string& sym) {
bool result = true;
uint i;
if (*pos == std::string::npos) {
return false;
}
// Check if we have expected symbols
for (i = 0; i < sym.size(); ++i) {
char deb = str[*pos + i];
if (deb != sym[i]) {
result = false;
break;
}
}
if (result) *pos += i;
return result;
}
bool getword(const std::string& str, size_t* pos, std::string& sym) {
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;", *pos);
size_t count = posEnd - *pos;
if (count != 0) {
sym = str.substr(*pos, count);
}
sym[count] = 0;
*pos = posEnd + 1;
return true;
}
bool getstring(const std::string& str, size_t* pos, std::string* out) {
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(":\n\r;", *pos);
size_t count = posEnd - *pos;
char* sym = new char[count + 1];
if (count != 0) {
if (!str.copy(sym, count, *pos)) {
return false;
}
}
sym[count] = 0;
*out = sym;
delete[] sym;
*pos = posEnd + 1;
return true;
}
bool getuint(const std::string& str, size_t* pos, uint* val) {
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
*val = (*val * 10) + (sym[i] - 0x30);
}
*pos = posEnd + 1;
return true;
}
bool getuintHex(const std::string& str, size_t* pos, uint* val) {
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
} else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) {
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
} else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
void intToStr(size_t value, char* str, size_t size) {
static const uint MaxDigits32bit = 10;
char result[MaxDigits32bit];
uint idx = MaxDigits32bit;
do {
idx--;
result[idx] = static_cast<char>((value % 10) + '0');
value /= 10;
} while ((value != 0) && (idx > 0));
size_t len = MaxDigits32bit - idx;
size_t n = std::min<size_t>(len, size - 1);
memcpy(str, &result[idx], n);
str[n] = '\0';
}
//! Default destructor
CalImageReference::~CalImageReference() {
// Free CAL image
free(image_);
}
KernelArg::KernelArg()
: type_(KernelArg::NoType),
size_(0),
cbIdx_(0),
cbPos_(0),
index_(0),
alignment_(1),
dataType_(KernelArg::NoType) {
name_ = "";
buf_ = "";
memory_.value_ = 0;
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
}
KernelArg::KernelArg(const KernelArg& data) {
// Fill the new object
*this = data;
}
KernelArg& KernelArg::operator=(const KernelArg& data) {
// Fill the fields of the current object
name_ = data.name_;
typeName_ = data.typeName_;
typeQualifier_ = data.typeQualifier_;
type_ = data.type_;
size_ = data.size_;
cbIdx_ = data.cbIdx_;
cbPos_ = data.cbPos_;
buf_ = data.buf_;
index_ = data.index_;
alignment_ = data.alignment_;
dataType_ = data.dataType_;
memory_.value_ = data.memory_.value_;
return *this;
}
bool KernelArg::isCbNeeded() const {
//! \note not a safe way
bool result = ((type_ > NoType) && (type_ < Sampler)) ? true : false;
if ((type_ == Sampler) && (location_ == 0)) {
// Sampler is defined outside the kernel
result = true;
}
return result;
}
size_t KernelArg::size(bool gpuLayer) const {
switch (type_) {
case NoType:
return 0;
case PointerConst:
case PointerHwConst:
case PointerGlobal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
case Sampler:
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
case Counter:
return (gpuLayer) ? 0 : sizeof(cl_mem);
case PointerLocal:
case PointerHwLocal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
case PointerPrivate:
case PointerHwPrivate:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
case Float:
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
case Double:
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
case Char:
case UChar:
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
case Short:
case UShort:
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
case Int:
case UInt:
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
case Long:
case ULong:
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
case Struct:
case Union:
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
default:
return 0;
}
}
cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const {
switch (type_) {
case PointerGlobal:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PointerLocal:
case PointerHwLocal:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
case PointerConst:
case PointerHwConst:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
}
cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const {
switch (type_) {
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
if (memory_.readOnly_) {
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
} else if (memory_.writeOnly_) {
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
} else if (memory_.readWrite_) {
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
// Fall through ...
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
//! temporary solution for the vectors handling in compiler
size_t KernelArg::specialVector() const {
if (size_ > VectorSizeLimit) {
switch (type_) {
case Char:
case UChar:
return sizeof(cl_char);
case Short:
case UShort:
return sizeof(cl_short);
default:
return 0;
}
}
return 0;
}
clk_value_type_t KernelArg::type() const {
switch (type_) {
case PointerGlobal:
case PointerLocal:
case PointerHwLocal:
case PointerConst:
case PointerHwConst:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
case Counter:
return T_POINTER;
case Float:
return T_FLOAT;
case Double:
return T_DOUBLE;
case Char:
case UChar:
return T_CHAR;
case Short:
case UShort:
return T_SHORT;
case Int:
return T_INT;
case UInt:
//! \note No UINT type
return T_INT;
case Long:
return T_LONG;
case ULong:
//! \note No ULONG type
return T_LONG;
case Struct:
case Union:
//! @todo What should we report?
return T_CHAR;
case Sampler:
return T_SAMPLER;
case PointerPrivate:
case PointerHwPrivate:
case NoType:
default:
return T_VOID;
}
}
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
const NullProgram& nullprog)
: device::Kernel(gpuNullDev, name, nullprog),
buildError_(CL_BUILD_PROGRAM_FAILURE),
gpuDev_(gpuNullDev),
calRef_(NULL),
internal_(false),
flags_(0),
cbSizes_(NULL),
numCb_(0),
rwAttributes_(false),
instructionCnt_(4) {
// UAV raw index will be detected
uavRaw_ = UavIdUndefined;
// CB index will be detected
cbId_ = UavIdUndefined;
// Printf index will be detected
printfId_ = UavIdUndefined;
}
NullKernel::~NullKernel() {
uint idx;
if (calRef_ == NULL) {
return;
}
calRef_->release();
// Destroy all kernel arguments
for (idx = 0; idx < arguments_.size(); ++idx) {
delete arguments_[idx];
}
arguments_.clear();
// Destroy all sampler kernel arguments
for (idx = 0; idx < intSamplers_.size(); ++idx) {
delete intSamplers_[idx];
}
intSamplers_.clear();
}
static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) {
switch (dstComp) {
case SC_COMPONENT_X:
return 0;
case SC_COMPONENT_Y:
return 1;
case SC_COMPONENT_Z:
return 2;
case SC_COMPONENT_W:
return 3;
}
return 0;
}
static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) {
uint count = shader->dep.NumIntrlIConstants;
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
// only suppport loop consts (int consts)
if (count) {
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
uint usedConsts = 0;
for (uint i = 0; i < count; ++i) {
uint currentConst;
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
if (allocatedconsts[currentConst].addr ==
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
break;
}
}
if (currentConst == usedConsts) {
usedConsts++;
assert(usedConsts <= count);
}
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
allocatedconsts[currentConst].type = AMU_ABI_INT32;
allocatedconsts[currentConst].value.int32[scComponentToArrayIndex(
HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
}
encoding.litConstsCount = usedConsts;
}
}
bool NullKernel::create(const std::string& code, const std::string& metadata,
const void* binaryCode, size_t binarySize) {
std::auto_ptr<uint> uavRefCount(new uint[MaxUavArguments]);
if (NULL == uavRefCount.get()) {
return false;
}
// Set all ref counts to 0
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
// parse the metadata fields
if (!parseArguments(metadata, uavRefCount.get())) {
return false;
}
CALimage calImage;
// Save source if DEBUG build
#if DEBUG
ilSource_ = code;
#endif // DEBUG
amd::option::Options* options = nullProg().getCompilerOptions();
internal_ = options->oVariables->clInternalKernel;
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
acl_error err;
std::string arch = "amdil";
if (nullDev().settings().use64BitPtr_) {
arch += "64";
}
aclTargetInfo info = aclGetTargetInfo(arch.c_str(), nullDev().hwInfo()->targetName_, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclGetTargetInfo failed");
return false;
}
aclBinaryOptions binOpts = {0};
binOpts.struct_size = sizeof(binOpts);
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
binOpts.bitness = ELFDATA2LSB;
binOpts.alloc = &::malloc;
binOpts.dealloc = &::free;
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclBinaryInit failed");
return false;
}
if (ACL_SUCCESS !=
aclInsertSection(nullDev().amdilCompiler(), bin, code.data(), code.size(), aclSOURCE)) {
LogWarning("aclInsertSection failed");
aclBinaryFini(bin);
return false;
}
amd::option::Options* Opts = (amd::option::Options*)bin->options;
// Append an option so that we can selectively enable a SCOption on CZ
// whenever IOMMUv2 is enabled.
if (nullDev().settings().svmFineGrainSystem_) {
options->origOptionStr.append(" -sc-xnack-iommu");
}
// temporary solution to synchronize buildNo between runtime and complib
// until we move runtime inside complib
Opts->setBuildNo(options->getBuildNo());
// pass kernel name to compiler
Opts->setCurrKernelName(name().c_str());
err = aclCompile(nullDev().amdilCompiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT,
ACL_TYPE_ISA, NULL);
buildLog_ += aclGetCompilerLog(nullDev().amdilCompiler());
if (err != ACL_SUCCESS) {
LogWarning("aclCompile failed");
aclBinaryFini(bin);
return false;
}
if (!options->oVariables->BinEXE) {
// Early exit if binary doesn't contain EXE
aclBinaryFini(bin);
return true;
}
size_t len;
const void* isa = aclExtractSection(nullDev().amdilCompiler(), bin, &len, aclTEXT, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclExtractSection failed");
aclBinaryFini(bin);
return false;
}
uint calImageSize;
if (!createMultiBinary(&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
LogWarning("initSrcEncoding failed");
aclBinaryFini(bin);
return false;
}
aclBinaryFini(bin);
} else if ((binaryCode != NULL) && (binarySize != 0)) {
uint size = 0;
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode)) || size > binarySize) {
buildLog_ += "Invalid binary image";
LogError("amuABIMultiBinaryGetSize failed!");
return false;
}
calImage = static_cast<CALimage>(malloc(size));
::memcpy(calImage, binaryCode, size);
} else {
LogError("Incorrect initialization parameters!");
return false;
}
calRef_ = new CalImageReference(calImage);
if (calRef_ == NULL) {
LogError("Memory allocation failure!");
// Free CAL image
free(calImage);
return false;
}
CALfuncInfo calFuncInfo;
// Get kernel compiled information
getFuncInfoFromImage(calImage, &calFuncInfo);
if (calFuncInfo.maxScratchRegsNeeded > 0) {
LogPrintfInfo(
"%s kernel has register spilling."
"Lower performance is expected.",
name().c_str());
}
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
return false;
}
return true;
}
size_t NullKernel::getCalBinarySize() const {
CALuint imageSize;
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
LogError("Failed to get the image size!");
return 0;
}
return static_cast<size_t>(imageSize);
}
bool NullKernel::getCalBinary(void* binary, size_t size) const {
uint calImageSize = 0;
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) || size < calImageSize) {
LogError("CAL failed to save the kernel binary!");
return false;
}
::memcpy(binary, calImage(), calImageSize);
return true;
}
bool Kernel::create(const std::string& code, const std::string& metadata, const void* binaryCode,
size_t binarySize) {
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
return false;
}
// initialize constant buffer sizes
if (!initConstBuffers()) {
return false;
}
// Initialize the kernel parameters
bool result = initParameters();
// Wave limiter needs to be initialized after kernel metadata is parsed
// Since it depends on it.
waveLimiter_.enable(dev().settings().ciPlus_);
if (result) {
buildError_ = CL_SUCCESS;
} else {
result = false;
}
return result;
}
Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
const InitData* initData)
: NullKernel(name, gpuDev, prog) {
hwPrivateSize_ = 0;
if (NULL != initData) {
flags_ = initData->flags_;
hwPrivateSize_ = initData->hwPrivateSize_;
hwLocalSize_ = initData->hwLocalSize_;
}
// Workgroup info private memory size
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
// Default wavesPerSimdHint_
workGroupInfo_.wavesPerSimdHint_ = ~0U;
}
Kernel::~Kernel() {
if (calRef_ == NULL) {
return;
}
{
Device::ScopedLockVgpus lock(dev());
// Release all virtual image objects on all virtual GPUs
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->releaseKernel(calImage());
}
}
if (0 != numCb_) {
delete[] cbSizes_;
}
}
const Device& Kernel::dev() const { return reinterpret_cast<const Device&>(gpuDev_); }
const Program& Kernel::prog() const { return reinterpret_cast<const Program&>(prog_); }
bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) {
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
bool result = false;
AMUabiAddEncoding encoding;
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments +
sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers +
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
char* tmpMem = new char[allocSize];
if (tmpMem == NULL) {
LogError("Error allocating memory");
return false;
}
CalcPtr(encoding.inputs, tmpMem, 0, 0);
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry),
MaxSamplers);
if (shader->dep.NumIntrlIConstants != 0) {
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask),
MaxConstBuffers);
}
AMUabiMultiBinary amuBinary;
amuABIMultiBinaryCreate(&amuBinary);
result = siCreateHwInfo(shader, encoding);
if (!result) {
delete[] tmpMem;
LogWarning("Error Creating program info");
return false;
}
addLoopConst(shader, encoding);
unsigned int outputCount = 0, condOut = 0, earlyExit = 0, globalCount = 0, persistentCount = 0;
unsigned int symbolCount = 0;
CALOutputEntry* outputs = 0;
unsigned int* globalBuffers = 0;
unsigned int* persistentBuffers = 0;
AMUabiUserSymbol* symbols = 0;
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
CALConstantBufferMask* constBuffers = encoding.constBuffers;
uint* inputResources = encoding.inputs;
CALUavEntry* uav = encoding.uav;
uint inputSamplerCount = samplerSize();
for (uint i = 0; i < inputSamplerCount; ++i) {
inputSamplers[i].resource = 0;
inputSamplers[i].sampler = sampler(i)->index_;
}
uint constBufferCount = 2;
constBuffers[0].index = 0;
constBuffers[1].index = 1;
uint inputResourceCount = 0;
uint uavCount = 0;
bool cbBound = false;
bool printfBound = false;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
constBuffers[constBufferCount++].index = arg->index_;
break;
case KernelArg::PointerGlobal:
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
break;
case KernelArg::ConstBufId:
if (!cbBound) {
uav[uavCount].offset = cbId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
cbBound = true;
break;
case KernelArg::PrintfBufId:
if (!printfBound) {
uav[uavCount].offset = printfId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
printfBound = true;
break;
case KernelArg::UavId:
if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
} else {
if (UavIdUndefined != uavRaw_) {
uav[uavCount].offset = uavRaw_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
}
break;
case KernelArg::Sampler:
inputSamplers[inputSamplerCount].resource = 0;
inputSamplers[inputSamplerCount].sampler = arg->index_;
inputSamplerCount++;
break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA:
if (arg->memory_.readOnly_) {
inputResources[inputResourceCount++] = arg->index_;
} else {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
uav[uavCount].dimension = AMU_ABI_DIM_2D;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
break;
default:
break;
}
}
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
}
encoding.machine = nullDev().hwInfo()->machine_;
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
encoding.inputCount = inputResourceCount;
encoding.outputCount = outputCount;
encoding.outputs = outputs;
encoding.condOut = condOut;
encoding.earlyExit = earlyExit;
encoding.globalBuffersCount = globalCount;
encoding.globalBuffers = globalBuffers;
encoding.persistentBuffersCount = persistentCount;
encoding.persistentBuffers = persistentBuffers;
encoding.constBuffersCount = constBufferCount;
encoding.inputSamplerMapCount = inputSamplerCount;
encoding.symbolsCount = symbolCount;
encoding.symbols = symbols;
encoding.uavCount = uavCount;
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
amuABIMultiBinaryDestroy(amuBinary);
delete[] tmpMem;
delete[] encoding.progInfos;
return (success == 0) ? false : true;
}
void Kernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const {
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
// Find threads per group
size_t thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if ((flags() & ImageEnable) &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) && ((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)) {
// Use 8x8 workgroup size if kernel has image writes
if ((flags() & ImageWrite) || (thrPerGrp != nullDev().info().preferredWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
} else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
} else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--)
;
lclWorkSize[d] = div;
tmp /= div;
}
// Assuming DWORD access
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
// Check if we couldn't find optimal workload
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
// or size is too small for the cache line
(lclWorkSize[0] < cacheLineMatch)) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Use X dimension as high priority. Runtime will assume that
// X dimension is more important for the address calculation
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
lclWorkSize[0] = cacheLineMatch;
thrPerGrp /= cacheLineMatch;
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 1; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
} else {
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
} else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
} else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
} else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
} else {
assert(0 && "Invalid workDim!");
}
}
}
} else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
void Kernel::setupProgramGrid(VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset,
const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize,
const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg,
const amd::NDRange& glbWorkSizeOrg) const {
// ABI is always in CB0
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* pGlobalSize =
reinterpret_cast<uint*>(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
uint* pLocalSize = reinterpret_cast<uint*>(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
uint* pNumGroups = reinterpret_cast<uint*>(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
uint* pGlobalOffset =
reinterpret_cast<uint*>(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
uint* pGroupOffset =
reinterpret_cast<uint*>(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
uint32_t* debugInfo = reinterpret_cast<uint*>(cbBuf + DebugOffset * ConstBuffer::VectorSize);
uint* pNDRangeGlobalOffset =
reinterpret_cast<uint*>(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
// Check for 64-bit metadata
uint glbABIShift = (abi64Bit()) ? 1 : 0;
VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_;
// Finds local workgroup size
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
// Initialize the execution grid block and size/offset
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
pGlobalSize[3] = static_cast<uint>(workDim);
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
pLocalSize[3] = 0;
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
pNumGroups[3] = 0;
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1;
progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1;
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 1;
bool partialGrid = false;
// Fill the right values, based on the application request
switch (workDim) {
case 3:
pLocalSize[2] = progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
pGlobalOffset[2] = glbWorkOffset[2];
pGroupOffset[2] = groupOffset[2];
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
if (progGrid->partialGridBlock.depth != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.depth++;
pNumGroups[2]++;
} else {
progGrid->partialGridBlock.depth = lclWorkSize[2];
}
// Fall through to fill 2D and 1D dimensions...
case 2:
pLocalSize[1] = progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
progGrid->gridSize.height /= progGrid->gridBlock.height;
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
pGlobalOffset[1] = glbWorkOffset[1];
pGroupOffset[1] = groupOffset[1];
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
if (progGrid->partialGridBlock.height != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.height++;
pNumGroups[1]++;
} else {
progGrid->partialGridBlock.height = lclWorkSize[1];
}
// Fall through to fill 1D dimension...
case 1:
pLocalSize[0] = progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
progGrid->gridSize.width /= progGrid->gridBlock.width;
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
pGlobalOffset[0] = glbWorkOffset[0];
pGroupOffset[0] = groupOffset[0];
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
if (progGrid->partialGridBlock.width != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.width++;
pNumGroups[0]++;
} else {
progGrid->partialGridBlock.width = lclWorkSize[0];
}
break;
default:
LogWarning("Wrong dimensions. Force to 1x1x1!");
break;
}
if (!partialGrid) {
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 0;
}
// Calculate the total number of workitems and workgroups
pGlobalOffset[3] = pGroupOffset[3] = 1;
for (uint i = 0; i < workDim; ++i) {
pGlobalOffset[3] *= pGlobalOffset[i];
pGroupOffset[3] *= pGroupOffset[i];
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
if (abi64Bit()) {
// Setup the debug info in constant buffer
reinterpret_cast<uint64_t*>(debugInfo)[1] = gpu.printfDbg().bufOffset();
// Size in DWORDs
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[4] /= sizeof(uint32_t);
} else {
// Setup the debug info in constant buffer
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
// Size in DWORDs
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[2] /= sizeof(uint32_t);
}
}
}
bool Kernel::initParameters() {
size_t offset = 0;
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
// Initialize the arguments for the abstraction layer
if (arg->isCbNeeded()) {
desc.name_ = arg->name_.data();
desc.type_ = arg->type();
desc.size_ = arg->size(false);
desc.addressQualifier_ = arg->addressQualifier();
desc.accessQualifier_ = arg->accessQualifier();
desc.typeName_ = arg->typeName();
desc.typeQualifier_ = arg->typeQualifier();
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
}
// Report the allocated local memory size (emulated and hw)
if (hwLocalSize_ != 0) {
CondLog((dev().info().localMemSize_ < hwLocalSize_),
"Requested local size is bigger than reported");
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
return false;
}
return true;
}
bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const {
bool result = true;
// Bind HW constant buffers used for the global data store
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (const auto& it : gds) {
uint idx = it.first;
result = bindResource(gpu, *(it.second), idx, ConstantBuffer, idx);
}
return result;
}
bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const {
bool result = true;
assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!");
// Upload the parameters to HW and bind all constant buffers
for (uint i = 0; i < numCb_; i++) {
ConstBuffer* cb = gpu.constBufs_[i];
result &= cb->uploadDataToHw(cbSizes_[i]) &&
bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset());
}
return result;
}
void Kernel::processMemObjects(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
bool nativeMem) const {
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
gpu.memoryDependency().newKernel();
// Check all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const KernelArg* arg = argument(i);
Memory* memory = NULL;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) &&
(arg->type_ != KernelArg::PointerHwLocal)) {
uint32_t index = desc.info_.arrayIndex_;
if (nativeMem) {
memory = reinterpret_cast<Memory* const*>(memories)[index];
} else if (*reinterpret_cast<amd::Memory* const*>(params + desc.offset_) != NULL) {
memory = dev().getGpuMemory(memories[index]);
// Synchronize data with other memory instances if necessary
memory->syncCacheFromHost(gpu);
}
if (memory != NULL) {
// Validate memory for a dependency in the queue
gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_);
}
}
}
}
bool Kernel::loadParameters(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
bool nativeMem) const {
bool result = true;
uint i;
// Initialize local private ranges
if (!initLocalPrivateRanges(gpu)) {
return false;
}
if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
// Bind a buffer for a dummy read
result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_);
}
// Find all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
for (i = 0; i != signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
// Set current argument
if (!setArgument(gpu, kernel, i, params, desc, nativeMem)) {
result = false;
break;
}
}
if (result) {
// Update the ring ranges and math constant
setLocalPrivateRanges(gpu);
result = bindConstantBuffers(gpu);
if (flags() & PrivateFixed) {
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
}
}
return result;
}
bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd,
bool pfpaDoppCmd) const {
const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal();
auto compProg = static_cast<gsl::ComputeProgramObject*>(gpu.gslKernelDesc()->func_);
compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu));
gpu.eventBegin(MainEngine);
gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock,
&dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_,
lastDoppCmd, pfpaDoppCmd);
gpu.flushCUCaches();
gpu.eventEnd(MainEngine, *calEvent);
// Unbind all resources
unbindResources(gpu, *calEvent, lastRun);
return true;
}
static size_t counter = 0;
void Kernel::debug(VirtualGPU& gpu) const {
std::fstream stubWrite;
address src = NULL;
std::cerr << "--- " << name_ << " ---" << std::endl;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
const Memory* gpuMem = gpu.slots_[i].memory_;
std::stringstream fileName;
bool bufferObj =
((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) ||
(arg->type_ == KernelArg::PointerHwConst));
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
std::cerr.setf(std::ios::hex);
uint* location =
reinterpret_cast<uint*>(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
}
if (((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D)) ||
((src == NULL) && bufferObj)) {
//@todo Replace the current map
Memory* resource = const_cast<Memory*>(gpu.slots_[i].memory_);
void* memory = resource->map(&gpu);
uint* location = reinterpret_cast<uint*>(memory);
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
resource->unmap(&gpu);
}
}
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory + gpu.cb(i)->wrtOffset()),
gpu.cb(i)->lastWrtSize());
gpu.constBufs_[i]->unmap(&gpu);
stubWrite.close();
}
}
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (const auto& it : gds) {
uint idx = it.first;
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>(it.second->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory), it.second->size());
it.second->unmap(&gpu);
stubWrite.close();
}
}
}
bool Kernel::initConstBuffers() {
bool result = true;
size_t i;
assert((numCb_ != 0) && "We have 0 constant buffers!");
// Allocate an array for CB sizes
cbSizes_ = new size_t[numCb_];
if (cbSizes_ == NULL) {
return false;
}
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
// CB0 is reserved for ABI data
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
// Find sizes of all constant buffers
for (i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
size_t size = arg->cbPos_ + arg->size(true);
size_t specVec = arg->specialVector();
if (specVec != 0) {
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize;
}
// Do we need a CB?
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
cbSizes_[arg->cbIdx_] = size;
}
}
return result;
}
bool Kernel::setInternalSamplers(VirtualGPU& gpu) const {
for (uint i = 0; i < samplerSize(); ++i) {
const KernelArg* arg = sampler(i);
uint state = arg->cbPos_;
uint idx = arg->index_;
if (gpu.cal()->samplersState_[idx] != state) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
}
return true;
}
bool Kernel::setArgument(VirtualGPU& gpu, const amd::Kernel& kernel,
uint idx, const_address params,
const amd::KernelParameterDescriptor& desc,
bool nativeMem) const {
size_t size = desc.size_;
const void* param = params + desc.offset_;
bool result = true;
const KernelArg* arg;
address memory;
size_t argSize;
static const bool waitOnBusyEngine = true;
assert((idx < arguments_.size()) && "Param index is out of range!");
arg = argument(idx);
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
memory = gpu.cb(1)->sysMemCopy();
argSize = arg->size(true);
// Bind the global heap for emulation mode
switch (arg->type_) {
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
// Fall through ...
default:
break;
}
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
case KernelArg::PointerGlobal: {
gpu::Memory* gpuMem = NULL;
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
uint32_t index = desc.info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory*>(memories[index]);
} else if (memories[index] != nullptr) {
gpuMem = dev().getGpuMemory(memories[index]);
}
bool forceZeroOffset = false;
if (gpuMem == NULL) {
forceZeroOffset = true;
gpuMem = dev().getGpuMemory(dev().dummyPage());
}
uint64_t offset = gpuMem->pinOffset();
// Make sure the passed argument is a buffer object
if (!gpuMem->cal()->buffer_) {
LogError("The kernel buffer argument isn't a buffer object!");
return false;
}
if (arg->type_ == KernelArg::PointerHwConst) {
// Bind current memory object with the kernel
if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) {
return false;
}
assert((offset == 0) && "No offset for HW CB");
// Add a fake offset to make sure (ptr != NULL) is TRUE
offset = 1;
} else {
ResourceType type = ArgumentHeapBuffer;
// Check if kernel expects UAV binding
if (arg->memory_.uavBuf_) {
type = ArgumentBuffer;
} else {
// Bind global buffer to UAV this buffer is bound to
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
// Bind current memory object with the kernel
// Note: it's a fake binding, if the buffer is part of
// the global heap
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) {
return false;
}
// Update offset only if we bind HeapBuffer or
// it's global address space in UAV setup on SI+
offset += gpuMem->hbOffset();
if (!forceZeroOffset) {
assert((offset != 0) && "Offset 0 with a real allocation!");
}
gpu.addVmMemory(gpuMem);
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
if (forceZeroOffset) {
offset = 0;
}
// Copy memory offset into the constant buffer
if (abi64Bit()) {
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
} else {
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = static_cast<uint>(offset);
}
} break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA: {
gpu::Memory* gpuMem = NULL;
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
uint32_t index = desc.info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory*>(memories[index]);
} else if (memories[index] != nullptr) {
gpuMem = dev().getGpuMemory(memories[index]);
}
if (gpuMem == NULL) {
return false;
}
// Make sure the passed argument is an image object
if (gpuMem->cal()->buffer_) {
LogError("The kernel image argument isn't an image object!");
return false;
}
ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite;
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) {
return false;
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Copy image constants into the constant buffer
if (gpuMem->owner() != NULL) {
copyImageConstants(gpuMem->owner()->asImage(),
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
}
// Handle DOPP texture resource
gslMemObject gslMem = gpuMem->gslResource();
if (gslMem->getAttribs().isDOPPDesktopTexture) {
gpu.addVmMemory(gpuMem);
}
} break;
case KernelArg::Sampler: {
uint32_t index = desc.info_.arrayIndex_;
const amd::Sampler* amdSampler = reinterpret_cast<amd::Sampler* const*>(params +
kernel.parameters().samplerObjOffset())[index];
uint idx = arg->index_;
uint32_t state = amdSampler->state();
if (state != gpu.cal()->samplersState_[idx]) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
// Copy sampler state into the constant buffer
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
} break;
case KernelArg::Counter: {
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
} else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) {
return false;
}
} break;
case KernelArg::PointerHwLocal: {
// Calculate current offset in the local ring
uint offset = gpu.cal_.localSize;
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
offset = amd::alignUp(offset, arg->alignment_);
size_t memSize = *static_cast<const uintptr_t*>(param);
// Allocate new memory from the local ring
gpu.cal_.localSize += static_cast<uint>(memSize) + extra;
// Copy current local argument's offset into the CB
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
CondLog((gpu.cal_.localSize > dev().info().localMemSize_),
"Requested local size is bigger than reported!");
} break;
case KernelArg::Float:
case KernelArg::Double:
case KernelArg::Char:
case KernelArg::UChar:
case KernelArg::Short:
case KernelArg::UShort:
case KernelArg::Int:
case KernelArg::UInt:
case KernelArg::Long:
case KernelArg::ULong:
if (size != argSize) {
LogWarning("Parameter's sizes are unmatched!");
}
// Fall through ...
case KernelArg::Struct:
case KernelArg::Union: {
size_t specVec = arg->specialVector();
if (specVec != 0) {
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
for (uint i = 0; i < iter; ++i) {
amd::Os::fastMemcpy(
(memory + arg->cbPos_ + i * ConstBuffer::VectorSize),
reinterpret_cast<const char*>(param) + i * KernelArg::VectorSizeLimit * specVec,
specVec * KernelArg::VectorSizeLimit);
}
} else {
// Copy data into the CB
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
}
} break;
default:
LogError("Unhandled argument's type!");
break;
}
return result;
}
bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const {
// Initialize HW local
gpu.cal_.localSize = hwLocalSize_;
// Bind the global buffer if emulated local or private memory
// was allocated by the kernel
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) && (uavRaw_ != UavIdUndefined)) {
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
// Bind the global buffer if emulated constant buffers are enabled
if (cbId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
return false;
}
}
// Bind the printf buffer
if (printfId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
return false;
}
}
// Initialize the iterations count
gpu.cal_.iterations_ = 1;
return true;
}
void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const {
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* data;
uint gridSize =
gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth;
uint blockSize =
gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * gpu.cal()->gridBlock.depth;
//! \todo validate if the compiler still generates PrivateFixed
if (flags() & PrivateFixed) {
// Update private ring
data = reinterpret_cast<uint*>(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
if (abi64Bit()) {
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
data[2] = 0;
data[3] = 0;
} else {
data[0] = static_cast<uint>(gpuMemory->hbOffset());
data[1] = 0;
data[2] = data[3] = 0;
}
gpu.addVmMemory(gpuMemory);
}
// Copy the math lib constants
amd::Os::fastMemcpy((cbBuf + MathLibOffset * ConstBuffer::VectorSize), MathLibConst,
sizeof(MathLibConst));
// Update the offset to the global data
if (prog().glbData() != NULL) {
gpu.addVmMemory(prog().glbData());
uint64_t glbDataOffset = prog().glbData()->hbOffset();
if (abi64Bit()) {
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
glbDataOffset;
} else {
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
static_cast<uint>(glbDataOffset);
}
}
// Split workload if it was requested
if ((gpu.cal_.iterations_ < 2) && gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
uint totalSize = gridSize * blockSize;
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
gpu.cal_.iterations_ =
std::max(gpu.cal_.iterations_, (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
}
}
// Initialize the number of iterations to the grid size
if (flags() & PrintfOutput) {
gpu.cal_.iterations_ = gridSize;
}
}
void Kernel::setSampler(VirtualGPU& gpu, uint32_t state, uint physUnit) const {
// All CAL sampler's parameters are in floats
float gslAddress = GSL_CLAMP_TO_BORDER;
float gslMinFilter = GSL_MIN_NEAREST;
float gslMagFilter = GSL_MAG_NEAREST;
state &= ~amd::Sampler::StateNormalizedCoordsMask;
// Program the sampler address mode
switch (state & amd::Sampler::StateAddressMask) {
case amd::Sampler::StateAddressRepeat:
gslAddress = GSL_REPEAT;
break;
case amd::Sampler::StateAddressClampToEdge:
gslAddress = GSL_CLAMP_TO_EDGE;
break;
case amd::Sampler::StateAddressMirroredRepeat:
gslAddress = GSL_MIRRORED_REPEAT;
break;
case amd::Sampler::StateAddressClamp:
case amd::Sampler::StateAddressNone:
default:
break;
}
state &= ~amd::Sampler::StateAddressMask;
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
// Program texture filter mode
if (state == amd::Sampler::StateFilterLinear) {
gslMinFilter = GSL_MIN_LINEAR;
gslMagFilter = GSL_MAG_LINEAR;
}
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
}
bool Kernel::bindResource(VirtualGPU& gpu, const Memory& memory, uint paramIdx, ResourceType type,
uint physUnit, size_t offset) const {
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
// Find the original resource name from the IL program
switch (type) {
case GlobalBuffer:
if (gpu.state_.boundGlobal_) {
return true;
}
gpu.state_.boundGlobal_ = true;
physUnit = uavRaw_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentCbID:
if (gpu.state_.boundCb_) {
return true;
}
gpu.state_.boundCb_ = true;
physUnit = cbId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentPrintfID:
if (gpu.state_.boundPrintf_) {
return true;
}
gpu.state_.boundPrintf_ = true;
physUnit = printfId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentHeapBuffer:
case ArgumentBuffer:
case ArgumentImageRead:
case ArgumentImageWrite:
case ArgumentConstBuffer:
case ArgumentCounter:
// Early exit if resource is bound already
if (gpu.slots_[paramIdx].state_.bound_) {
return true;
}
// Associate resource with the slot
gpu.slots_[paramIdx].memory_ = &memory;
// Mark resource as bound
gpu.slots_[paramIdx].state_.bound_ = true;
if (type == ArgumentCounter) {
GpuEvent calEvent;
// Bind memory with atomic counter
gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource());
// Copy the counter value into GDS
gpu.eventBegin(MainEngine);
gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false);
gpu.eventEnd(MainEngine, calEvent);
// Mark resource as busy
memory.setBusy(gpu, calEvent);
return true;
} else if (type == ArgumentHeapBuffer) {
// We return here, since we just have to bind the global heap
return true;
} else if (type == ArgumentConstBuffer) {
gpu.slots_[paramIdx].state_.constant_ = true;
}
break;
case ArgumentUavID:
case ConstantBuffer:
break;
default:
LogPrintfError("Unspecified argument type ()!", type);
return false;
}
gslMemObject gslMem = NULL;
// Use global address space on SI+ for UAV setup
if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) ||
(type == ArgumentPrintfID)) {
gslMem = dev().heap().resource().gslResource();
} else {
gslMem = memory.gslResource();
}
// Associate memory with the physical unit, the actual binding
bool result = true;
switch (type) {
case GlobalBuffer:
case ArgumentBuffer:
case ArgumentImageWrite:
case ArgumentUavID:
case ArgumentCbID:
case ArgumentPrintfID:
if (type == ArgumentImageWrite) {
uavType = GSL_UAV_TYPE_TYPED;
} else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
uavType = GSL_UAV_TYPE_TYPELESS;
}
if (gpu.cal_.uavs_[physUnit] != gslMem) {
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
gpu.setUAVChannelOrder(physUnit, gslMem);
gpu.cal_.uavs_[physUnit] = gslMem;
}
break;
case ConstantBuffer:
case ArgumentConstBuffer:
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize());
gpu.cal_.constBuffers_[physUnit] = gslMem;
}
break;
case ArgumentImageRead:
if (gpu.cal_.readImages_[physUnit] != gslMem) {
result = gpu.setInput(physUnit, gslMem);
gpu.cal_.readImages_[physUnit] = gslMem;
}
break;
default:
result = false;
assert(false);
break;
}
if (!result) {
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
return false;
}
return true;
}
void Kernel::unbindResources(VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const {
// Make sure unbind occurs on the last run, in case the execution had a split
if (lastRun) {
for (uint i = 0; i < arguments_.size(); ++i) {
if (gpu.slots_[i].state_.bound_) {
GpuEvent calEventTmp = calEvent;
if (KernelArg::Counter == argument(i)->type_) {
// Copy the counter value from GDS
gpu.eventBegin(MainEngine);
gpu.cs()->syncAtomicCounter(argument(i)->index_, true);
gpu.eventEnd(MainEngine, calEventTmp);
} else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) {
// Signal the abstraction layer that GPU memory is dirty
if (gpu.slots_[i].memory_->owner() != NULL) {
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
}
}
// Mark resource as busy
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
gpu.slots_[i].state_.value_ = 0;
}
}
// Unbind the global buffer
gpu.state_.boundGlobal_ = false;
// Unbind the constant buffer
gpu.state_.boundCb_ = false;
// Unbind the pritnf buffer
gpu.state_.boundPrintf_ = false;
}
// Mark CB busy
for (uint i = 0; i < numCb_; ++i) {
gpu.constBufs_[i]->setBusy(gpu, calEvent);
}
// Set the event object for the scratch buffer
if (workGroupInfo()->scratchRegs_ > 0) {
dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent);
}
}
void Kernel::copyImageConstants(const amd::Image* amdImage, ImageConstants* imageData) const {
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
imageData->dataType_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
imageData->channelOrder_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
}
union MetadataVersion {
struct {
uint64_t revision_ : 16; //!< LLVM metadata revision
uint64_t minorVersion_ : 16; //!< LLVM metadata minor verison
uint64_t majorVersion_ : 16; //!< LLVM metadata major version
};
uint64_t value_;
MetadataVersion(uint mj, uint mi, uint rev) : value_(0) {
revision_ = rev;
minorVersion_ = mi;
majorVersion_ = mj;
}
MetadataVersion() : value_(0) {}
};
//! Version of metadata with buffer attributes
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
//! Version of metadata with type qualifiers
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) {
// Initialize workgroup info
workGroupInfo_.size_ = nullDev().info().preferredWorkGroupSize_;
MetadataVersion mdVersion;
// Find first tag
size_t pos = metaData.find(";");
// Loop through all provided program arguments
while (pos != std::string::npos) {
KernelArg arg;
if (!expect(metaData, &pos, ";")) {
break;
}
arg.type_ = KernelArg::NoType;
// Loop through all available metadata types
for (uint i = 0; i < ArgStateTotal; ++i) {
uint tmpValue;
// Find the name tag
if (expect(metaData, &pos, ArgState[i].typeName_)) {
switch (ArgState[i].type_) {
case KernelArg::NoType:
// Process next ...
continue;
case KernelArg::Reflection: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
LogWarning("Couldn't get the argument type!");
return false;
}
}
continue;
case KernelArg::ConstArg: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
}
continue;
case KernelArg::Grouping:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size!");
return false;
}
workGroupInfo_.compileSize_[j] = temp;
}
// Process next ...
continue;
case KernelArg::WrkgrpSize: {
uint temp;
// Read the workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the workgroup size!");
return false;
}
workGroupInfo_.size_ = temp;
}
// Process next ...
continue;
case KernelArg::Wavefront:
// Process next ...
continue;
case KernelArg::UavId:
// Read index
if (!getuint(metaData, &pos, &arg.index_)) {
return false;
}
break;
case KernelArg::ConstBufId:
// Read index
if (!getuint(metaData, &pos, &cbId_)) {
return false;
}
continue;
case KernelArg::PrintfBufId:
// Read index
if (!getuint(metaData, &pos, &printfId_)) {
return false;
}
continue;
case KernelArg::MetadataVersion:
// Read metadata version
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.majorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.minorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.revision_ = tmpValue;
// Process next ...
continue;
case KernelArg::GroupingHint:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size hint
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size hint!");
return false;
}
workGroupInfo_.compileSizeHint_[j] = temp;
}
// Process next ...
continue;
case KernelArg::VecTypeHint: {
std::string temp;
// Read the compile vector type hint
if (!getstring(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile vector type hint!");
return false;
}
workGroupInfo_.compileVecTypeHint_ = temp;
}
// Process next ...
continue;
case KernelArg::WavesPerSimdHint: {
uint tmp;
if (!getuint(metaData, &pos, &tmp)) {
return false;
}
workGroupInfo_.wavesPerSimdHint_ = tmp;
}
continue;
default:
break;
}
std::string argName;
// Save the argument type
arg.type_ = ArgState[i].type_;
// Check if we should expect the name
if (ArgState[i].name_) {
// Read the parameter's name
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.name_ = argName;
}
if (arg.type_ == KernelArg::Sampler) {
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.location_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the resource data type
if (ArgState[i].resType_) {
uint k;
// Search for the data type
for (k = 0; k < DataTypeTotal; k++) {
if (expect(metaData, &pos, DataType[k].tagName_)) {
arg.dataType_ = DataType[k].type_;
if (arg.type_ == KernelArg::Image) {
flags_ |= ImageEnable;
if (expect(metaData, &pos, "RO:")) {
arg.memory_.readOnly_ = 1;
} else if (expect(metaData, &pos, "RW:")) {
arg.memory_.readWrite_ = 1;
flags_ |= ImageWrite;
} else if (expect(metaData, &pos, "WO:")) {
arg.memory_.writeOnly_ = 1;
flags_ |= ImageWrite;
}
} else if (arg.type_ == KernelArg::Value) {
arg.type_ = DataType[k].type_;
}
break;
}
}
if (k == DataTypeTotal) {
LogWarning("We couldn't find the argument's type.");
if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
//! @todo temporary condition
if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) {
assert(false);
continue;
}
}
// Check if we should expect the data size
if (ArgState[i].size_) {
uint tmpData;
// Read the data size
if (!getuint(metaData, &pos, &tmpData)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.type_ == KernelArg::Image) {
arg.type_ = arg.dataType_;
arg.index_ = tmpData;
} else {
arg.size_ = tmpData;
}
}
if (arg.type_ == KernelArg::Counter) {
// Read a counter index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a counter index!");
return false;
}
}
// Check if we should expect a resource index
if (ArgState[i].cbIdx_) {
// Read resource index
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
numCb_ = arg.cbIdx_;
}
}
// Check if we should expect the CB offset
if (ArgState[i].cbPos_) {
// Read position in the constant buffer
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the buffer type
if (ArgState[i].buf_) {
// Read the buffer type
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.buf_ = argName;
for (uint k = 0; k < BufTypeTotal; ++k) {
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
// Update the parameter type
arg.type_ = BufType[k].type_;
// Check if we should expect a buffer index
if (BufType[k].number_) {
// Read a buffer index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the required alignment
if (BufType[k].alignment_) {
// Read data alignment
if (!getuint(metaData, &pos, &arg.alignment_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the buffer's attribute
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) {
if (expect(metaData, &pos, "RO")) {
arg.memory_.readOnly_ = 1;
} else if (expect(metaData, &pos, "RW")) {
arg.memory_.readWrite_ = 1;
} else if (expect(metaData, &pos, "WO")) {
arg.memory_.writeOnly_ = 1;
}
}
// Check for the type qualifier
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) {
uint tmp;
pos += 1;
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get volatile type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get restrict type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
}
}
}
}
// Find multiple UAV references
switch (arg.type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
case KernelArg::UavId:
uavRefCount[arg.index_]++;
break;
default:
break;
}
// Check if this argument will be passed in constant buffer
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
if (arg.type_ == KernelArg::Sampler) {
// Serach for the passed by value sampler
for (uint i = 0; i < argSize(); ++i) {
KernelArg* value = arguments_[i];
if (0 == value->name_.compare(arg.name_)) {
value->type_ = arg.type_;
value->index_ = arg.index_;
value->location_ = 0;
break;
}
}
} else {
KernelArg* argument = new KernelArg(arg);
if (argument != NULL) {
addArgument(argument);
} else {
LogError("Couldn't allocate memory!");
return false;
}
}
}
// Check if we have a pre-defined sampler
else if (arg.type_ == KernelArg::Sampler) {
KernelArg* sampler = new KernelArg(arg);
if (sampler != NULL) {
addSampler(sampler);
} else {
LogError("Couldn't allocate memory!");
return false;
}
}
break;
}
}
// Next argument
pos = metaData.find(";", pos);
}
// Find arguments that will require a reallocation
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
// Check if can't use a dedicated UAV,
// so realloc memory in the heap
arg->memory_.realloc_ = false;
arg->memory_.uavBuf_ = true;
break;
case KernelArg::PointerHwConst:
arg->memory_.realloc_ = true;
break;
case KernelArg::UavId:
uavRaw_ = arg->index_;
break;
default:
break;
}
// If argument marked with the const qualifier, then overwrite
// Read-Write attributes, since compiler doesn't mark it properly
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
arg->memory_.readOnly_ = 1;
arg->memory_.readWrite_ = 0;
arg->memory_.writeOnly_ = 0;
}
}
if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) {
// Find if default UAV is already assigned to an argument
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (uavRaw_ == arg->index_) {
uavRaw_ = UavIdUndefined;
}
break;
default:
break;
}
}
}
// There is always 1 constant buffer, associated with the kernel
numCb_++;
assert((numCb_ <= MaxConstBuffersArguments) &&
"Runtime doesn't support more than max CBs for arguments!");
// Limit workgroup size if requested
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
size_t temp = 1;
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
for (uint j = 0; j < 3; ++j) {
if (workGroupInfo()->compileSize_[j] != 0) {
temp *= workGroupInfo_.compileSize_[j];
}
}
// Report a compilation error if requested compile size doesn't
// match the required workgroup size
if (workGroupInfo()->size_ < temp) {
char str[8];
intToStr(workGroupInfo_.size_, str, 8);
buildError_ = CL_OUT_OF_RESOURCES;
buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of ";
buildLog_ += str;
buildLog_ += " elements\n";
LogError(buildLog().c_str());
return false;
}
}
// Read/Write attributes are provided in metadata
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
rwAttributes_ = true;
}
return true;
}
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return HSAIL_ARGTYPE_POINTER;
case ARG_TYPE_QUEUE:
return HSAIL_ARGTYPE_QUEUE;
case ARG_TYPE_VALUE:
return HSAIL_ARGTYPE_VALUE;
case ARG_TYPE_IMAGE:
return HSAIL_ARGTYPE_IMAGE;
case ARG_TYPE_SAMPLER:
return HSAIL_ARGTYPE_SAMPLER;
case ARG_TYPE_ERROR:
default:
return HSAIL_ARGTYPE_ERROR;
}
}
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return argInfo->arg.pointer.align;
default:
return 1;
}
}
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.type) {
case ACCESS_TYPE_RO:
return HSAIL_ACCESS_TYPE_RO;
case ACCESS_TYPE_WO:
return HSAIL_ACCESS_TYPE_WO;
case ACCESS_TYPE_RW:
default:
return HSAIL_ACCESS_TYPE_RW;
}
}
return HSAIL_ACCESS_TYPE_NONE;
}
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT_EMU:
case PTR_MT_CONSTANT:
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return HSAIL_ADDRESS_LOCAL;
case PTR_MT_SCRATCH_EMU:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_ERROR:
default:
LogError("Unsupported address type");
return HSAIL_ADDRESS_ERROR;
}
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
return HSAIL_ADDRESS_GLOBAL;
} else if (argInfo->type == ARG_TYPE_QUEUE) {
return HSAIL_ADDRESS_GLOBAL;
}
return HSAIL_ADDRESS_ERROR;
}
/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
aclArgDataType dataType;
if (argInfo->type == ARG_TYPE_POINTER) {
dataType = argInfo->arg.pointer.data;
} else if (argInfo->type == ARG_TYPE_VALUE) {
dataType = argInfo->arg.value.data;
} else {
return HSAIL_DATATYPE_ERROR;
}
switch (dataType) {
case DATATYPE_i1:
return HSAIL_DATATYPE_B1;
case DATATYPE_i8:
return HSAIL_DATATYPE_S8;
case DATATYPE_i16:
return HSAIL_DATATYPE_S16;
case DATATYPE_i32:
return HSAIL_DATATYPE_S32;
case DATATYPE_i64:
return HSAIL_DATATYPE_S64;
case DATATYPE_u8:
return HSAIL_DATATYPE_U8;
case DATATYPE_u16:
return HSAIL_DATATYPE_U16;
case DATATYPE_u32:
return HSAIL_DATATYPE_U32;
case DATATYPE_u64:
return HSAIL_DATATYPE_U64;
case DATATYPE_f16:
return HSAIL_DATATYPE_F32;
case DATATYPE_f32:
return HSAIL_DATATYPE_F32;
case DATATYPE_f64:
return HSAIL_DATATYPE_F64;
case DATATYPE_struct:
return HSAIL_DATATYPE_STRUCT;
case DATATYPE_opaque:
return HSAIL_DATATYPE_OPAQUE;
case DATATYPE_ERROR:
default:
return HSAIL_DATATYPE_ERROR;
}
}
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_VALUE:
switch (GetHSAILDataType(argInfo)) {
case HSAIL_DATATYPE_B1:
return 1;
case HSAIL_DATATYPE_B8:
case HSAIL_DATATYPE_S8:
case HSAIL_DATATYPE_U8:
return 1;
case HSAIL_DATATYPE_B16:
case HSAIL_DATATYPE_U16:
case HSAIL_DATATYPE_S16:
case HSAIL_DATATYPE_F16:
return 2;
case HSAIL_DATATYPE_B32:
case HSAIL_DATATYPE_U32:
case HSAIL_DATATYPE_S32:
case HSAIL_DATATYPE_F32:
return 4;
case HSAIL_DATATYPE_B64:
case HSAIL_DATATYPE_U64:
case HSAIL_DATATYPE_S64:
case HSAIL_DATATYPE_F64:
return 8;
case HSAIL_DATATYPE_STRUCT:
return argInfo->arg.value.numElements;
default:
return -1;
}
case ARG_TYPE_POINTER:
case ARG_TYPE_IMAGE:
case ARG_TYPE_SAMPLER:
case ARG_TYPE_QUEUE:
return sizeof(void*);
default:
return -1;
}
}
inline static clk_value_type_t GetOclType(const aclArgData* argInfo) {
static const clk_value_type_t ClkValueMapType[6][6] = {
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
};
uint sizeType;
if (argInfo->type == ARG_TYPE_QUEUE) {
return T_QUEUE;
}
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
return T_POINTER;
} else if (argInfo->type == ARG_TYPE_VALUE) {
switch (argInfo->arg.value.data) {
case DATATYPE_i8:
case DATATYPE_u8:
sizeType = 0;
break;
case DATATYPE_i16:
case DATATYPE_u16:
sizeType = 1;
break;
case DATATYPE_i32:
case DATATYPE_u32:
sizeType = 2;
break;
case DATATYPE_i64:
case DATATYPE_u64:
sizeType = 3;
break;
case DATATYPE_f16:
case DATATYPE_f32:
sizeType = 4;
break;
case DATATYPE_f64:
sizeType = 5;
break;
default:
return T_VOID;
}
switch (argInfo->arg.value.numElements) {
case 1:
return ClkValueMapType[sizeType][0];
case 2:
return ClkValueMapType[sizeType][1];
case 3:
return ClkValueMapType[sizeType][2];
case 4:
return ClkValueMapType[sizeType][3];
case 8:
return ClkValueMapType[sizeType][4];
case 16:
return ClkValueMapType[sizeType][5];
default:
return T_VOID;
}
} else if (argInfo->type == ARG_TYPE_SAMPLER) {
return T_SAMPLER;
} else {
return T_VOID;
}
}
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
} else if (argInfo->type == ARG_TYPE_IMAGE) {
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
}
// default for all other cases
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) {
if (argInfo->type == ARG_TYPE_IMAGE) {
switch (argInfo->arg.image.type) {
case ACCESS_TYPE_RO:
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
case ACCESS_TYPE_WO:
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
case ACCESS_TYPE_RW:
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
return CL_KERNEL_ARG_ACCESS_NONE;
}
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
if (argInfo->type == ARG_TYPE_POINTER) {
if (argInfo->arg.pointer.isVolatile) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (argInfo->arg.pointer.isRestrict) {
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
if (argInfo->arg.pointer.isPipe) {
rv |= CL_KERNEL_ARG_TYPE_PIPE;
}
if (argInfo->isConst) {
rv |= CL_KERNEL_ARG_TYPE_CONST;
}
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
rv |= CL_KERNEL_ARG_TYPE_CONST;
break;
default:
break;
}
}
return rv;
}
static int GetOclSize(const aclArgData* argInfo) {
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return sizeof(void*);
case ARG_TYPE_VALUE:
//! \note OCL 6.1.5. For 3-component vector data types,
//! the size of the data type is 4 * sizeof(component).
switch (argInfo->arg.value.data) {
case DATATYPE_struct:
return 1 * argInfo->arg.value.numElements;
case DATATYPE_i8:
case DATATYPE_u8:
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u16:
case DATATYPE_i16:
case DATATYPE_f16:
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u32:
case DATATYPE_i32:
case DATATYPE_f32:
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_i64:
case DATATYPE_u64:
case DATATYPE_f64:
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_ERROR:
default:
return -1;
}
case ARG_TYPE_IMAGE:
return sizeof(cl_mem);
case ARG_TYPE_SAMPLER:
return sizeof(cl_sampler);
case ARG_TYPE_QUEUE:
return sizeof(cl_command_queue);
default:
return -1;
}
}
void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Initialize the hsail argument list too
initHsailArgs(aclArg);
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
// Reserved arguments for HSAIL launch
aclArg += MaxExtraArgumentsNum;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(aclArg);
desc.addressQualifier_ = GetOclAddrQual(aclArg);
desc.accessQualifier_ = GetOclAccessQual(aclArg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = sizeof(cl_mem);
} else {
desc.size_ = GetOclSize(aclArg);
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
flags_.imageEna_ = true;
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
flags_.imageWriteEna_ = true;
}
}
}
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
int offset = 0;
// Reserved arguments for HSAIL launch
aclArg += MaxExtraArgumentsNum;
// Iterate through the each kernel argument
for (; aclArg->struct_size != 0; aclArg++) {
Argument* arg = new Argument;
// Initialize HSAIL kernel argument
arg->name_ = aclArg->argStr;
arg->typeName_ = aclArg->typeStr;
arg->size_ = GetHSAILArgSize(aclArg);
arg->offset_ = offset;
arg->type_ = GetHSAILArgType(aclArg);
arg->addrQual_ = GetHSAILAddrQual(aclArg);
arg->dataType_ = GetHSAILDataType(aclArg);
// If vector of args we add additional arguments to flatten it out
arg->numElem_ =
((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct))
? aclArg->arg.value.numElements
: 1;
arg->alignment_ = GetHSAILArgAlignment(aclArg);
arg->access_ = GetHSAILArgAccessType(aclArg);
offset += GetHSAILArgSize(aclArg);
arguments_.push_back(arg);
}
}
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
uint extraArgsNum)
: device::Kernel(prog->dev(), name, *prog),
compileOptions_(compileOptions),
index_(0),
code_(NULL),
codeSize_(0),
hwMetaData_(NULL),
extraArgumentsNum_(extraArgsNum) {
flags_.hsa_ = true;
}
HSAILKernel::~HSAILKernel() {
while (!arguments_.empty()) {
Argument* arg = arguments_.back();
delete arg;
arguments_.pop_back();
}
delete[] hwMetaData_;
delete code_;
}
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
return false;
}
acl_error error = ACL_SUCCESS;
std::string openClKernelName = openclMangledName(name());
flags_.internalKernel_ =
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
// compile kernel down to ISA
if (finalize) {
std::string options(compileOptions_.c_str());
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
// Append an option so that we can selectively enable a SCOption on CZ
// whenever IOMMUv2 is enabled.
if (dev().settings().svmFineGrainSystem_) {
options.append(" -sc-xnack-iommu");
}
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
ACL_TYPE_ISA, NULL);
buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
if (error != ACL_SUCCESS) {
LogError("Failed to finalize kernel");
return false;
}
}
aqlCreateHWInfo(sym);
// Pull out metadata from the ELF
size_t sizeOfArgList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), NULL, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
char* aclArgList = new char[sizeOfArgList];
if (NULL == aclArgList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
size_t sizeOfWorkGroupSize;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : dev().getAttribs().wavefrontSize;
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
} else {
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), NULL, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Make sure kernel has any printf info
if (0 != sizeOfPrintfList) {
char* aclPrintfList = new char[sizeOfPrintfList];
if (NULL == aclPrintfList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the PrintfList
InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
delete[] aclPrintfList;
}
aclMetadata md;
md.enqueue_kernel = false;
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
if (error != ACL_SUCCESS) {
return false;
}
flags_.dynamicParallelism_ = md.enqueue_kernel;
md.kernel_index = -1;
size_t sizeOfIndex = sizeof(md.kernel_index);
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX,
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
if (error != ACL_SUCCESS) {
return false;
}
index_ = md.kernel_index;
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
&sizeOfWavesPerSimdHint);
if (error != ACL_SUCCESS) {
return false;
}
waveLimiter_.enable(dev().settings().ciPlus_);
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
&sizeOfWorkGroupSizeHint);
if (error != ACL_SUCCESS) {
return false;
}
size_t sizeOfVecTypeHint;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
if (error != ACL_SUCCESS) {
return false;
}
if (0 != sizeOfVecTypeHint) {
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
if (NULL == VecTypeHint) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
if (error != ACL_SUCCESS) {
return false;
}
VecTypeHint[sizeOfVecTypeHint] = '\0';
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
delete[] VecTypeHint;
}
// Set the argList
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
delete[] aclArgList;
return true;
}
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
const HSAILProgram& HSAILKernel::prog() const {
return reinterpret_cast<const HSAILProgram&>(prog_);
}
inline static void WriteAqlArg(
unsigned char** dst, //!< The write pointer to the buffer
const void* src, //!< The source pointer
uint size, //!< The size in bytes to copy
uint alignment = 0 //!< The alignment to follow while writing to the buffer
) {
if (alignment == 0) {
*dst = amd::alignUp(*dst, size);
} else {
*dst = amd::alignUp(*dst, alignment);
}
memcpy(*dst, src, size);
*dst += size;
}
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap,
std::vector<const Memory*>& memList) const {
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
if (extraArgumentsNum_ > 0) {
assert(MaxExtraArgumentsNum >= 6 &&
"MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
size_t extraArgs[MaxExtraArgumentsNum] = {0, 0, 0, 0, 0, 0};
// The HLC generates up to 3 additional arguments for the global offsets
for (uint i = 0; i < sizes.dimensions(); ++i) {
extraArgs[i] = sizes.offset()[i];
}
// Check if the kernel may have printf output
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
// and set the fourth argument as the printf_buffer pointer
extraArgs[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
}
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_BUSY;
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(sizeof(AmdAqlWrap));
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
// and set 5th & 6th arguments
extraArgs[4] = vmDefQueue;
extraArgs[5] = *vmParentWrap;
memList.push_back(cb);
}
WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t) * extraArgumentsNum_, sizeof(size_t));
}
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
// Find all parameters for the current kernel
for (uint i = 0; i != signature.numParameters(); ++i) {
const HSAILKernel::Argument* arg = argument(i);
const amd::KernelParameterDescriptor& desc = signature.at(i);
const_address paramaddr = parameters + desc.offset_;
switch (arg->type_) {
case HSAIL_ARGTYPE_POINTER:
// If it is a global pointer
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
Memory* gpuMem = NULL;
amd::Memory* mem = NULL;
uint32_t index = signature.at(i).info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
if (nullptr != gpuMem) {
mem = gpuMem->owner();
}
} else {
mem = memories[index];
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
}
}
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
if (gpuMem == nullptr) {
break;
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, WaitOnBusyEngine);
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
// save the memory object pointer to allow global memory access
if (NULL != dev().hwDebugMgr()) {
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
}
}
// If it is a local pointer
else {
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type");
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, desc.size_);
if (desc.size_ == 8) {
ldsAddress += *reinterpret_cast<const uint64_t*>(paramaddr);
} else {
ldsAddress += *reinterpret_cast<const uint32_t*>(paramaddr);
}
}
break;
case HSAIL_ARGTYPE_VALUE:
// Special case for structrues
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
// Copy the current structre into CB1
memcpy(aqlStruct, paramaddr, arg->size_);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(arg->size_);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
memList.push_back(cb);
} else {
WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_);
}
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = signature.at(i).info_.arrayIndex_;
if (nativeMem) {
image = reinterpret_cast<Image* const*>(memories)[index];
if (nullptr != image) {
mem = image->owner();
}
} else {
mem = memories[index];
if (mem == NULL) {
LogError("The kernel image argument isn't an image object!");
return nullptr;
}
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
} else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(image);
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
uint32_t index = signature.at(i).info_.arrayIndex_;
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
kernelParams.samplerObjOffset())[index];
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
uint32_t index = signature.at(i).info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
parameters + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!gpu.createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return nullptr;
}
vmQueue = gpu.vQueue()->vmAddress();
}
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
default:
LogError(" Unsupported address type ");
return NULL;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return NULL;
}
// HSAIL kernarg segment size is rounded up to multiple of 16.
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlArgBuf);
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
hsaDisp->workgroup_size_x = local[0];
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
hsaDisp->grid_size_x = global[0];
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
hsaDisp->reserved2 = 0;
// Initialize kernel ISA and execution buffer requirements
hsaDisp->private_segment_size = spillSegSize();
hsaDisp->group_segment_size = ldsAddress;
hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
hsaDisp->reserved2 = 0;
hsaDisp->completion_signal.handle = 0;
memList.push_back(cb);
memList.push_back(gpuAqlCode());
for (gpu::Memory* mem : prog().globalStores()) {
memList.push_back(mem);
}
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
memList.push_back(gpu.hsaQueueMem());
}
if (srdResource || prog().isStaticSampler()) {
dev().srds().fillResourceList(memList);
}
return hsaDisp;
}
} // namespace gpu