Files
rocm-systems/rocclr/runtime/device/gpu/gpukernel.cpp
T
foreman e87e2d4c11 P4 to Git Change 1057460 by gandryey@gera-dev-w7 on 2014/07/21 14:56:59
ECR #304775 - Device enqueuing
	- Report proper size for the device queue.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#259 edit
2014-07-21 15:06:08 -04:00

4054 строки
137 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"
#include "acl.h"
#include "SCShadersR678XXCommon.h"
#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
namespace gpu {
const MetaDataConst ArgState[ArgStateTotal] =
{
// Note: the order is important
// Name Type Properties
// Kernel description (special properties)
{ "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } },
{ "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } },
{ "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } },
{ "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } },
{ "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } },
{ "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } },
{ "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } },
{ "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } },
{ "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } },
// Kernel basic types
{ "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } },
{ "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } },
{ "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } },
{ "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } },
{ "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } },
{ "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } },
{ "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } },
{ "constarg:", KernelArg::ConstArg, { 0, 0, 0, 0, 0, 0, 0 } },
{ "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } },
{ "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } },
};
const DataTypeConst DataType[] =
{
{ "i8:", KernelArg::Char, },
{ "i16:", KernelArg::Short, },
{ "i32:", KernelArg::Int, },
{ "i64:", KernelArg::Long, },
{ "u8:", KernelArg::UChar, },
{ "u16:", KernelArg::UShort, },
{ "u32:", KernelArg::UInt, },
{ "u64:", KernelArg::ULong, },
{ "float:", KernelArg::Float, },
{ "double:", KernelArg::Double, },
{ "struct:", KernelArg::Struct, },
{ "union:", KernelArg::Union, },
{ "1D:", KernelArg::Image1D, },
{ "2D:", KernelArg::Image2D, },
{ "3D:", KernelArg::Image3D, },
{ "1DB:", KernelArg::Image1DB, },
{ "1DA:", KernelArg::Image1DA, },
{ "2DA:", KernelArg::Image2DA, },
{ "opaque:", KernelArg::Opaque, },
{ "event:", KernelArg::Event, },
{ "sampler:", KernelArg::Sampler, },
{ "half:", KernelArg::Half, },
};
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
struct BufDataConst
{
const char* tagName_; //!< buffer's name
KernelArg::ArgumentType type_; //!< type of argument
struct
{
uint number_ : 1; //!< buffer's number
uint alignment_ : 1; //!< buffer's alignment
uint attribute_ : 1; //!< buffer's read/write attribute
uint reserved : 29; //!< reserved
};
};
static const BufDataConst BufType[] =
{
{ "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } },
{ "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } },
{ "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } },
{ "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } },
{ "c", KernelArg::PointerConst, { 1, 1, 1, 0 } },
{ "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } },
{ "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } },
{ "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } }
};
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
//! The mathlib constants for each kernel execution
static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
bool
expect(const std::string& str, size_t* pos, const std::string& sym)
{
bool result = true;
uint i;
if (*pos == std::string::npos) {
return false;
}
// Check if we have expected symbols
for (i = 0; i < sym.size(); ++i) {
char deb = str[*pos + i];
if (deb != sym[i]) {
result = false;
break;
}
}
if (result) *pos += i;
return result;
}
bool
getword(const std::string& str, size_t* pos, char* sym)
{
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;", *pos);
size_t count = posEnd - *pos;
if (count != 0) {
if (!str.copy(sym, count, *pos)) {
return false;
}
}
sym[count] = 0;
*pos = posEnd + 1;
return true;
}
bool
getstring(const std::string& str, size_t* pos, std::string* out)
{
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(":\n\r;", *pos);
size_t count = posEnd - *pos;
char* sym = new char[count + 1];
if (count != 0) {
if (!str.copy(sym, count, *pos)) {
return false;
}
}
sym[count] = 0;
*out = sym;
delete [] sym;
*pos = posEnd + 1;
return true;
}
bool
getuint(const std::string& str, size_t* pos, uint* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
*val = (*val * 10) + (sym[i] - 0x30);
}
*pos = posEnd + 1;
return true;
}
bool
getuintHex(const std::string& str, size_t* pos, uint* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
}
else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
}
else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
bool
getuint64Hex(const std::string& str, size_t* pos, uint64_t* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
}
else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
}
else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
void
intToStr(size_t value, char* str, size_t size)
{
static const uint MaxDigits32bit = 10;
char result[MaxDigits32bit];
uint idx = MaxDigits32bit;
do {
idx--;
result[idx] = static_cast<char>((value % 10) + '0');
value /= 10;
} while ((value != 0) && (idx > 0));
size_t len = MaxDigits32bit - idx;
size_t n = std::min<size_t>(len, size-1);
memcpy(str, &result[idx], n);
str[n] = '\0';
}
//! Default destructor
CalImageReference::~CalImageReference()
{
// Free CAL image
free(image_);
}
KernelArg::KernelArg()
: type_(KernelArg::None)
, size_(0)
, cbIdx_(0)
, cbPos_(0)
, index_(0)
, alignment_(1)
, dataType_(KernelArg::None)
{
name_ = "";
buf_ = "";
memory_.value_ = 0;
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
}
KernelArg::KernelArg(const KernelArg& data)
{
// Fill the new object
*this = data;
}
KernelArg&
KernelArg::operator=(const KernelArg& data)
{
// Fill the fields of the current object
name_ = data.name_;
typeName_ = data.typeName_;
typeQualifier_ = data.typeQualifier_;
type_ = data.type_;
size_ = data.size_;
cbIdx_ = data.cbIdx_;
cbPos_ = data.cbPos_;
buf_ = data.buf_;
index_ = data.index_;
alignment_ = data.alignment_;
dataType_ = data.dataType_;
memory_.value_ = data.memory_.value_;
return *this;
}
bool
KernelArg::isCbNeeded() const
{
//! \note not a safe way
bool result = ((type_ > None) && (type_ < Sampler)) ? true : false;
if ((type_ == Sampler) && (location_ == 0)) {
// Sampler is defined outside the kernel
result = true;
}
return result;
}
size_t
KernelArg::size(bool gpuLayer)const
{
switch (type_) {
case None:
return 0;
case PointerConst:
case PointerHwConst:
case PointerGlobal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
case Sampler:
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
case Counter:
return (gpuLayer) ? 0 : sizeof(cl_mem);
case PointerLocal:
case PointerHwLocal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
case PointerPrivate:
case PointerHwPrivate:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
case Float:
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
case Double:
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
case Char:
case UChar:
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
case Short:
case UShort:
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
case Int:
case UInt:
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
case Long:
case ULong:
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
case Struct:
case Union:
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
default:
return 0;
}
}
cl_kernel_arg_address_qualifier
KernelArg::addressQualifier() const
{
switch (type_) {
case PointerGlobal:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PointerLocal:
case PointerHwLocal:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
case PointerConst:
case PointerHwConst:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
}
cl_kernel_arg_access_qualifier
KernelArg::accessQualifier() const
{
switch (type_) {
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
if (memory_.readOnly_) {
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
}
else if (memory_.writeOnly_) {
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
}
else if (memory_.readWrite_) {
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
// Fall through ...
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
//! temporary solution for the vectors handling in compiler
size_t
KernelArg::specialVector() const
{
if (size_ > VectorSizeLimit) {
switch (type_) {
case Char:
case UChar:
return sizeof(cl_char);
case Short:
case UShort:
return sizeof(cl_short);
default:
return 0;
}
}
return 0;
}
clk_value_type_t
KernelArg::type()const
{
switch (type_) {
case PointerGlobal:
case PointerLocal:
case PointerHwLocal:
case PointerConst:
case PointerHwConst:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
case Counter:
return T_POINTER;
case Float:
return T_FLOAT;
case Double:
return T_DOUBLE;
case Char:
case UChar:
return T_CHAR;
case Short:
case UShort:
return T_SHORT;
case Int:
return T_INT;
case UInt:
//! \note No UINT type
return T_INT;
case Long:
return T_LONG;
case ULong:
//! \note No ULONG type
return T_LONG;
case Struct:
case Union:
//! @todo What should we report?
return T_CHAR;
case Sampler:
return T_SAMPLER;
case PointerPrivate:
case PointerHwPrivate:
case None:
default:
return T_VOID;
}
}
NullKernel::NullKernel(
const std::string& name,
const NullDevice& gpuNullDev,
const NullProgram& nullprog)
: device::Kernel(name)
, buildError_(CL_BUILD_PROGRAM_FAILURE)
, gpuDev_(gpuNullDev)
, prog_(nullprog)
, calRef_(NULL)
, internal_(false)
, flags_(0)
, cbSizes_(NULL)
, numCb_(0)
, rwAttributes_(false)
, instructionCnt_(4)
{
// UAV raw index will be detected
uavRaw_ = UavIdUndefined;
// Initialize UAV arena index(should be 8)
uavArena_ = VirtualGPU::UavArena;
// CB index will be detected
cbId_ = UavIdUndefined;
// Printf index will be detected
printfId_ = UavIdUndefined;
}
NullKernel::~NullKernel()
{
uint idx;
if (calRef_ == NULL) {
return;
}
calRef_->release();
// Destroy all kernel arguments
for (idx = 0; idx < arguments_.size(); ++idx) {
delete arguments_[idx];
}
arguments_.clear();
// Destroy all sampler kernel arguments
for (idx = 0; idx < intSamplers_.size(); ++idx) {
delete intSamplers_[idx];
}
intSamplers_.clear();
}
static int
scComponentToArrayIndex(E_SC_COMPONENT dstComp)
{
switch (dstComp) {
case SC_COMPONENT_X:
return 0;
case SC_COMPONENT_Y:
return 1;
case SC_COMPONENT_Z:
return 2;
case SC_COMPONENT_W:
return 3;
}
return 0;
}
static void
addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding)
{
uint count = shader->dep.NumIntrlIConstants;
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
// only suppport loop consts (int consts)
if (count) {
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
uint usedConsts = 0;
for (uint i = 0; i < count; ++i) {
uint currentConst;
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
if (allocatedconsts[currentConst].addr ==
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
break;
}
}
if (currentConst == usedConsts) {
usedConsts++;
assert(usedConsts <= count);
}
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
allocatedconsts[currentConst].type = AMU_ABI_INT32;
allocatedconsts[currentConst].value.
int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
}
encoding.litConstsCount = usedConsts;
}
}
bool
NullKernel::create(
const std::string& code,
const std::string& metadata,
const void* binaryCode,
size_t binarySize)
{
std::auto_ptr<uint> uavRefCount (new uint[MaxUavArguments]);
if (NULL == uavRefCount.get()) {
return false;
}
// Set all ref counts to 0
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
// parse the metadata fields
if (!parseArguments(metadata, uavRefCount.get())) {
return false;
}
CALimage calImage;
// Save source if DEBUG build
#if DEBUG
ilSource_ = code;
#endif // DEBUG
amd::option::Options *options = nullProg().getCompilerOptions();
internal_ = options->oVariables->clInternalKernel;
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
acl_error err;
std::string arch = GPU_TARGET_INFO_ARCH;
if (nullDev().settings().use64BitPtr_) {
arch += "64";
}
aclTargetInfo info = aclGetTargetInfo(
arch.c_str(), nullDev().hwInfo()->targetName_, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclGetTargetInfo failed");
return false;
}
aclBinaryOptions binOpts = {0};
binOpts.struct_size = sizeof(binOpts);
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
binOpts.bitness = ELFDATA2LSB;
binOpts.alloc = &::malloc;
binOpts.dealloc = &::free;
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclBinaryInit failed");
return false;
}
if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin,
code.data(), code.size(), aclSOURCE)) {
LogWarning("aclInsertSection failed");
aclBinaryFini(bin);
return false;
}
amd::option::Options* Opts = (amd::option::Options*)bin->options;
// temporary solution to synchronize buildNo between runtime and complib
// until we move runtime inside complib
Opts->setBuildNo(options->getBuildNo());
// pass kernel name to compiler
Opts->setCurrKernelName(name().c_str());
err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(),
ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL);
buildLog_ += aclGetCompilerLog(nullDev().compiler());
if (err != ACL_SUCCESS) {
LogWarning("aclCompile failed");
aclBinaryFini(bin);
return false;
}
if (!options->oVariables->BinEXE) {
// Early exit if binary doesn't contain EXE
aclBinaryFini(bin);
return true;
}
size_t len;
const void* isa = aclExtractSection(nullDev().compiler(), bin,
&len, aclTEXT, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclExtractSection failed");
aclBinaryFini(bin);
return false;
}
uint calImageSize;
if (!createMultiBinary(
&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
LogWarning("initSrcEncoding failed");
aclBinaryFini(bin);
return false;
}
aclBinaryFini(bin);
}
else if ((binaryCode != NULL) && (binarySize != 0)) {
uint size = 0;
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode))
|| size > binarySize) {
buildLog_ += "Invalid binary image";
LogError("amuABIMultiBinaryGetSize failed!");
return false;
}
calImage = static_cast<CALimage>(malloc(size));
::memcpy(calImage, binaryCode, size);
}
else {
LogError("Incorrect initialization parameters!");
return false;
}
calRef_ = new CalImageReference(calImage);
if (calRef_ == NULL) {
LogError("Memory allocation failure!");
// Free CAL image
free(calImage);
return false;
}
CALfuncInfo calFuncInfo;
// Get kernel compiled information
getFuncInfoFromImage(calImage, &calFuncInfo);
if (calFuncInfo.maxScratchRegsNeeded > 0) {
LogPrintfInfo("%s kernel has register spilling."
"Lower performance is expected.", name().c_str());
}
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
if (!createSignature(params)) {
return false;
}
return true;
}
size_t
NullKernel::getCalBinarySize() const
{
CALuint imageSize;
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
LogError("Failed to get the image size!");
return 0;
}
return static_cast<size_t>(imageSize);
}
bool
NullKernel::getCalBinary(void* binary, size_t size) const
{
uint calImageSize = 0;
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage())
|| size < calImageSize) {
LogError("CAL failed to save the kernel binary!");
return false;
}
::memcpy(binary, calImage(), calImageSize);
return true;
}
bool
Kernel::create(
const std::string& code,
const std::string& metadata,
const void* binaryCode,
size_t binarySize)
{
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
return false;
}
// initialize constant buffer sizes
if (!initConstBuffers()) {
return false;
}
// Initialize the kernel parameters
bool result = initParameters();
if (!dev().heap()->isVirtual()) {
amd::option::Options *options = nullProg().getCompilerOptions();
// @todo Remove this. This is a hack for no VM mode
if (!options->oVariables->EnableDumpKernel) {
if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
!name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
blitKernelHack_ = true;
}
}
}
if (result) {
buildError_ = CL_SUCCESS;
}
else {
result = false;
}
return result;
}
Kernel::Kernel(
const std::string& name,
const Device& gpuDev,
const Program& prog,
const InitData* initData)
: NullKernel(name, gpuDev, prog)
, blitKernelHack_(false)
{
hwPrivateSize_ = 0;
if (NULL != initData) {
flags_ = initData->flags_;
hwPrivateSize_ = initData->hwPrivateSize_;
hwLocalSize_ = initData->hwLocalSize_;
}
// Workgroup info private memory size
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
hsa_ = false;
}
Kernel::~Kernel()
{
if (calRef_ == NULL) {
return;
}
{
Device::ScopedLockVgpus lock(dev());
// Release all virtual image objects on all virtual GPUs
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->releaseKernel(calImage());
}
}
if (0 != numCb_) {
delete [] cbSizes_;
}
}
const Device&
Kernel::dev() const
{
return reinterpret_cast<const Device&>(gpuDev_);
}
const Program&
Kernel::prog() const
{
return reinterpret_cast<const Program&>(prog_);
}
bool
NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa)
{
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
bool result = false;
AMUabiAddEncoding encoding;
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
size_t allocSize =
sizeof(uint) * MaxReadImage +
sizeof(CALUavEntry) * MaxUavArguments +
sizeof(CALSamplerMapEntry) * MaxSamplers +
sizeof(CALConstantBufferMask) * MaxConstBuffers +
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
char* tmpMem = new char[allocSize];
if (tmpMem == NULL) {
LogError("Error allocating memory");
return false;
}
CalcPtr(encoding.inputs, tmpMem, 0, 0);
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers);
if (shader->dep.NumIntrlIConstants != 0) {
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers);
}
AMUabiMultiBinary amuBinary;
amuABIMultiBinaryCreate(&amuBinary);
if (nullDev().settings().siPlus_) {
result = siCreateHwInfo(shader, encoding);
}
else {
result = r800CreateHwInfo(shader, encoding);
}
if (!result) {
delete [] tmpMem;
LogWarning("Error Creating program info");
return false;
}
addLoopConst(shader, encoding);
unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0;
unsigned int symbolCount=0;
CALOutputEntry* outputs=0;
unsigned int* globalBuffers=0;
unsigned int* persistentBuffers=0;
AMUabiUserSymbol* symbols=0;
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
CALConstantBufferMask* constBuffers = encoding.constBuffers;
uint* inputResources = encoding.inputs;
CALUavEntry* uav = encoding.uav;
uint inputSamplerCount = samplerSize();
for (uint i = 0; i < inputSamplerCount; ++i) {
inputSamplers[i].resource = 0;
inputSamplers[i].sampler = sampler(i)->index_;
}
uint constBufferCount = 2;
constBuffers[0].index = 0;
constBuffers[1].index = 1;
uint inputResourceCount = 0;
uint uavCount = 0;
bool globalBound = false;
bool cbBound = false;
bool printfBound = false;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
constBuffers[constBufferCount++].index = arg->index_;
break;
case KernelArg::PointerGlobal:
if (nullDev().settings().useAliases_) {
if (!globalBound) {
uav[uavCount].offset = uavRaw_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
if (uavArena_ != 0) {
uav[uavCount].offset = uavArena_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
}
globalBound = true;
}
else {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
break;
case KernelArg::ConstBufId:
if (!cbBound) {
uav[uavCount].offset = cbId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
cbBound = true;
break;
case KernelArg::PrintfBufId:
if (!printfBound) {
uav[uavCount].offset = printfId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
printfBound = true;
break;
case KernelArg::UavId:
if (!nullDev().settings().useAliases_ &&
(UavIdUndefined != uavRaw_) &&
!(flags() & PrintfOutput)) {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
else {
if (UavIdUndefined != uavRaw_) {
uav[uavCount].offset = uavRaw_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
if (uavArena_ != 0) {
uav[uavCount].offset = uavArena_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
}
break;
case KernelArg::Sampler:
inputSamplers[inputSamplerCount].resource = 0;
inputSamplers[inputSamplerCount].sampler = arg->index_;
inputSamplerCount++;
break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA:
if (arg->memory_.readOnly_) {
inputResources[inputResourceCount++] = arg->index_;
}
else {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
uav[uavCount].dimension = AMU_ABI_DIM_2D;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
break;
default:
break;
}
}
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
}
encoding.machine = nullDev().hwInfo()->machine_;
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
encoding.inputCount = inputResourceCount;
encoding.outputCount = outputCount;
encoding.outputs = outputs;
encoding.condOut = condOut;
encoding.earlyExit = earlyExit;
encoding.globalBuffersCount = globalCount;
encoding.globalBuffers = globalBuffers;
encoding.persistentBuffersCount = persistentCount;
encoding.persistentBuffers = persistentBuffers;
encoding.constBuffersCount = constBufferCount;
encoding.inputSamplerMapCount = inputSamplerCount;
encoding.symbolsCount = symbolCount;
encoding.symbols = symbols;
encoding.uavCount = uavCount;
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
amuABIMultiBinaryDestroy(amuBinary);
delete [] tmpMem;
delete [] encoding.progInfos;
return (success == 0) ? false : true;
}
void
Kernel::findLocalWorkSize(
size_t workDim,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const
{
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
size_t thrPerGrp;
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if ((flags() & ImageEnable) &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) &&
((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) &&
((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if ((flags() & ImageWrite) ||
(thrPerGrp != nullDev().info().maxWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--);
lclWorkSize[d] = div;
tmp /= div;
}
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else
{
assert(0 && "Invalid workDim!");
}
}
}
}
else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
void
Kernel::setupProgramGrid(
VirtualGPU& gpu,
size_t workDim,
const amd::NDRange& glbWorkOffset,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize,
const amd::NDRange& groupOffset,
const amd::NDRange& glbWorkOffsetOrg,
const amd::NDRange& glbWorkSizeOrg
) const
{
// ABI is always in CB0
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* pGlobalSize = reinterpret_cast<uint*>
(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
uint* pLocalSize = reinterpret_cast<uint*>
(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
uint* pNumGroups = reinterpret_cast<uint*>
(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
uint* pGlobalOffset = reinterpret_cast<uint*>
(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
uint* pGroupOffset = reinterpret_cast<uint*>
(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
uint32_t* debugInfo = reinterpret_cast<uint*>
(cbBuf + DebugOffset * ConstBuffer::VectorSize);
uint* pNDRangeGlobalOffset = reinterpret_cast<uint*>
(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
// Check for 64-bit metadata
uint glbABIShift = (abi64Bit()) ? 1 : 0;
ProgramGrid* progGrid = &gpu.cal_.progGrid_;
// Finds local workgroup size
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
// Initialize the execution grid block and size/offset
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
pGlobalSize[3] = static_cast<uint>(workDim);
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
pLocalSize[3] = 0;
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
pNumGroups[3] = 0;
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
progGrid->gridBlock.width =
progGrid->gridBlock.height =
progGrid->gridBlock.depth = 1;
progGrid->gridSize.width =
progGrid->gridSize.height =
progGrid->gridSize.depth = 1;
progGrid->partialGridBlock.width =
progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 1;
bool partialGrid = false;
// Fill the right values, based on the application request
switch (workDim) {
case 3:
pLocalSize[2] =
progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
pGlobalOffset[2] = glbWorkOffset[2];
pGroupOffset[2] = groupOffset[2];
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
if (progGrid->partialGridBlock.depth != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.depth++;
pNumGroups[2]++;
}
else {
progGrid->partialGridBlock.depth = lclWorkSize[2];
}
}
// Fall through to fill 2D and 1D dimensions...
case 2:
pLocalSize[1] =
progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
progGrid->gridSize.height /= progGrid->gridBlock.height;
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
pGlobalOffset[1] = glbWorkOffset[1];
pGroupOffset[1] = groupOffset[1];
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
if (progGrid->partialGridBlock.height != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.height++;
pNumGroups[1]++;
}
else {
progGrid->partialGridBlock.height = lclWorkSize[1];
}
}
// Fall through to fill 1D dimension...
case 1:
pLocalSize[0] =
progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
progGrid->gridSize.width /= progGrid->gridBlock.width;
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
pGlobalOffset[0] = glbWorkOffset[0];
pGroupOffset[0] = groupOffset[0];
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
if (progGrid->partialGridBlock.width != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.width++;
pNumGroups[0]++;
}
else {
progGrid->partialGridBlock.width = lclWorkSize[0];
}
}
break;
default:
LogWarning("Wrong dimensions. Force to 1x1x1!");
break;
}
if (!partialGrid) {
progGrid->partialGridBlock.width =
progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 0;
}
// Calculate the total number of workitems and workgroups
pGlobalOffset[3] = pGroupOffset[3] = 1;
for (uint i = 0; i < workDim; ++i) {
pGlobalOffset[3] *= pGlobalOffset[i];
pGroupOffset[3] *= pGroupOffset[i];
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
if (abi64Bit()) {
// Setup the debug info in constant buffer
reinterpret_cast<uint64_t*>(debugInfo)[1] =
gpu.printfDbg().bufOffset();
// Size in DWORDs
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[4] /= sizeof(uint32_t);
}
else {
// Setup the debug info in constant buffer
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
// Size in DWORDs
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[2] /= sizeof(uint32_t);
}
}
}
bool
Kernel::initParameters()
{
size_t offset = 0;
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
// Initialize the arguments for the abstraction layer
if (arg->isCbNeeded()) {
desc.name_ = arg->name_.data();
desc.type_ = arg->type();
desc.size_ = arg->size(false);
desc.addressQualifier_ = arg->addressQualifier();
desc.accessQualifier_ = arg->accessQualifier();
desc.typeName_ = arg->typeName();
desc.typeQualifier_ = arg->typeQualifier();
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
}
// Report the allocated local memory size (emulated and hw)
if (hwLocalSize_ != 0) {
CondLog((dev().info().localMemSize_ < hwLocalSize_),
"Requested local size is bigger than reported");
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
if (!createSignature(params)) {
return false;
}
return true;
}
bool
Kernel::bindGlobalHwCb(
VirtualGPU& gpu,
VirtualGPU::GslKernelDesc* desc) const
{
bool result = true;
// Bind HW constant buffers used for the global data store
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (Program::HwConstBuffers::const_iterator it = gds.begin();
(it != gds.end() && result); ++it) {
uint idx = it->first;
result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
}
return result;
}
bool
Kernel::bindConstantBuffers(VirtualGPU& gpu) const
{
bool result = true;
assert((numCb_ <= MaxConstBuffersArguments) &&
"Runtime doesn't support more CBs for arguments!");
// Upload the parameters to HW and bind all constant buffers
for (uint i = 0; i < numCb_; i++) {
ConstBuffer* cb = gpu.constBufs_[i];
result &= cb->uploadDataToHw(cbSizes_[i]) &&
bindResource(gpu, *cb, i, ConstantBuffer, i, NULL, cb->wrtOffset());
}
return result;
}
bool
Kernel::processMemObjects(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const_address params,
bool nativeMem) const
{
bool aliases = false;
VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency();
bool readCache = !internal_ && rwAttributes_ && !dev().settings().assumeAliases_;
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
gpu.memoryDependency().newKernel();
// Check all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const KernelArg* arg = argument(i);
Memory* memory = NULL;
bool readOnly = false;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) &&
(arg->type_ != KernelArg::PointerLocal) &&
(arg->type_ != KernelArg::PointerHwLocal)) {
if (nativeMem) {
memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
}
else if (*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_) != NULL) {
memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_));
// Synchronize data with other memory instances if necessary
memory->syncCacheFromHost(gpu);
}
if (memory != NULL) {
readOnly = arg->memory_.readOnly_;
// Check if read cache optimization is possible
if (readCache) {
// Find if the same buffer was sent to other arguments (aliases)
for (size_t j = i + 1; (j < signature.numParameters()); ++j) {
const amd::KernelParameterDescriptor& descJ = signature.at(j);
const KernelArg* argJ = argument(j);
if (argJ->type_ == KernelArg::PointerGlobal) {
bool readOnlyJ = argJ->memory_.readOnly_;
Memory* memory2 = NULL;
if (nativeMem) {
memory2 = *reinterpret_cast<Memory* const*>
(params + descJ.offset_);
}
else if (*reinterpret_cast<amd::Memory* const*>
(params + descJ.offset_) != NULL) {
memory2 = dev().getGpuMemory(
*reinterpret_cast<amd::Memory* const*>
(params + descJ.offset_));
}
if (memory == memory2) {
if (!readOnly || !readOnlyJ) {
aliases = true;
break;
}
}
}
}
}
// Validate memory for a dependency in the queue
gpu.memoryDependency().validate(gpu, memory, readOnly);
}
}
}
return aliases & readCache;
}
bool
Kernel::loadParameters(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const_address params,
bool nativeMem) const
{
bool result = true;
uint i;
// Initialize local private ranges
if (!initLocalPrivateRanges(gpu)) {
return false;
}
if (!dev().settings().useAliases_ &&
(UavIdUndefined != uavRaw_) &&
(!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
// Bind a buffer for a dummy read
result = bindResource(gpu, *gpuMemory, 0,
ArgumentUavID, uavRaw_);
}
// Find all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
for (i = 0; i != signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
// Set current argument
if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) {
result = false;
break;
}
}
if (result) {
// Update the ring ranges and math constant
setLocalPrivateRanges(gpu);
result = bindConstantBuffers(gpu);
if (dev().settings().useAliases_) {
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBufferArena, uavArena_);
}
else if (flags() & PrivateFixed) {
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
}
}
return result;
}
bool
Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const
{
// 8xx workaround for the number of groups limit in HW
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
const ProgramGrid* programGrid = &gpu.cal()->progGrid_;
ConstBuffer* cb = gpu.numGrpCb();
assert((cb != NULL) && "Runtime must have the constant buffer");
uint32_t* memPtr = reinterpret_cast<uint32_t*>(cb->sysMemCopy());
memPtr[0] = programGrid->gridSize.width;
memPtr[1] = programGrid->gridSize.height;
memPtr[2] = programGrid->gridSize.depth;
memPtr[3] = 0;
memPtr[4] = programGrid->gridBlock.width;
memPtr[5] = programGrid->gridBlock.height;
memPtr[6] = programGrid->gridBlock.depth;
memPtr[7] = 0;
bool result = cb->uploadDataToHw(8 * sizeof(uint32_t));
if (result) {
gpu.setConstantBuffer(SC_INFO_CONSTANTBUFFER,
cb->gslResource(), static_cast<CALuint>(cb->wrtOffset()), cb->hbSize());
}
else {
assert(!"Runtime didn't upload data for NumGroup workaround");
return false;
}
}
if (!gpu.runProgramGrid(*calEvent,
const_cast<ProgramGrid*>(&gpu.cal()->progGrid_), gpu.vmMems(), gpu.cal_.memCount_)) {
LogError("Failed to execute the program!");
return false;
}
// Unbind all resources
unbindResources(gpu, *calEvent, lastRun);
return true;
}
static size_t counter = 0;
void
Kernel::debug(VirtualGPU& gpu) const
{
std::fstream stubWrite;
address src = NULL;
if (!dev().heap()->isVirtual()) {
src = reinterpret_cast<address>
(const_cast<Resource&>(dev().globalMem()).map(&gpu));
}
std::cerr << "--- " << name_ << " ---" << std::endl;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
Memory* gpuMem = gpu.slots_[i].memory_;
std::stringstream fileName;
bool bufferObj =
((arg->type_ == KernelArg::PointerGlobal) ||
(arg->type_ == KernelArg::PointerConst) ||
(arg->type_ == KernelArg::PointerHwConst));
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
std::cerr.setf(std::ios::hex);
uint* location = reinterpret_cast<uint*>
(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() <<
"_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(
reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
}
if (((arg->type_ >= KernelArg::Image1D) &&
(arg->type_ <= KernelArg::Image3D)) ||
((src == NULL) && bufferObj)) {
Memory* resource = gpu.slots_[i].memory_;
void* memory = resource->map(&gpu);
uint* location = reinterpret_cast<uint*>(memory);
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() <<
"_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(
reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
resource->unmap(&gpu);
}
}
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize());
gpu.constBufs_[i]->unmap(&gpu);
stubWrite.close();
}
}
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
uint idx = it->first;
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
(it->second)->unmap(&gpu);
stubWrite.close();
}
}
if (!dev().heap()->isVirtual()) {
const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
}
}
bool
Kernel::initConstBuffers()
{
bool result = true;
size_t i;
assert((numCb_ != 0) && "We have 0 constant buffers!");
// Allocate an array for CB sizes
cbSizes_ = new size_t[numCb_];
if (cbSizes_ == NULL) {
return false;
}
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
// CB0 is reserved for ABI data
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
// Find sizes of all constant buffers
for (i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
size_t size = arg->cbPos_ + arg->size(true);
size_t specVec = arg->specialVector();
if (specVec != 0) {
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) *
ConstBuffer::VectorSize;
}
// Do we need a CB?
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
cbSizes_[arg->cbIdx_] = size;
}
}
return result;
}
bool
Kernel::setInternalSamplers(VirtualGPU& gpu) const
{
for (uint i = 0; i < samplerSize(); ++i) {
const KernelArg* arg = sampler(i);
uint state = arg->cbPos_;
uint idx = arg->index_;
if (gpu.cal()->samplersState_[idx] != state) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
}
return true;
}
bool
Kernel::setArgument(
VirtualGPU& gpu,
uint idx,
const void* param,
size_t size,
bool nativeMem) const
{
bool result = true;
const KernelArg* arg;
address memory;
size_t argSize;
static const bool waitOnBusyEngine = true;
assert((idx < arguments_.size()) && "Param index is out of range!");
arg = argument(idx);
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
memory = gpu.cb(1)->sysMemCopy();
argSize = arg->size(true);
// Bind the global heap for emulation mode
switch (arg->type_) {
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
// Fall through ...
default:
break;
}
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
case KernelArg::PointerGlobal:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
bool forceZeroOffset = false;
if (gpuMem == NULL) {
forceZeroOffset = true;
gpuMem = dev().getGpuMemory(dev().dummyPage());
}
uint64_t offset = gpuMem->pinOffset();
// Make sure the passed argument is a buffer object
if (!gpuMem->cal()->buffer_) {
LogError("The kernel buffer argument isn't a buffer object!");
return false;
}
if (arg->type_ == KernelArg::PointerHwConst) {
// Bind current memory object with the kernel
if (!bindResource(gpu, *gpuMem, idx,
ArgumentConstBuffer, arg->index_, gpuMem)) {
return false;
}
assert((offset == 0) && "No offset for HW CB");
// Add a fake offset to make sure (ptr != NULL) is TRUE
offset = 1;
}
else {
ResourceType type = ArgumentHeapBuffer;
// Check if kernel expects UAV binding
if (arg->memory_.uavBuf_) {
type = ArgumentBuffer;
}
else {
if (blitKernelHack_) {
// Bind global buffer to UAV this buffer is bound to
if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
else {
// Bind global buffer to UAV this buffer is bound to
if (!bindResource(gpu, dev().globalMem(), 0,
GlobalBuffer, uavRaw_)) {
return false;
}
}
}
// Bind current memory object with the kernel
// Note: it's a fake binding, if the buffer is part of
// the global heap
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_, gpuMem)) {
return false;
}
// Update offset only if we bind HeapBuffer or
// it's global address space in UAV setup on SI+
if ((type == ArgumentHeapBuffer) || dev().settings().siPlus_) {
if (!blitKernelHack_) {
offset += gpuMem->hbOffset();
if (!forceZeroOffset) {
assert((offset != 0) && "Offset 0 with a real allocation!");
}
}
gpu.addVmMemory(gpuMem);
}
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
if (forceZeroOffset) {
offset = 0;
}
// Copy memory offset into the constant buffer
if (abi64Bit()) {
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
}
else {
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) =
static_cast<uint>(offset);
}
}
break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
if (gpuMem == NULL) {
return false;
}
// Make sure the passed argument is an image object
if (gpuMem->cal()->buffer_) {
LogError("The kernel image argument isn't an image object!");
return false;
}
ResourceType resType = arg->memory_.readOnly_ ?
ArgumentImageRead : ArgumentImageWrite;
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx,
resType, arg->index_, gpuMem)) {
return false;
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Copy image constants into the constant buffer
if (gpuMem->owner() != NULL) {
copyImageConstants(gpuMem->owner()->asImage(),
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
}
}
break;
case KernelArg::Sampler:
{
amd::Sampler* amdSampler =
*reinterpret_cast<amd::Sampler* const*>(param);
uint idx = arg->index_;
uint32_t state = amdSampler->state();
if (state != gpu.cal()->samplersState_[idx]) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
// Copy sampler state into the constant buffer
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
}
break;
case KernelArg::Counter:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx,
ArgumentCounter, idx, gpuMem)) {
return false;
}
}
break;
case KernelArg::PointerHwLocal:
{
// Calculate current offset in the local ring
uint offset = gpu.cal_.progGrid_.localSize;
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
offset = amd::alignUp(offset, arg->alignment_);
size_t memSize = *static_cast<const uintptr_t*>(param);
// Allocate new memory from the local ring
gpu.cal_.progGrid_.localSize += static_cast<uint>(memSize) + extra;
// Copy current local argument's offset into the CB
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
CondLog((gpu.cal_.progGrid_.localSize > dev().info().localMemSize_),
"Requested local size is bigger than reported!");
}
break;
case KernelArg::Float:
case KernelArg::Double:
case KernelArg::Char:
case KernelArg::UChar:
case KernelArg::Short:
case KernelArg::UShort:
case KernelArg::Int:
case KernelArg::UInt:
case KernelArg::Long:
case KernelArg::ULong:
if (size != argSize) {
LogWarning("Parameter's sizes are unmatched!");
}
// Fall through ...
case KernelArg::Struct:
case KernelArg::Union: {
size_t specVec = arg->specialVector();
if (specVec != 0) {
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
for (uint i = 0; i < iter; ++i) {
amd::Os::fastMemcpy((memory + arg->cbPos_ +
i * ConstBuffer::VectorSize),
reinterpret_cast<const char*>(param) +
i * KernelArg::VectorSizeLimit * specVec,
specVec * KernelArg::VectorSizeLimit);
}
}
else {
// Copy data into the CB
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
}
}
break;
default:
LogError("Unhandled argument's type!");
break;
}
return result;
}
bool
Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const
{
// Initialize HW local
gpu.cal_.progGrid_.localSize = hwLocalSize_;
// Bind the global buffer if emulated local or private memory
// was allocated by the kernel
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) &&
(uavRaw_ != UavIdUndefined)) {
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
// Bind the global buffer if emulated constant buffers are enabled
if (cbId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
return false;
}
}
// Bind the printf buffer
if (printfId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
return false;
}
}
// Initialize the iterations count
gpu.cal_.iterations_ = 1;
return true;
}
void
Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const
{
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* data;
uint gridSize =
gpu.cal()->progGrid_.gridSize.width *
gpu.cal()->progGrid_.gridSize.height *
gpu.cal()->progGrid_.gridSize.depth;
uint blockSize =
gpu.cal()->progGrid_.gridBlock.width *
gpu.cal()->progGrid_.gridBlock.height *
gpu.cal()->progGrid_.gridBlock.depth;
//! \todo validate if the compiler still generates PrivateFixed
if (flags() & PrivateFixed) {
// Update private ring
data = reinterpret_cast<uint*>
(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
if (abi64Bit()) {
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
data[2] = 0;
data[3] = 0;
}
else {
data[0] = static_cast<uint>(gpuMemory->hbOffset());
data[1] = 0;
data[2] = data[3] = 0;
}
gpu.addVmMemory(gpuMemory);
}
// Copy the math lib constants
amd::Os::fastMemcpy(
(cbBuf + MathLibOffset * ConstBuffer::VectorSize),
MathLibConst, sizeof(MathLibConst));
// Update the offset to the global data
if (prog().glbData() != NULL) {
gpu.addVmMemory(prog().glbData());
uint64_t glbDataOffset = prog().glbData()->hbOffset();
if (abi64Bit()) {
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset *
ConstBuffer::VectorSize) = glbDataOffset;
}
else {
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset *
ConstBuffer::VectorSize) = static_cast<uint>(glbDataOffset);
}
}
// Split workload if it was requested
if ((gpu.cal_.iterations_ < 2) &&
gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
uint totalSize = gridSize * blockSize;
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_,
(totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
}
}
// Initialize the number of iterations to the grid size
if (flags() & PrintfOutput) {
gpu.cal_.iterations_ = gridSize;
}
}
void
Kernel::setSampler(
VirtualGPU& gpu,
uint32_t state,
uint physUnit
) const
{
// All CAL sampler's parameters are in floats
float gslAddress = GSL_CLAMP_TO_BORDER;
float gslMinFilter = GSL_MIN_NEAREST;
float gslMagFilter = GSL_MAG_NEAREST;
state &= ~amd::Sampler::StateNormalizedCoordsMask;
// Program the sampler address mode
switch (state & amd::Sampler::StateAddressMask) {
case amd::Sampler::StateAddressRepeat:
gslAddress = GSL_REPEAT;
break;
case amd::Sampler::StateAddressClampToEdge:
gslAddress = GSL_CLAMP_TO_EDGE;
break;
case amd::Sampler::StateAddressMirroredRepeat:
gslAddress = GSL_MIRRORED_REPEAT;
break;
case amd::Sampler::StateAddressClamp:
case amd::Sampler::StateAddressNone:
default:
break;
}
state &= ~amd::Sampler::StateAddressMask;
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
// Program texture filter mode
if (state == amd::Sampler::StateFilterLinear) {
gslMinFilter = GSL_MIN_LINEAR;
gslMagFilter = GSL_MAG_LINEAR;
}
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
}
bool
Kernel::bindResource(
VirtualGPU& gpu,
const Resource& resource,
uint paramIdx,
ResourceType type,
uint physUnit,
Memory* memory,
size_t offset) const
{
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
// Find the original resource name from the IL program
switch (type) {
case GlobalBuffer:
if (gpu.state_.boundGlobal_) {
return true;
}
gpu.state_.boundGlobal_ = true;
physUnit = uavRaw_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case GlobalBufferArena:
if (gpu.state_.boundGlobal_) {
return true;
}
gpu.state_.boundGlobal_ = true;
physUnit = uavArena_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentCbID:
if (gpu.state_.boundCb_) {
return true;
}
gpu.state_.boundCb_ = true;
physUnit = cbId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentPrintfID:
if (gpu.state_.boundPrintf_) {
return true;
}
gpu.state_.boundPrintf_ = true;
physUnit = printfId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentHeapBuffer:
case ArgumentBuffer:
case ArgumentImageRead:
case ArgumentImageWrite:
case ArgumentConstBuffer:
case ArgumentCounter:
// Early exit if resource is bound already
if (gpu.slots_[paramIdx].state_.bound_) {
return true;
}
// Associate resource with the slot
gpu.slots_[paramIdx].memory_ = memory;
// Mark resource as bound
gpu.slots_[paramIdx].state_.bound_ = true;
if (type == ArgumentCounter) {
GpuEvent calEvent;
// Bind memory with atomic counter
gpu.bindAtomicCounter(argument(paramIdx)->index_,
memory->gslResource());
// Copy the counter value into GDS
gpu.syncAtomicCounter(calEvent, argument(paramIdx)->index_, false);
// Mark resource as busy
memory->setBusy(gpu, calEvent);
return true;
}
else if (type == ArgumentHeapBuffer) {
// We return here, since we just have to bind the global heap
return true;
}
else if (type == ArgumentConstBuffer) {
gpu.slots_[paramIdx].state_.constant_ = true;
}
break;
case ArgumentUavID:
case ConstantBuffer:
break;
default:
LogPrintfError("Unspecified argument type ()!", type);
return false;
}
gslMemObject gslMem = NULL;
// Use global address space on SI+ for UAV setup
if (dev().settings().siPlus_ &&
((type == ArgumentBuffer) || (type == ArgumentCbID) ||
(type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
!blitKernelHack_) {
gslMem = dev().heap()->resource().gslResource();
}
else {
gslMem = resource.gslResource();
}
// Associate memory with the physical unit, the actual binding
bool result = true;
switch (type) {
case GlobalBuffer:
case GlobalBufferArena:
case ArgumentBuffer:
case ArgumentImageWrite:
case ArgumentUavID:
case ArgumentCbID:
case ArgumentPrintfID:
if (type == ArgumentImageWrite) {
uavType = GSL_UAV_TYPE_TYPED;
}
else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
uavType = GSL_UAV_TYPE_TYPELESS;
}
if (gpu.cal_.uavs_[physUnit] != gslMem) {
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
gpu.setUAVChannelOrder(physUnit, gslMem);
gpu.cal_.uavs_[physUnit] = gslMem;
}
else if (!dev().settings().siPlus_)
{
gpu.setUAVChannelOrder(physUnit, gslMem);
}
break;
case ConstantBuffer:
case ArgumentConstBuffer:
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
result = gpu.setConstantBuffer(physUnit,
gslMem, offset, resource.hbSize());
gpu.cal_.constBuffers_[physUnit] = gslMem;
}
break;
case ArgumentImageRead:
if (gpu.cal_.readImages_[physUnit] != gslMem) {
result = gpu.setInput(physUnit, gslMem);
gpu.cal_.readImages_[physUnit] = gslMem;
}
break;
default:
result = false;
assert(false);
break;
}
if (!result) {
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
return false;
}
if ((type == GlobalBuffer) && dev().settings().useAliases_) {
if (uavArena_ != 0) {
if (!setupArenaAliases(gpu, resource)) {
return false;
}
if ((uavArena_ != physUnit) &&
(gpu.cal_.uavs_[uavArena_] != gslMem)) {
gpu.cal_.uavs_[uavArena_] = gslMem;
// Associate memory with the name
if (!gpu.setUAVBuffer(uavArena_, gslMem,
GSL_UAV_TYPE_TYPELESS)) {
LogError("calCtxSetMem failed!");
return false;
}
gpu.setUAVChannelOrder(uavArena_, gslMem);
}
}
}
return true;
}
void
Kernel::unbindResources(
VirtualGPU& gpu,
GpuEvent calEvent,
bool lastRun) const
{
// Make sure unbind occurs on the last run, in case the execution had a split
if (lastRun) {
for (uint i = 0; i < arguments_.size(); ++i) {
if (gpu.slots_[i].state_.bound_) {
GpuEvent calEventTmp = calEvent;
if (KernelArg::Counter == argument(i)->type_) {
// Copy the counter value from GDS
gpu.syncAtomicCounter(calEventTmp, argument(i)->index_, true);
}
else if (!(gpu.slots_[i].state_.constant_ ||
argument(i)->memory_.readOnly_)) {
// Signal the abstraction layer that GPU memory is dirty
if (gpu.slots_[i].memory_->owner() != NULL) {
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
}
}
// Mark resource as busy
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
gpu.slots_[i].state_.value_ = 0;
}
}
// Unbind the global buffer
gpu.state_.boundGlobal_ = false;
// Unbind the constant buffer
gpu.state_.boundCb_ = false;
// Unbind the pritnf buffer
gpu.state_.boundPrintf_ = false;
}
// Mark CB busy
for (uint i = 0; i < numCb_; ++i) {
gpu.constBufs_[i]->setBusy(gpu, calEvent);
}
// 8xx workaround for the number of groups limit in HW
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
ConstBuffer* cb = gpu.numGrpCb();
assert((cb != NULL) && "Runtime must have the constant buffer");
cb->setBusy(gpu, calEvent);
}
// Set the event object for the scratch buffer
if (workGroupInfo()->scratchRegs_ > 0) {
for (uint i = 0; i < dev().scratch(gpu.hwRing())->memObjs_.size(); ++i) {
dev().scratch(gpu.hwRing())->memObjs_[i]->setBusy(gpu, calEvent);
}
}
}
bool
Kernel::setupArenaAliases(VirtualGPU& gpu, const Resource& resource) const
{
const static uint ScArenaUavShortId = 9;
const static uint ScArenaUavByteId = 10;
Resource* buf = &(const_cast<Resource&>(resource));
Resource* alias;
gslMemObject gslMem = NULL;
//
// byte view
//
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R8I);
if (NULL == alias) {
return false;
}
gslMem = alias->gslResource();
if (gpu.cal_.uavs_[ScArenaUavByteId] != gslMem) {
if (!gpu.setUAVBuffer(ScArenaUavByteId,
gslMem, GSL_UAV_TYPE_TYPELESS)) {
return false;
}
gpu.cal_.uavs_[ScArenaUavByteId] = gslMem;
}
//
// short view
//
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R16I);
if (NULL == alias) {
return false;
}
bool result = true;
gslMem = alias->gslResource();
if (gpu.cal_.uavs_[ScArenaUavShortId] != gslMem) {
result = gpu.setUAVBuffer(ScArenaUavShortId,
gslMem, GSL_UAV_TYPE_TYPELESS);
gpu.cal_.uavs_[ScArenaUavShortId] = gslMem;
}
return result;
}
void
Kernel::copyImageConstants(
const amd::Image* amdImage,
ImageConstants* imageData
) const
{
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
imageData->dataType_ =
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
imageData->channelOrder_ =
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
}
union MetadataVersion {
struct {
uint64_t revision_: 16; //!< LLVM metadata revision
uint64_t minorVersion_: 16; //!< LLVM metadata minor verison
uint64_t majorVersion_: 16; //!< LLVM metadata major version
};
uint64_t value_;
MetadataVersion(uint mj, uint mi, uint rev): value_(0)
{
revision_ = rev;
minorVersion_ = mi;
majorVersion_ = mj;
}
MetadataVersion(): value_(0) {}
};
//! Version of metadata with buffer attributes
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
//! Version of metadata with type qualifiers
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
bool
NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
{
// Initialize workgroup info
workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_;
MetadataVersion mdVersion;
// Find first tag
size_t pos = metaData.find(";");
// Loop through all provided program arguments
while (pos != std::string::npos) {
KernelArg arg;
if (!expect(metaData, &pos, ";")) {
break;
}
arg.type_ = KernelArg::None;
// Loop through all available metadata types
for (uint i = 0; i < ArgStateTotal; ++i) {
uint tmpValue;
// Find the name tag
if (expect(metaData, &pos, ArgState[i].typeName_)) {
switch (ArgState[i].type_) {
case KernelArg::None:
// Process next ...
continue;
case KernelArg::Reflection: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
LogWarning("Couldn't get the argument type!");
return false;
}
}
continue;
case KernelArg::ConstArg: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
}
continue;
case KernelArg::Grouping:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size!");
return false;
}
workGroupInfo_.compileSize_[j] = temp;
}
// Process next ...
continue;
case KernelArg::WrkgrpSize: {
uint temp;
// Read the workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the workgroup size!");
return false;
}
workGroupInfo_.size_ = temp;
}
// Process next ...
continue;
case KernelArg::Wavefront:
// Process next ...
continue;
case KernelArg::UavId:
// Read index
if (!getuint(metaData, &pos, &arg.index_)) {
return false;
}
break;
case KernelArg::ConstBufId:
// Read index
if (!getuint(metaData, &pos, &cbId_)) {
return false;
}
continue;
case KernelArg::PrintfBufId:
// Read index
if (!getuint(metaData, &pos, &printfId_)) {
return false;
}
continue;
case KernelArg::MetadataVersion:
// Read metadata version
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.majorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.minorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.revision_ = tmpValue;
// Process next ...
continue;
case KernelArg::GroupingHint:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size hint
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size hint!");
return false;
}
workGroupInfo_.compileSizeHint_[j] = temp;
}
// Process next ...
continue;
case KernelArg::VecTypeHint:
{
std::string temp;
// Read the compile vector type hint
if (!getstring(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile vector type hint!");
return false;
}
workGroupInfo_.compileVecTypeHint_ = temp;
}
// Process next ...
continue;
default:
break;
}
char argName[256];
// Save the argument type
arg.type_ = ArgState[i].type_;
// Check if we should expect the name
if (ArgState[i].name_) {
// Read the parameter's name
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.name_ = argName;
}
if (arg.type_ == KernelArg::Sampler) {
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.location_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the resource data type
if (ArgState[i].resType_) {
uint k;
// Search for the data type
for (k = 0; k < DataTypeTotal; k++) {
if (expect(metaData, &pos, DataType[k].tagName_)) {
arg.dataType_ = DataType[k].type_;
if (arg.type_ == KernelArg::Image) {
flags_ |= ImageEnable;
if (expect(metaData, &pos, "RO:")) {
arg.memory_.readOnly_ = 1;
}
else if (expect(metaData, &pos, "RW:")) {
arg.memory_.readWrite_ = 1;
flags_ |= ImageWrite;
}
else if (expect(metaData, &pos, "WO:")) {
arg.memory_.writeOnly_ = 1;
flags_ |= ImageWrite;
}
}
else if (arg.type_ == KernelArg::Value) {
arg.type_ = DataType[k].type_;
}
break;
}
}
if (k == DataTypeTotal) {
LogWarning("We couldn't find the argument's type.");
if ((arg.type_ == KernelArg::Value) ||
!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
//! @todo temporary condition
if ((arg.type_ == KernelArg::Opaque) ||
(arg.type_ == KernelArg::Sampler)) {
assert(false);
continue;
}
}
// Check if we should expect the data size
if (ArgState[i].size_) {
uint tmpData;
// Read the data size
if (!getuint(metaData, &pos, &tmpData)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.type_ == KernelArg::Image) {
arg.type_ = arg.dataType_;
arg.index_ = tmpData;
}
else {
arg.size_ = tmpData;
}
}
if (arg.type_ == KernelArg::Counter) {
// Read a counter index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a counter index!");
return false;
}
}
// Check if we should expect a resource index
if (ArgState[i].cbIdx_) {
// Read resource index
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
numCb_ = arg.cbIdx_;
}
}
// Check if we should expect the CB offset
if (ArgState[i].cbPos_) {
// Read position in the constant buffer
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the buffer type
if (ArgState[i].buf_) {
// Read the buffer type
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.buf_ = argName;
for (uint k = 0; k < BufTypeTotal; ++k) {
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
// Update the parameter type
arg.type_ = BufType[k].type_;
// Check if we should expect a buffer index
if (BufType[k].number_) {
// Read a buffer index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the required alignment
if (BufType[k].alignment_) {
// Read data alignment
if (!getuint(metaData, &pos, &arg.alignment_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the buffer's attribute
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) &&
BufType[k].attribute_) {
if (expect(metaData, &pos, "RO")) {
arg.memory_.readOnly_ = 1;
}
else if (expect(metaData, &pos, "RW")) {
arg.memory_.readWrite_ = 1;
}
else if (expect(metaData, &pos, "WO")) {
arg.memory_.writeOnly_ = 1;
}
}
// Check for the type qualifier
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) &&
BufType[k].attribute_) {
uint tmp;
pos += 1;
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get volatile type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get restrict type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
}
}
}
}
// Find multiple UAV references
switch (arg.type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
case KernelArg::UavId:
uavRefCount[arg.index_]++;
break;
default:
break;
}
// Check if this argument will be passed in constant buffer
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
if (arg.type_ == KernelArg::Sampler) {
// Serach for the passed by value sampler
for (uint i = 0; i < argSize(); ++i) {
KernelArg* value = arguments_[i];
if (0 == value->name_.compare(arg.name_)) {
value->type_ = arg.type_;
value->index_ = arg.index_;
value->location_ = 0;
break;
}
}
}
else {
KernelArg* argument = new KernelArg(arg);
if (argument != NULL) {
addArgument(argument);
}
else {
LogError("Couldn't allocate memory!");
return false;
}
}
}
// Check if we have a pre-defined sampler
else if (arg.type_ == KernelArg::Sampler) {
KernelArg* sampler = new KernelArg(arg);
if (sampler != NULL) {
addSampler(sampler);
}
else {
LogError("Couldn't allocate memory!");
return false;
}
}
break;
}
}
// Next argument
pos = metaData.find(";", pos);
}
// Find arguments that will require a reallocation
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
// Check if can't use a dedicated UAV,
// so realloc memory in the heap
arg->memory_.realloc_ = isRealloc();
if (nullDev().settings().useAliases_) {
if (uavRefCount[arg->index_] > 1) {
// Multiple accesses, assume this is the heap
uavRaw_ = arg->index_;
}
// Mark argument as an UAV buffer if no aliases or it's not arena
else if (arg->index_ != VirtualGPU::UavArena) {
arg->memory_.uavBuf_ = true;
}
}
else {
arg->memory_.uavBuf_ = true;
}
break;
case KernelArg::PointerHwConst:
arg->memory_.realloc_ = true;
break;
case KernelArg::UavId:
if (!nullDev().settings().useAliases_ ||
(arg->index_ != VirtualGPU::UavArena)) {
uavRaw_ = arg->index_;
}
break;
default:
break;
}
// If argument marked with the const qualifier, then overwrite
// Read-Write attributes, since compiler doesn't mark it properly
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
arg->memory_.readOnly_ = 1;
arg->memory_.readWrite_ = 0;
arg->memory_.writeOnly_ = 0;
}
}
if (!nullDev().settings().useAliases_ &&
(uavRaw_ != UavIdUndefined) &&
!(flags() & PrintfOutput)) {
// Find if default UAV is already assigned to an argument
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (uavRaw_ == arg->index_) {
uavRaw_ = UavIdUndefined;
}
break;
default:
break;
}
}
}
// There is always 1 constant buffer, associated with the kernel
numCb_++;
assert((numCb_ <= MaxConstBuffersArguments) &&
"Runtime doesn't support more than max CBs for arguments!");
// Limit workgroup size if requested
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
size_t temp = 1;
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
for (uint j = 0; j < 3; ++j) {
if (workGroupInfo()->compileSize_[j] != 0) {
temp *= workGroupInfo_.compileSize_[j];
}
}
// Report a compilation error if requested compile size doesn't
// match the required workgroup size
if (workGroupInfo()->size_ < temp) {
char str[8];
intToStr(workGroupInfo_.size_, str, 8);
buildError_ = CL_OUT_OF_RESOURCES;
buildLog_ +=
"Error: Requested compile size is bigger than the required workgroup size of ";
buildLog_ += str;
buildLog_ += " elements\n";
LogError(buildLog().c_str());
return false;
}
}
// Read/Write attributes are provided in metadata
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
rwAttributes_ = true;
}
return true;
}
bool
Kernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
// Check if memory doesn't require reallocation
bool noRealloc = (!argument(idx)->memory_.realloc_ ||
amdMem->reallocedDeviceMemory(&dev()));
return noRealloc;
}
inline static HSAIL_ARG_TYPE
GetHSAILArgType(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return HSAIL_ARGTYPE_POINTER;
case ARG_TYPE_QUEUE:
return HSAIL_ARGTYPE_QUEUE;
case ARG_TYPE_VALUE:
return HSAIL_ARGTYPE_VALUE;
case ARG_TYPE_IMAGE:
return HSAIL_ARGTYPE_IMAGE;
case ARG_TYPE_SAMPLER:
return HSAIL_ARGTYPE_SAMPLER;
case ARG_TYPE_ERROR:
default:
return HSAIL_ARGTYPE_ERROR;
}
}
inline static size_t
GetHSAILArgAlignment(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return argInfo->arg.pointer.align;
default:
return 1;
}
}
inline static HSAIL_ADDRESS_QUALIFIER
GetHSAILAddrQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT_EMU:
case PTR_MT_CONSTANT:
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return HSAIL_ADDRESS_LOCAL;
case PTR_MT_SCRATCH_EMU:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_ERROR:
default:
LogError("Unsupported address type");
return HSAIL_ADDRESS_ERROR;
}
}
else if ((argInfo->type == ARG_TYPE_IMAGE) ||
(argInfo->type == ARG_TYPE_SAMPLER)) {
return HSAIL_ADDRESS_GLOBAL;
}
else if (argInfo->type == ARG_TYPE_QUEUE) {
return HSAIL_ADDRESS_GLOBAL;
}
return HSAIL_ADDRESS_ERROR;
}
/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE
GetHSAILDataType(const aclArgData* argInfo)
{
aclArgDataType dataType;
if (argInfo->type == ARG_TYPE_POINTER) {
dataType = argInfo->arg.pointer.data;
}
else if (argInfo->type == ARG_TYPE_VALUE) {
dataType = argInfo->arg.value.data;
}
else {
return HSAIL_DATATYPE_ERROR;
}
switch (dataType) {
case DATATYPE_i1:
return HSAIL_DATATYPE_B1;
case DATATYPE_i8:
return HSAIL_DATATYPE_S8;
case DATATYPE_i16:
return HSAIL_DATATYPE_S16;
case DATATYPE_i32:
return HSAIL_DATATYPE_S32;
case DATATYPE_i64:
return HSAIL_DATATYPE_S64;
case DATATYPE_u8:
return HSAIL_DATATYPE_U8;
case DATATYPE_u16:
return HSAIL_DATATYPE_U16;
case DATATYPE_u32:
return HSAIL_DATATYPE_U32;
case DATATYPE_u64:
return HSAIL_DATATYPE_U64;
case DATATYPE_f16:
return HSAIL_DATATYPE_F32;
case DATATYPE_f32:
return HSAIL_DATATYPE_F32;
case DATATYPE_f64:
return HSAIL_DATATYPE_F64;
case DATATYPE_struct:
return HSAIL_DATATYPE_STRUCT;
case DATATYPE_opaque:
return HSAIL_DATATYPE_OPAQUE;
case DATATYPE_ERROR:
default:
return HSAIL_DATATYPE_ERROR;
}
}
inline static int
GetHSAILArgSize(const aclArgData *argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_VALUE:
switch (GetHSAILDataType(argInfo)) {
case HSAIL_DATATYPE_B1:
return 1;
case HSAIL_DATATYPE_B8:
case HSAIL_DATATYPE_S8:
case HSAIL_DATATYPE_U8:
return 1;
case HSAIL_DATATYPE_B16:
case HSAIL_DATATYPE_U16:
case HSAIL_DATATYPE_S16:
case HSAIL_DATATYPE_F16:
return 2;
case HSAIL_DATATYPE_B32:
case HSAIL_DATATYPE_U32:
case HSAIL_DATATYPE_S32:
case HSAIL_DATATYPE_F32:
return 4;
case HSAIL_DATATYPE_B64:
case HSAIL_DATATYPE_U64:
case HSAIL_DATATYPE_S64:
case HSAIL_DATATYPE_F64:
return 8;
case HSAIL_DATATYPE_STRUCT:
return argInfo->arg.value.numElements;
default:
return -1;
}
case ARG_TYPE_POINTER:
case ARG_TYPE_IMAGE:
case ARG_TYPE_SAMPLER:
case ARG_TYPE_QUEUE:
return sizeof(void*);
default:
return -1;
}
}
inline static clk_value_type_t
GetOclType(const aclArgData* argInfo)
{
static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
{ T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
{ T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
{ T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
};
uint sizeType;
if (argInfo->type == ARG_TYPE_QUEUE) {
return T_QUEUE;
}
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
return T_POINTER;
}
else if (argInfo->type == ARG_TYPE_VALUE) {
switch (argInfo->arg.value.data) {
case DATATYPE_i8:
case DATATYPE_u8:
sizeType = 0;
break;
case DATATYPE_i16:
case DATATYPE_u16:
sizeType = 1;
break;
case DATATYPE_i32:
case DATATYPE_u32:
sizeType = 2;
break;
case DATATYPE_i64:
case DATATYPE_u64:
sizeType = 3;
break;
case DATATYPE_f16:
case DATATYPE_f32:
sizeType = 4;
break;
case DATATYPE_f64:
sizeType = 5;
break;
default:
return T_VOID;
}
switch (argInfo->arg.value.numElements) {
case 1: return ClkValueMapType[sizeType][0];
case 2: return ClkValueMapType[sizeType][1];
case 3: return ClkValueMapType[sizeType][2];
case 4: return ClkValueMapType[sizeType][3];
case 8: return ClkValueMapType[sizeType][4];
case 16: return ClkValueMapType[sizeType][5];
default: return T_VOID;
}
}
else if (argInfo->type == ARG_TYPE_SAMPLER) {
return T_SAMPLER;
}
else {
return T_VOID;
}
}
inline static cl_kernel_arg_address_qualifier
GetOclAddrQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
}
else if (argInfo->type == ARG_TYPE_IMAGE) {
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
}
//default for all other cases
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
inline static cl_kernel_arg_access_qualifier
GetOclAccessQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_IMAGE) {
switch (argInfo->arg.image.type) {
case ACCESS_TYPE_RO:
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
case ACCESS_TYPE_WO:
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
case ACCESS_TYPE_RW:
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
return CL_KERNEL_ARG_ACCESS_NONE;
}
inline static cl_kernel_arg_type_qualifier
GetOclTypeQual(const aclArgData* argInfo)
{
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
if (argInfo->type == ARG_TYPE_POINTER) {
if (argInfo->arg.pointer.isVolatile) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (argInfo->arg.pointer.isRestrict) {
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
if (argInfo->arg.pointer.isPipe) {
rv |= CL_KERNEL_ARG_TYPE_PIPE;
}
if (argInfo->isConst) {
rv |= CL_KERNEL_ARG_TYPE_CONST;
}
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
rv |= CL_KERNEL_ARG_TYPE_CONST;
break;
default:
break;
}
}
return rv;
}
static int
GetOclSize(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER: return sizeof(void *);
case ARG_TYPE_VALUE:
//! \note OCL 6.1.5. For 3-component vector data types,
//! the size of the data type is 4 * sizeof(component).
switch (argInfo->arg.value.data) {
case DATATYPE_struct:
return 1 * argInfo->arg.value.numElements;
case DATATYPE_i8:
case DATATYPE_u8:
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u16:
case DATATYPE_i16:
case DATATYPE_f16:
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u32:
case DATATYPE_i32:
case DATATYPE_f32:
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_i64:
case DATATYPE_u64:
case DATATYPE_f64:
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_ERROR:
default: return -1;
}
case ARG_TYPE_IMAGE: return sizeof(cl_mem);
case ARG_TYPE_SAMPLER: return sizeof(cl_sampler);
case ARG_TYPE_QUEUE: return sizeof(cl_command_queue);
default: return -1;
}
}
void
HSAILKernel::initArgList(const aclArgData* aclArg)
{
// Initialize the hsail argument list too
initHsailArgs(aclArg);
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
// Reserved arguments for HSAIL launch
aclArg += ExtraArguments;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(aclArg);
desc.addressQualifier_ = GetOclAddrQual(aclArg);
desc.accessQualifier_ = GetOclAccessQual(aclArg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
}
else {
desc.size_ = GetOclSize(aclArg);
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
flags_.imageEna_ = true;
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
flags_.imageWriteEna_ = true;
}
}
}
createSignature(params);
}
void
HSAILKernel::initHsailArgs(const aclArgData* aclArg)
{
int offset = 0;
// Reserved arguments for HSAIL launch
aclArg += ExtraArguments;
// Iterate through the each kernel argument
for (; aclArg->struct_size != 0; aclArg++) {
Argument* arg = new Argument;
// Initialize HSAIL kernel argument
arg->name_ = aclArg->argStr;
arg->typeName_ = aclArg->typeStr;
arg->size_ = GetHSAILArgSize(aclArg);
arg->offset_ = offset;
arg->type_ = GetHSAILArgType(aclArg);
arg->addrQual_ = GetHSAILAddrQual(aclArg);
arg->dataType_ = GetHSAILDataType(aclArg);
// If vector of args we add additional arguments to flatten it out
arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) &&
(aclArg->arg.value.data != DATATYPE_struct)) ?
aclArg->arg.value.numElements : 1;
arg->alignment_ = GetHSAILArgAlignment(aclArg);
offset += GetHSAILArgSize(aclArg);
arguments_.push_back(arg);
}
}
void
HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
{
PrintfInfo info;
uint index = 0;
for (; aclPrintf->struct_size != 0; aclPrintf++) {
index = aclPrintf->ID;
if (printf_.size() <= index) {
printf_.resize(index + 1);
}
info.fmtString_ = aclPrintf->fmtStr;
info.fmtString_ += "\n";
uint32_t *tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) {
info.arguments_.push_back(*tmp_ptr);
}
printf_[index] = info;
info.arguments_.clear();
}
}
HSAILKernel::HSAILKernel(std::string name,
HSAILProgram* prog,
std::string compileOptions)
: device::Kernel(name)
, compileOptions_(compileOptions)
, dev_(prog->dev())
, prog_(*prog)
, index_(0)
, code_(NULL)
, hwMetaData_(NULL)
{
hsa_ = true;
}
HSAILKernel::~HSAILKernel()
{
while (!arguments_.empty()) {
Argument* arg = arguments_.back();
delete arg;
arguments_.pop_back();
}
delete [] hwMetaData_;
delete code_;
}
bool
HSAILKernel::init()
{
acl_error error;
//compile kernel down to ISA
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
std::string options(compileOptions_.c_str());
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
// Get the ISA out
size_t size_isa;
void* shader_isa = NULL;
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(),
options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
if (error != ACL_SUCCESS) {
LogError("Failed to finalize");
return false;
}
shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
if (shader_isa == NULL) {
LogError("Failed find the ISA");
return false;
}
aqlCreateHWInfo(shader_isa, size_isa);
// Pull out metadata from the ELF
size_t sizeOfArgList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
char* aclArgList = new char[sizeOfArgList];
if (NULL == aclArgList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the argList
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
delete [] aclArgList;
size_t sizeOfWorkGroupSize;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().getAttribs().wavefrontSize;
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ =
workGroupInfo_.compileSize_[0] *
workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
}
else {
workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Make sure kernel has any printf info
if (0 != sizeOfPrintfList) {
char* aclPrintfList = new char[sizeOfPrintfList];
if (NULL == aclPrintfList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
&sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the PrintfList
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
delete [] aclPrintfList;
}
size_t sizeOfDevice;
bool hasKernelEnqueue = false;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
&hasKernelEnqueue, &sizeOfDevice);
if (error != ACL_SUCCESS) {
return false;
}
flags_.dynamicParallelism_ = hasKernelEnqueue;
int index = -1;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_KERNEL_INDEX, openClKernelName.c_str(),
&index, &sizeOfDevice);
if (error != ACL_SUCCESS) {
return false;
}
index_ = static_cast<uint>(index);
return true;
}
bool
HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
// Check if memory doesn't require reallocation
bool noRealloc = true;
//amdMem->reallocedDeviceMemory(&dev()));
return noRealloc;
}
const Device&
HSAILKernel::dev() const
{
return reinterpret_cast<const Device&>(dev_);
}
const HSAILProgram&
HSAILKernel::prog() const
{
return reinterpret_cast<const HSAILProgram&>(prog_);
}
void
HSAILKernel::findLocalWorkSize(
size_t workDim,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const
{
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
size_t thrPerGrp;
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) &&
((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) &&
((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ ||
(thrPerGrp != dev().info().maxWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--);
lclWorkSize[d] = div;
tmp /= div;
}
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else
{
assert(0 && "Invalid workDim!");
}
}
}
}
else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
inline static void
WriteAqlArg(
unsigned char** dst,//!< The write pointer to the buffer
const void* src, //!< The source pointer
uint size, //!< The size in bytes to copy
uint alignment = 0 //!< The alignment to follow while writing to the buffer
)
{
if (alignment == 0) {
*dst = amd::alignUp(*dst, size);
}
else {
*dst = amd::alignUp(*dst, alignment);
}
memcpy(*dst, src, size);
*dst += size;
}
HsaAqlDispatchPacket*
HSAILKernel::loadArguments(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const amd::NDRangeContainer& sizes,
const_address parameters,
bool nativeMem,
uint64_t vmDefQueue,
uint64_t* vmParentWrap,
std::vector<const Resource*>& memList) const
{
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
// The HLC generates 3 additional arguments for the global offsets
//and fourth argument is the printf_buffer pointer
size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
for (uint i = 0; i < sizes.dimensions(); ++i) {
offsetSize[i] = sizes.offset()[i];
}
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_BUSY;
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(sizeof(AmdAqlWrap));
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
offsetSize[4] = vmDefQueue;
offsetSize[5] = *vmParentWrap;
memList.push_back(cb);
}
// Check if the kernel may have printf output
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
offsetSize[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
}
WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Find all parameters for the current kernel
for (uint i = 0; i != signature.numParameters(); ++i) {
const HSAILKernel::Argument* arg = argument(i);
const amd::KernelParameterDescriptor& desc = signature.at(i);
const_address paramaddr = parameters + desc.offset_;
switch (arg->type_) {
case HSAIL_ARGTYPE_POINTER:
// If it is a global pointer
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
Memory* gpuMem = NULL;
amd::Memory* mem = NULL;
if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
if (mem != NULL) {
gpuMem = dev().getGpuMemory(mem);
gpuMem->wait(gpu, WaitOnBusyEngine);
memList.push_back(gpuMem);
}
else {
return NULL;
}
break;
}
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
}
else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem != NULL) {
gpuMem = dev().getGpuMemory(mem);
}
}
if (gpuMem == NULL) {
WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
break;
}
//! @todo 64 bit isn't supported with 32 bit binary
uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, WaitOnBusyEngine);
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) &&
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
}
// If it is a local pointer
else {
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
"Unsupported address type");
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
}
break;
case HSAIL_ARGTYPE_VALUE:
// Special case for structrues
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
// Copy the current structre into CB1
memcpy(aqlStruct, paramaddr, arg->size_);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(arg->size_);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
memList.push_back(cb);
}
else {
WriteAqlArg(&aqlArgBuf, paramaddr,
arg->numElem_ * arg->size_, arg->size_);
}
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = NULL;
amd::Memory* mem = NULL;
if (nativeMem) {
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
}
else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem == NULL) {
LogError( "The kernel image argument isn't an image object!");
return false;
}
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
}
else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) &&
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(image);
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
const amd::Sampler* sampler =
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
const Sampler* gpuSampler = static_cast<Sampler*>
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
}
else {
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
break;
}
case HSAIL_ARGTYPE_QUEUE: {
const amd::DeviceQueue* queue =
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
default:
LogError(" Unsupported address type ");
return NULL;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return NULL;
}
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
uint argBufSize = amd::alignUp(
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
gpu.cb(0)->sysMemCopy() + argBufSize);
amd::NDRange local(sizes.local());
amd::NDRange global(sizes.global());
// Initialize the Global, Local and Offset values
aqlPkt->dimensions = sizes.dimensions();
// Initialize the work grid parameter
for (uint idx = 0; idx < 3; idx++) {
aqlPkt->grid_size[idx] = 1;
aqlPkt->workgroup_size[idx] = 1;
}
// Check if runtime has to find local workgroup size
findLocalWorkSize(aqlPkt->dimensions, global, local);
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
aqlPkt->grid_size[idx] = global[idx];
aqlPkt->workgroup_size[idx] = local[idx];
}
// Initialize if dispatch should enable profiling
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
// Initialize kernel ISA and execution buffer requirements
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
aqlPkt->private_segment_size_bytes = spillSegSize();
// Initialize cache flush configuration for the dispatch
//! @todo Currently not used in emulation
aqlPkt->barrier = 1;
aqlPkt->release_fence_scope = 1;
aqlPkt->acquire_fence_scope = 2;
aqlPkt->invalidate_instruction_cache = 1;
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
aqlPkt->kernel_arg_address = argList;
memList.push_back(cb);
memList.push_back(gpuAqlCode());
if (NULL != prog().globalStore()) {
memList.push_back(prog().globalStore());
}
if (cpuAqlCode_->enable_sgpr_queue_ptr) {
memList.push_back(gpu.hsaQueueMem());
}
if (srdResource) {
dev().srds().fillResourceList(memList);
}
return aqlPkt;
}
} // namespace gpu